author    Andrea Bastoni <bastoni@cs.unc.edu> 2010-10-23 01:01:49 -0400
committer Andrea Bastoni <bastoni@cs.unc.edu> 2010-10-23 01:01:49 -0400
commit    3dd41424090a0ca3a660218d06afe6ff4441bad3 (patch)
tree      511ef1bb1799027fc5aad574adce49120ecadd87 /fs
parent    5c5456402d467969b217d7fdd6670f8c8600f5a8 (diff)
parent    f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)

Merge commit 'v2.6.36' into wip-merge-2.6.36
Conflicts:
	Makefile
	arch/x86/include/asm/unistd_32.h
	arch/x86/kernel/syscall_table_32.S
	kernel/sched.c
	kernel/time/tick-sched.c

Relevant API and function changes (resolved in this commit):
- (API) .enqueue_task() (enqueue_task_litmus) and .dequeue_task() (dequeue_task_litmus) [litmus/sched_litmus.c]
- (API) .select_task_rq() (select_task_rq_litmus) [litmus/sched_litmus.c]
- (API) sysrq_dump_trace_buffer() and sysrq_handle_kill_rt_tasks() [litmus/sched_trace.c]
- struct kfifo internal buffer name changed (buffer -> buf) [litmus/sched_trace.c]
- add_wait_queue_exclusive_locked -> __add_wait_queue_tail_exclusive [litmus/fmlp.c]
- syscall numbers updated for both x86_32 and x86_64
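One of the conflict fixes listed above is purely mechanical: the locked exclusive wait-queue helper was renamed upstream between 2.6.33 and 2.6.36. Below is a minimal sketch of the before/after call; the wait queue and function here (demo_queue, enqueue_exclusive_waiter) are illustrative and not taken from the LITMUS sources.

#include <linux/wait.h>
#include <linux/spinlock.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_queue);

static void enqueue_exclusive_waiter(wait_queue_t *wait)
{
	unsigned long flags;

	/* Both spellings expect the queue lock to be held by the caller,
	 * so take it explicitly around the insertion. */
	spin_lock_irqsave(&demo_queue.lock, flags);
	/* pre-rename spelling, removed upstream:
	 *	add_wait_queue_exclusive_locked(&demo_queue, wait);
	 * spelling used after this merge: */
	__add_wait_queue_tail_exclusive(&demo_queue, wait);
	spin_unlock_irqrestore(&demo_queue.lock, flags);
}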
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Makefile4
-rw-r--r--fs/9p/fid.c114
-rw-r--r--fs/9p/v9fs.c3
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/v9fs_vfs.h5
-rw-r--r--fs/9p/vfs_dir.c148
-rw-r--r--fs/9p/vfs_file.c43
-rw-r--r--fs/9p/vfs_inode.c877
-rw-r--r--fs/9p/vfs_super.c119
-rw-r--r--fs/9p/xattr.c160
-rw-r--r--fs/9p/xattr.h27
-rw-r--r--fs/9p/xattr_user.c80
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/adfs/inode.c17
-rw-r--r--fs/affs/affs.h5
-rw-r--r--fs/affs/file.c15
-rw-r--r--fs/affs/inode.c38
-rw-r--r--fs/affs/namei.c2
-rw-r--r--fs/affs/super.c32
-rw-r--r--fs/afs/Kconfig1
-rw-r--r--fs/afs/cell.c96
-rw-r--r--fs/afs/dir.c53
-rw-r--r--fs/afs/file.c64
-rw-r--r--fs/afs/inode.c91
-rw-r--r--fs/afs/internal.h28
-rw-r--r--fs/afs/main.c9
-rw-r--r--fs/afs/mntpt.c84
-rw-r--r--fs/afs/proc.c2
-rw-r--r--fs/afs/rxrpc.c1
-rw-r--r--fs/afs/server.c5
-rw-r--r--fs/afs/super.c22
-rw-r--r--fs/afs/write.c4
-rw-r--r--fs/aio.c105
-rw-r--r--fs/attr.c84
-rw-r--r--fs/autofs/root.c68
-rw-r--r--fs/autofs4/dev-ioctl.c18
-rw-r--r--fs/autofs4/root.c74
-rw-r--r--fs/bad_inode.c10
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/bfs.h1
-rw-r--r--fs/bfs/dir.c6
-rw-r--r--fs/bfs/file.c17
-rw-r--r--fs/bfs/inode.c116
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_elf_fdpic.c26
-rw-r--r--fs/binfmt_flat.c27
-rw-r--r--fs/binfmt_misc.c9
-rw-r--r--fs/binfmt_script.c3
-rw-r--r--fs/bio-integrity.c4
-rw-r--r--fs/bio.c5
-rw-r--r--fs/block_dev.c407
-rw-r--r--fs/btrfs/acl.c12
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c238
-rw-r--r--fs/btrfs/ctree.h169
-rw-r--r--fs/btrfs/delayed-ref.c101
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c188
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c2258
-rw-r--r--fs/btrfs/extent_io.c87
-rw-r--r--fs/btrfs/extent_io.h14
-rw-r--r--fs/btrfs/file-item.c28
-rw-r--r--fs/btrfs/file.c181
-rw-r--r--fs/btrfs/inode-item.c27
-rw-r--r--fs/btrfs/inode.c1762
-rw-r--r--fs/btrfs/ioctl.c222
-rw-r--r--fs/btrfs/ordered-data.c82
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/relocation.c1974
-rw-r--r--fs/btrfs/root-tree.c26
-rw-r--r--fs/btrfs/super.c43
-rw-r--r--fs/btrfs/transaction.c232
-rw-r--r--fs/btrfs/transaction.h24
-rw-r--r--fs/btrfs/tree-defrag.c7
-rw-r--r--fs/btrfs/tree-log.c241
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c35
-rw-r--r--fs/btrfs/xattr.c14
-rw-r--r--fs/btrfs/xattr.h6
-rw-r--r--fs/buffer.c202
-rw-r--r--fs/cachefiles/bind.c2
-rw-r--r--fs/cachefiles/daemon.c38
-rw-r--r--fs/cachefiles/internal.h13
-rw-r--r--fs/cachefiles/namei.c13
-rw-r--r--fs/cachefiles/rdwr.c4
-rw-r--r--fs/ceph/Kconfig3
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/addr.c44
-rw-r--r--fs/ceph/armor.c6
-rw-r--r--fs/ceph/auth.c14
-rw-r--r--fs/ceph/auth.h8
-rw-r--r--fs/ceph/auth_none.c9
-rw-r--r--fs/ceph/auth_x.c57
-rw-r--r--fs/ceph/buffer.c16
-rw-r--r--fs/ceph/caps.c528
-rw-r--r--fs/ceph/ceph_frag.h4
-rw-r--r--fs/ceph/ceph_fs.c50
-rw-r--r--fs/ceph/ceph_fs.h144
-rw-r--r--fs/ceph/ceph_hash.h4
-rw-r--r--fs/ceph/ceph_strings.c19
-rw-r--r--fs/ceph/crush/crush.h4
-rw-r--r--fs/ceph/crush/hash.h4
-rw-r--r--fs/ceph/crush/mapper.c41
-rw-r--r--fs/ceph/crush/mapper.h4
-rw-r--r--fs/ceph/crypto.c27
-rw-r--r--fs/ceph/crypto.h4
-rw-r--r--fs/ceph/debugfs.c40
-rw-r--r--fs/ceph/decode.h6
-rw-r--r--fs/ceph/dir.c83
-rw-r--r--fs/ceph/export.c37
-rw-r--r--fs/ceph/file.c55
-rw-r--r--fs/ceph/inode.c143
-rw-r--r--fs/ceph/ioctl.c26
-rw-r--r--fs/ceph/ioctl.h2
-rw-r--r--fs/ceph/locks.c260
-rw-r--r--fs/ceph/mds_client.c807
-rw-r--r--fs/ceph/mds_client.h42
-rw-r--r--fs/ceph/mdsmap.c6
-rw-r--r--fs/ceph/mdsmap.h8
-rw-r--r--fs/ceph/messenger.c181
-rw-r--r--fs/ceph/messenger.h11
-rw-r--r--fs/ceph/mon_client.c409
-rw-r--r--fs/ceph/mon_client.h32
-rw-r--r--fs/ceph/msgpool.c180
-rw-r--r--fs/ceph/msgpool.h12
-rw-r--r--fs/ceph/msgr.h25
-rw-r--r--fs/ceph/osd_client.c125
-rw-r--r--fs/ceph/osdmap.c65
-rw-r--r--fs/ceph/pagelist.c14
-rw-r--r--fs/ceph/rados.h36
-rw-r--r--fs/ceph/snap.c179
-rw-r--r--fs/ceph/super.c228
-rw-r--r--fs/ceph/super.h89
-rw-r--r--fs/ceph/xattr.c38
-rw-r--r--fs/char_dev.c5
-rw-r--r--fs/cifs/Kconfig27
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/README15
-rw-r--r--fs/cifs/asn1.c103
-rw-r--r--fs/cifs/cache.c331
-rw-r--r--fs/cifs/cifs_debug.c73
-rw-r--r--fs/cifs/cifs_debug.h42
-rw-r--r--fs/cifs/cifs_dfs_ref.c67
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_spnego.c13
-rw-r--r--fs/cifs/cifs_unicode.c5
-rw-r--r--fs/cifs/cifs_unicode.h18
-rw-r--r--fs/cifs/cifs_uniupr.h16
-rw-r--r--fs/cifs/cifsacl.c76
-rw-r--r--fs/cifs/cifsencrypt.c67
-rw-r--r--fs/cifs/cifsfs.c213
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h57
-rw-r--r--fs/cifs/cifsproto.h36
-rw-r--r--fs/cifs/cifssmb.c486
-rw-r--r--fs/cifs/connect.c874
-rw-r--r--fs/cifs/dir.c274
-rw-r--r--fs/cifs/dns_resolve.c170
-rw-r--r--fs/cifs/dns_resolve.h2
-rw-r--r--fs/cifs/export.c2
-rw-r--r--fs/cifs/file.c416
-rw-r--r--fs/cifs/fscache.c236
-rw-r--r--fs/cifs/fscache.h136
-rw-r--r--fs/cifs/inode.c312
-rw-r--r--fs/cifs/ioctl.c13
-rw-r--r--fs/cifs/link.c10
-rw-r--r--fs/cifs/misc.c97
-rw-r--r--fs/cifs/netmisc.c81
-rw-r--r--fs/cifs/readdir.c90
-rw-r--r--fs/cifs/sess.c91
-rw-r--r--fs/cifs/smberr.h1
-rw-r--r--fs/cifs/transport.c92
-rw-r--r--fs/cifs/xattr.c40
-rw-r--r--fs/coda/coda_int.h3
-rw-r--r--fs/coda/file.c6
-rw-r--r--fs/coda/inode.c8
-rw-r--r--fs/coda/pioctl.c76
-rw-r--r--fs/coda/psdev.c21
-rw-r--r--fs/coda/upcall.c12
-rw-r--r--fs/compat.c182
-rw-r--r--fs/compat_ioctl.c51
-rw-r--r--fs/configfs/inode.c14
-rw-r--r--fs/cramfs/inode.c88
-rw-r--r--fs/dcache.c310
-rw-r--r--fs/debugfs/file.c21
-rw-r--r--fs/devpts/inode.c9
-rw-r--r--fs/direct-io.c105
-rw-r--r--fs/dlm/lock.c5
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/dlm/netlink.c15
-rw-r--r--fs/dlm/user.c88
-rw-r--r--fs/drop_caches.c26
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/file.c66
-rw-r--r--fs/ecryptfs/inode.c175
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/kthread.c2
-rw-r--r--fs/ecryptfs/main.c166
-rw-r--r--fs/ecryptfs/messaging.c21
-rw-r--r--fs/ecryptfs/miscdev.c2
-rw-r--r--fs/ecryptfs/mmap.c19
-rw-r--r--fs/ecryptfs/read_write.c13
-rw-r--r--fs/ecryptfs/super.c36
-rw-r--r--fs/eventpoll.c3
-rw-r--r--fs/exec.c293
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exofs/exofs.h3
-rw-r--r--fs/exofs/file.c33
-rw-r--r--fs/exofs/inode.c177
-rw-r--r--fs/exofs/ios.c46
-rw-r--r--fs/exofs/super.c3
-rw-r--r--fs/ext2/acl.c5
-rw-r--r--fs/ext2/balloc.c17
-rw-r--r--fs/ext2/dir.c23
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/file.c7
-rw-r--r--fs/ext2/ialloc.c34
-rw-r--r--fs/ext2/inode.c209
-rw-r--r--fs/ext2/super.c131
-rw-r--r--fs/ext2/xattr.c37
-rw-r--r--fs/ext2/xattr.h12
-rw-r--r--fs/ext2/xattr_security.c2
-rw-r--r--fs/ext2/xattr_trusted.c2
-rw-r--r--fs/ext2/xattr_user.c2
-rw-r--r--fs/ext3/Kconfig1
-rw-r--r--fs/ext3/acl.c5
-rw-r--r--fs/ext3/balloc.c6
-rw-r--r--fs/ext3/dir.c2
-rw-r--r--fs/ext3/fsync.c27
-rw-r--r--fs/ext3/ialloc.c25
-rw-r--r--fs/ext3/inode.c145
-rw-r--r--fs/ext3/namei.c3
-rw-r--r--fs/ext3/resize.c2
-rw-r--r--fs/ext3/super.c144
-rw-r--r--fs/ext3/xattr.c22
-rw-r--r--fs/ext3/xattr.h12
-rw-r--r--fs/ext3/xattr_security.c2
-rw-r--r--fs/ext3/xattr_trusted.c2
-rw-r--r--fs/ext3/xattr_user.c2
-rw-r--r--fs/ext4/acl.c5
-rw-r--r--fs/ext4/balloc.c11
-rw-r--r--fs/ext4/block_validity.c12
-rw-r--r--fs/ext4/dir.c41
-rw-r--r--fs/ext4/ext4.h320
-rw-r--r--fs/ext4/ext4_jbd2.c71
-rw-r--r--fs/ext4/ext4_jbd2.h64
-rw-r--r--fs/ext4/extents.c435
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/fsync.c47
-rw-r--r--fs/ext4/ialloc.c105
-rw-r--r--fs/ext4/inode.c977
-rw-r--r--fs/ext4/ioctl.c27
-rw-r--r--fs/ext4/mballoc.c261
-rw-r--r--fs/ext4/migrate.c4
-rw-r--r--fs/ext4/move_extent.c26
-rw-r--r--fs/ext4/namei.c97
-rw-r--r--fs/ext4/resize.c11
-rw-r--r--fs/ext4/super.c451
-rw-r--r--fs/ext4/symlink.c2
-rw-r--r--fs/ext4/xattr.c64
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/ext4/xattr_security.c2
-rw-r--r--fs/ext4/xattr_trusted.c2
-rw-r--r--fs/ext4/xattr_user.c2
-rw-r--r--fs/fat/cache.c13
-rw-r--r--fs/fat/dir.c28
-rw-r--r--fs/fat/fat.h21
-rw-r--r--fs/fat/file.c72
-rw-r--r--fs/fat/inode.c61
-rw-r--r--fs/fat/misc.c26
-rw-r--r--fs/fcntl.c99
-rw-r--r--fs/file.c60
-rw-r--r--fs/file_table.c164
-rw-r--r--fs/freevxfs/vxfs_extern.h2
-rw-r--r--fs/freevxfs/vxfs_inode.c8
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/freevxfs/vxfs_super.c4
-rw-r--r--fs/fs-writeback.c722
-rw-r--r--fs/fs_struct.c39
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fscache/internal.h22
-rw-r--r--fs/fscache/main.c106
-rw-r--r--fs/fscache/object-list.c13
-rw-r--r--fs/fscache/object.c106
-rw-r--r--fs/fscache/operation.c67
-rw-r--r--fs/fscache/page.c72
-rw-r--r--fs/fuse/dev.c791
-rw-r--r--fs/fuse/dir.c24
-rw-r--r--fs/fuse/file.c58
-rw-r--r--fs/fuse/fuse_i.h9
-rw-r--r--fs/fuse/inode.c6
-rw-r--r--fs/generic_acl.c5
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/acl.c6
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/aops.c31
-rw-r--r--fs/gfs2/bmap.c31
-rw-r--r--fs/gfs2/bmap.h2
-rw-r--r--fs/gfs2/dir.c46
-rw-r--r--fs/gfs2/export.c2
-rw-r--r--fs/gfs2/file.c15
-rw-r--r--fs/gfs2/glock.c100
-rw-r--r--fs/gfs2/incore.h15
-rw-r--r--fs/gfs2/inode.c136
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/log.c164
-rw-r--r--fs/gfs2/log.h30
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/main.c16
-rw-r--r--fs/gfs2/meta_io.c13
-rw-r--r--fs/gfs2/ops_fstype.c56
-rw-r--r--fs/gfs2/ops_inode.c19
-rw-r--r--fs/gfs2/quota.c135
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/recovery.c54
-rw-r--r--fs/gfs2/recovery.h6
-rw-r--r--fs/gfs2/rgrp.c81
-rw-r--r--fs/gfs2/super.c63
-rw-r--r--fs/gfs2/super.h2
-rw-r--r--fs/gfs2/sys.c66
-rw-r--r--fs/gfs2/trans.c18
-rw-r--r--fs/gfs2/xattr.c30
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c70
-rw-r--r--fs/hfs/super.c2
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h4
-rw-r--r--fs/hfsplus/inode.c79
-rw-r--r--fs/hfsplus/ioctl.c12
-rw-r--r--fs/hfsplus/super.c10
-rw-r--r--fs/hostfs/hostfs.h22
-rw-r--r--fs/hostfs/hostfs_kern.c521
-rw-r--r--fs/hostfs/hostfs_user.c112
-rw-r--r--fs/hpfs/file.c15
-rw-r--r--fs/hpfs/hpfs_fn.h4
-rw-r--r--fs/hpfs/inode.c24
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hppfs/hppfs.c10
-rw-r--r--fs/hugetlbfs/inode.c43
-rw-r--r--fs/inode.c215
-rw-r--r--fs/internal.h9
-rw-r--r--fs/ioctl.c33
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/isofs/inode.c7
-rw-r--r--fs/jbd/checkpoint.c4
-rw-r--r--fs/jbd/commit.c57
-rw-r--r--fs/jbd/journal.c42
-rw-r--r--fs/jbd/recovery.c11
-rw-r--r--fs/jbd/revoke.c2
-rw-r--r--fs/jbd2/checkpoint.c25
-rw-r--r--fs/jbd2/commit.c89
-rw-r--r--fs/jbd2/journal.c140
-rw-r--r--fs/jbd2/recovery.c10
-rw-r--r--fs/jbd2/revoke.c2
-rw-r--r--fs/jbd2/transaction.c265
-rw-r--r--fs/jffs2/acl.c7
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/background.c4
-rw-r--r--fs/jffs2/build.c1
-rw-r--r--fs/jffs2/compr.c5
-rw-r--r--fs/jffs2/compr.h1
-rw-r--r--fs/jffs2/compr_lzo.c1
-rw-r--r--fs/jffs2/compr_rtime.c1
-rw-r--r--fs/jffs2/compr_rubin.c1
-rw-r--r--fs/jffs2/compr_zlib.c1
-rw-r--r--fs/jffs2/debug.c1
-rw-r--r--fs/jffs2/debug.h1
-rw-r--r--fs/jffs2/dir.c124
-rw-r--r--fs/jffs2/erase.c13
-rw-r--r--fs/jffs2/file.c5
-rw-r--r--fs/jffs2/fs.c28
-rw-r--r--fs/jffs2/gc.c18
-rw-r--r--fs/jffs2/ioctl.c1
-rw-r--r--fs/jffs2/jffs2_fs_i.h1
-rw-r--r--fs/jffs2/jffs2_fs_sb.h1
-rw-r--r--fs/jffs2/nodelist.h11
-rw-r--r--fs/jffs2/nodemgmt.c28
-rw-r--r--fs/jffs2/os-linux.h7
-rw-r--r--fs/jffs2/scan.c4
-rw-r--r--fs/jffs2/security.c2
-rw-r--r--fs/jffs2/super.c4
-rw-r--r--fs/jffs2/wbuf.c8
-rw-r--r--fs/jffs2/xattr.c12
-rw-r--r--fs/jffs2/xattr.h8
-rw-r--r--fs/jffs2/xattr_trusted.c2
-rw-r--r--fs/jffs2/xattr_user.c2
-rw-r--r--fs/jfs/file.c20
-rw-r--r--fs/jfs/inode.c63
-rw-r--r--fs/jfs/jfs_dmap.c2
-rw-r--r--fs/jfs/jfs_inode.c12
-rw-r--r--fs/jfs/jfs_inode.h4
-rw-r--r--fs/jfs/super.c24
-rw-r--r--fs/jfs/xattr.c87
-rw-r--r--fs/libfs.c104
-rw-r--r--fs/logfs/dev_bdev.c6
-rw-r--r--fs/logfs/dev_mtd.c26
-rw-r--r--fs/logfs/dir.c9
-rw-r--r--fs/logfs/file.c44
-rw-r--r--fs/logfs/gc.c49
-rw-r--r--fs/logfs/inode.c66
-rw-r--r--fs/logfs/journal.c9
-rw-r--r--fs/logfs/logfs.h24
-rw-r--r--fs/logfs/logfs_abi.h10
-rw-r--r--fs/logfs/readwrite.c81
-rw-r--r--fs/logfs/segment.c8
-rw-r--r--fs/logfs/super.c31
-rw-r--r--fs/mbcache.c201
-rw-r--r--fs/minix/bitmap.c11
-rw-r--r--fs/minix/dir.c32
-rw-r--r--fs/minix/file.c24
-rw-r--r--fs/minix/inode.c35
-rw-r--r--fs/minix/itree_v2.c27
-rw-r--r--fs/minix/minix.h6
-rw-r--r--fs/minix/namei.c11
-rw-r--r--fs/namei.c149
-rw-r--r--fs/namespace.c226
-rw-r--r--fs/ncpfs/dir.c3
-rw-r--r--fs/ncpfs/file.c4
-rw-r--r--fs/ncpfs/inode.c40
-rw-r--r--fs/ncpfs/ioctl.c26
-rw-r--r--fs/nfs/Kconfig29
-rw-r--r--fs/nfs/callback.c11
-rw-r--r--fs/nfs/callback_proc.c19
-rw-r--r--fs/nfs/client.c160
-rw-r--r--fs/nfs/delegation.c18
-rw-r--r--fs/nfs/delegation.h4
-rw-r--r--fs/nfs/dir.c170
-rw-r--r--fs/nfs/direct.c29
-rw-r--r--fs/nfs/dns_resolve.c24
-rw-r--r--fs/nfs/dns_resolve.h12
-rw-r--r--fs/nfs/file.c88
-rw-r--r--fs/nfs/fscache.c3
-rw-r--r--fs/nfs/getroot.c191
-rw-r--r--fs/nfs/inode.c145
-rw-r--r--fs/nfs/internal.h18
-rw-r--r--fs/nfs/iostat.h6
-rw-r--r--fs/nfs/namespace.c20
-rw-r--r--fs/nfs/nfs2xdr.c7
-rw-r--r--fs/nfs/nfs3acl.c23
-rw-r--r--fs/nfs/nfs3proc.c128
-rw-r--r--fs/nfs/nfs3xdr.c10
-rw-r--r--fs/nfs/nfs4_fs.h65
-rw-r--r--fs/nfs/nfs4namespace.c12
-rw-r--r--fs/nfs/nfs4proc.c653
-rw-r--r--fs/nfs/nfs4renewd.c4
-rw-r--r--fs/nfs/nfs4state.c118
-rw-r--r--fs/nfs/nfs4xdr.c133
-rw-r--r--fs/nfs/nfsroot.c16
-rw-r--r--fs/nfs/pagelist.c22
-rw-r--r--fs/nfs/proc.c144
-rw-r--r--fs/nfs/read.c7
-rw-r--r--fs/nfs/super.c196
-rw-r--r--fs/nfs/unlink.c6
-rw-r--r--fs/nfs/write.c59
-rw-r--r--fs/nfsd/Kconfig2
-rw-r--r--fs/nfsd/export.c44
-rw-r--r--fs/nfsd/nfs3proc.c8
-rw-r--r--fs/nfsd/nfs4callback.c181
-rw-r--r--fs/nfsd/nfs4proc.c50
-rw-r--r--fs/nfsd/nfs4recover.c87
-rw-r--r--fs/nfsd/nfs4state.c763
-rw-r--r--fs/nfsd/nfs4xdr.c36
-rw-r--r--fs/nfsd/nfsctl.c92
-rw-r--r--fs/nfsd/nfsd.h7
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/nfsd/nfsproc.c4
-rw-r--r--fs/nfsd/nfssvc.c153
-rw-r--r--fs/nfsd/state.h87
-rw-r--r--fs/nfsd/vfs.c113
-rw-r--r--fs/nfsd/vfs.h5
-rw-r--r--fs/nfsd/xdr4.h11
-rw-r--r--fs/nilfs2/alloc.c154
-rw-r--r--fs/nilfs2/alloc.h7
-rw-r--r--fs/nilfs2/bmap.c6
-rw-r--r--fs/nilfs2/bmap.h16
-rw-r--r--fs/nilfs2/bmap_union.h42
-rw-r--r--fs/nilfs2/btnode.c23
-rw-r--r--fs/nilfs2/btnode.h4
-rw-r--r--fs/nilfs2/btree.c1005
-rw-r--r--fs/nilfs2/btree.h29
-rw-r--r--fs/nilfs2/dir.c58
-rw-r--r--fs/nilfs2/direct.c96
-rw-r--r--fs/nilfs2/direct.h11
-rw-r--r--fs/nilfs2/file.c4
-rw-r--r--fs/nilfs2/gcdat.c2
-rw-r--r--fs/nilfs2/gcinode.c17
-rw-r--r--fs/nilfs2/inode.c93
-rw-r--r--fs/nilfs2/mdt.c1
-rw-r--r--fs/nilfs2/nilfs.h26
-rw-r--r--fs/nilfs2/page.c5
-rw-r--r--fs/nilfs2/page.h2
-rw-r--r--fs/nilfs2/recovery.c357
-rw-r--r--fs/nilfs2/segbuf.c72
-rw-r--r--fs/nilfs2/segbuf.h36
-rw-r--r--fs/nilfs2/segment.c176
-rw-r--r--fs/nilfs2/segment.h18
-rw-r--r--fs/nilfs2/super.c551
-rw-r--r--fs/nilfs2/the_nilfs.c182
-rw-r--r--fs/nilfs2/the_nilfs.h23
-rw-r--r--fs/notify/Kconfig1
-rw-r--r--fs/notify/Makefile4
-rw-r--r--fs/notify/dnotify/dnotify.c213
-rw-r--r--fs/notify/fanotify/Kconfig26
-rw-r--r--fs/notify/fanotify/Makefile1
-rw-r--r--fs/notify/fanotify/fanotify.c209
-rw-r--r--fs/notify/fanotify/fanotify_user.c787
-rw-r--r--fs/notify/fsnotify.c205
-rw-r--r--fs/notify/fsnotify.h27
-rw-r--r--fs/notify/group.c182
-rw-r--r--fs/notify/inode_mark.c337
-rw-r--r--fs/notify/inotify/Kconfig15
-rw-r--r--fs/notify/inotify/Makefile1
-rw-r--r--fs/notify/inotify/inotify.c933
-rw-r--r--fs/notify/inotify/inotify.h7
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c151
-rw-r--r--fs/notify/inotify/inotify_user.c369
-rw-r--r--fs/notify/mark.c371
-rw-r--r--fs/notify/notification.c209
-rw-r--r--fs/notify/vfsmount_mark.c187
-rw-r--r--fs/ntfs/dir.c5
-rw-r--r--fs/ntfs/file.c37
-rw-r--r--fs/ntfs/inode.c10
-rw-r--r--fs/ntfs/inode.h2
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/acl.c40
-rw-r--r--fs/ocfs2/alloc.c910
-rw-r--r--fs/ocfs2/alloc.h12
-rw-r--r--fs/ocfs2/aops.c113
-rw-r--r--fs/ocfs2/blockcheck.c8
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/cluster/tcp.c22
-rw-r--r--fs/ocfs2/dir.c99
-rw-r--r--fs/ocfs2/dlm/dlmast.c8
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h5
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c15
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c32
-rw-r--r--fs/ocfs2/dlm/dlmlock.c6
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c101
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c51
-rw-r--r--fs/ocfs2/dlm/dlmthread.c130
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c3
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c18
-rw-r--r--fs/ocfs2/dlmglue.c7
-rw-r--r--fs/ocfs2/dlmglue.h1
-rw-r--r--fs/ocfs2/file.c591
-rw-r--r--fs/ocfs2/file.h6
-rw-r--r--fs/ocfs2/inode.c80
-rw-r--r--fs/ocfs2/inode.h7
-rw-r--r--fs/ocfs2/journal.c60
-rw-r--r--fs/ocfs2/journal.h15
-rw-r--r--fs/ocfs2/localalloc.c282
-rw-r--r--fs/ocfs2/localalloc.h3
-rw-r--r--fs/ocfs2/mmap.c56
-rw-r--r--fs/ocfs2/namei.c394
-rw-r--r--fs/ocfs2/ocfs2.h22
-rw-r--r--fs/ocfs2/ocfs2_fs.h181
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h8
-rw-r--r--fs/ocfs2/quota.h12
-rw-r--r--fs/ocfs2/quota_global.c351
-rw-r--r--fs/ocfs2/quota_local.c187
-rw-r--r--fs/ocfs2/refcounttree.c116
-rw-r--r--fs/ocfs2/refcounttree.h4
-rw-r--r--fs/ocfs2/reservations.c844
-rw-r--r--fs/ocfs2/reservations.h159
-rw-r--r--fs/ocfs2/resize.c19
-rw-r--r--fs/ocfs2/suballoc.c887
-rw-r--r--fs/ocfs2/suballoc.h42
-rw-r--r--fs/ocfs2/super.c147
-rw-r--r--fs/ocfs2/super.h7
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/xattr.c315
-rw-r--r--fs/ocfs2/xattr.h12
-rw-r--r--fs/omfs/dir.c22
-rw-r--r--fs/omfs/file.c48
-rw-r--r--fs/omfs/inode.c58
-rw-r--r--fs/omfs/omfs.h1
-rw-r--r--fs/omfs/omfs_fs.h1
-rw-r--r--fs/open.c188
-rw-r--r--fs/partitions/acorn.c103
-rw-r--r--fs/partitions/acorn.h10
-rw-r--r--fs/partitions/amiga.c33
-rw-r--r--fs/partitions/amiga.h2
-rw-r--r--fs/partitions/atari.c20
-rw-r--r--fs/partitions/atari.h2
-rw-r--r--fs/partitions/check.c107
-rw-r--r--fs/partitions/check.h18
-rw-r--r--fs/partitions/efi.c95
-rw-r--r--fs/partitions/efi.h2
-rw-r--r--fs/partitions/ibm.c48
-rw-r--r--fs/partitions/ibm.h2
-rw-r--r--fs/partitions/karma.c6
-rw-r--r--fs/partitions/karma.h2
-rw-r--r--fs/partitions/ldm.c111
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/partitions/mac.c17
-rw-r--r--fs/partitions/mac.h2
-rw-r--r--fs/partitions/msdos.c154
-rw-r--r--fs/partitions/msdos.h2
-rw-r--r--fs/partitions/osf.c6
-rw-r--r--fs/partitions/osf.h2
-rw-r--r--fs/partitions/sgi.c8
-rw-r--r--fs/partitions/sgi.h2
-rw-r--r--fs/partitions/sun.c8
-rw-r--r--fs/partitions/sun.h2
-rw-r--r--fs/partitions/sysv68.c15
-rw-r--r--fs/partitions/sysv68.h2
-rw-r--r--fs/partitions/ultrix.c6
-rw-r--r--fs/partitions/ultrix.h2
-rw-r--r--fs/pipe.c170
-rw-r--r--fs/pnode.c11
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/array.c6
-rw-r--r--fs/proc/base.c174
-rw-r--r--fs/proc/generic.c33
-rw-r--r--fs/proc/inode.c23
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/kmsg.c1
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/proc_devtree.c3
-rw-r--r--fs/proc/proc_sysctl.c15
-rw-r--r--fs/proc/root.c1
-rw-r--r--fs/proc/task_mmu.c17
-rw-r--r--fs/proc/task_nommu.c20
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/qnx4/dir.c3
-rw-r--r--fs/qnx4/inode.c11
-rw-r--r--fs/quota/dquot.c487
-rw-r--r--fs/quota/quota.c99
-rw-r--r--fs/quota/quota_tree.c95
-rw-r--r--fs/quota/quota_v1.c7
-rw-r--r--fs/quota/quota_v2.c11
-rw-r--r--fs/ramfs/file-mmu.c3
-rw-r--r--fs/ramfs/file-nommu.c12
-rw-r--r--fs/ramfs/inode.c22
-rw-r--r--fs/read_write.c25
-rw-r--r--fs/readdir.c8
-rw-r--r--fs/reiserfs/dir.c9
-rw-r--r--fs/reiserfs/file.c58
-rw-r--r--fs/reiserfs/inode.c140
-rw-r--r--fs/reiserfs/ioctl.c7
-rw-r--r--fs/reiserfs/journal.c3
-rw-r--r--fs/reiserfs/namei.c18
-rw-r--r--fs/reiserfs/super.c58
-rw-r--r--fs/reiserfs/xattr.c16
-rw-r--r--fs/reiserfs/xattr_acl.c4
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/reiserfs/xattr_trusted.c2
-rw-r--r--fs/reiserfs/xattr_user.c2
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/smbfs/dir.c3
-rw-r--r--fs/smbfs/file.c5
-rw-r--r--fs/smbfs/inode.c12
-rw-r--r--fs/smbfs/ioctl.c10
-rw-r--r--fs/smbfs/proto.h2
-rw-r--r--fs/smbfs/symlink.c1
-rw-r--r--fs/splice.c176
-rw-r--r--fs/squashfs/Kconfig34
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/decompressor.c6
-rw-r--r--fs/squashfs/inode.c92
-rw-r--r--fs/squashfs/lzo_wrapper.c136
-rw-r--r--fs/squashfs/namei.c6
-rw-r--r--fs/squashfs/squashfs.h15
-rw-r--r--fs/squashfs/squashfs_fs.h92
-rw-r--r--fs/squashfs/squashfs_fs_i.h3
-rw-r--r--fs/squashfs/squashfs_fs_sb.h3
-rw-r--r--fs/squashfs/super.c30
-rw-r--r--fs/squashfs/symlink.c11
-rw-r--r--fs/squashfs/xattr.c323
-rw-r--r--fs/squashfs/xattr.h46
-rw-r--r--fs/squashfs/xattr_id.c100
-rw-r--r--fs/stat.c29
-rw-r--r--fs/statfs.c243
-rw-r--r--fs/super.c375
-rw-r--r--fs/sync.c113
-rw-r--r--fs/sysfs/bin.c26
-rw-r--r--fs/sysfs/dir.c114
-rw-r--r--fs/sysfs/file.c22
-rw-r--r--fs/sysfs/group.c6
-rw-r--r--fs/sysfs/inode.c20
-rw-r--r--fs/sysfs/mount.c97
-rw-r--r--fs/sysfs/symlink.c58
-rw-r--r--fs/sysfs/sysfs.h36
-rw-r--r--fs/sysv/dir.c23
-rw-r--r--fs/sysv/file.c24
-rw-r--r--fs/sysv/ialloc.c18
-rw-r--r--fs/sysv/inode.c20
-rw-r--r--fs/sysv/itree.c19
-rw-r--r--fs/sysv/super.c74
-rw-r--r--fs/sysv/sysv.h4
-rw-r--r--fs/timerfd.c25
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/dir.c9
-rw-r--r--fs/ubifs/file.c26
-rw-r--r--fs/ubifs/io.c1
-rw-r--r--fs/ubifs/lpt.c14
-rw-r--r--fs/ubifs/lpt_commit.c2
-rw-r--r--fs/ubifs/recovery.c23
-rw-r--r--fs/ubifs/shrinker.c2
-rw-r--r--fs/ubifs/super.c16
-rw-r--r--fs/ubifs/ubifs.h6
-rw-r--r--fs/udf/balloc.c43
-rw-r--r--fs/udf/dir.c5
-rw-r--r--fs/udf/file.c66
-rw-r--r--fs/udf/ialloc.c34
-rw-r--r--fs/udf/inode.c64
-rw-r--r--fs/udf/namei.c30
-rw-r--r--fs/udf/super.c18
-rw-r--r--fs/udf/udfdecl.h7
-rw-r--r--fs/ufs/balloc.c48
-rw-r--r--fs/ufs/dir.c15
-rw-r--r--fs/ufs/file.c5
-rw-r--r--fs/ufs/ialloc.c43
-rw-r--r--fs/ufs/inode.c67
-rw-r--r--fs/ufs/namei.c18
-rw-r--r--fs/ufs/super.c114
-rw-r--r--fs/ufs/symlink.c8
-rw-r--r--fs/ufs/truncate.c48
-rw-r--r--fs/ufs/ufs.h4
-rw-r--r--fs/ufs/ufs_fs.h1
-rw-r--r--fs/ufs/util.c20
-rw-r--r--fs/ufs/util.h7
-rw-r--r--fs/utimes.c7
-rw-r--r--fs/xattr.c14
-rw-r--r--fs/xfs/Makefile5
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c856
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c114
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h122
-rw-r--r--fs/xfs/linux-2.6/xfs_dmapi_priv.h28
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c116
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.h25
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c47
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c25
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c53
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c21
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c213
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h9
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c322
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h5
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h700
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c8
-rw-r--r--fs/xfs/quota/xfs_dquot.c313
-rw-r--r--fs/xfs/quota/xfs_dquot.h35
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c317
-rw-r--r--fs/xfs/quota/xfs_qm.c653
-rw-r--r--fs/xfs/quota/xfs_qm.h23
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c10
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c12
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c306
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h102
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c60
-rw-r--r--fs/xfs/support/debug.c1
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_ag.h25
-rw-r--r--fs/xfs/xfs_alloc.c372
-rw-r--r--fs/xfs/xfs_alloc.h27
-rw-r--r--fs/xfs/xfs_alloc_btree.c7
-rw-r--r--fs/xfs/xfs_attr.c91
-rw-r--r--fs/xfs/xfs_attr_leaf.c5
-rw-r--r--fs/xfs/xfs_bmap.c343
-rw-r--r--fs/xfs/xfs_bmap.h37
-rw-r--r--fs/xfs/xfs_bmap_btree.c5
-rw-r--r--fs/xfs/xfs_btree.c5
-rw-r--r--fs/xfs/xfs_buf_item.c393
-rw-r--r--fs/xfs/xfs_buf_item.h22
-rw-r--r--fs/xfs/xfs_da_btree.c20
-rw-r--r--fs/xfs/xfs_dfrag.c21
-rw-r--r--fs/xfs/xfs_dir2.c11
-rw-r--r--fs/xfs/xfs_dir2_block.c8
-rw-r--r--fs/xfs/xfs_dir2_data.c2
-rw-r--r--fs/xfs/xfs_dir2_leaf.c4
-rw-r--r--fs/xfs/xfs_dir2_node.c2
-rw-r--r--fs/xfs/xfs_dir2_sf.c2
-rw-r--r--fs/xfs/xfs_dmapi.h170
-rw-r--r--fs/xfs/xfs_dmops.c55
-rw-r--r--fs/xfs/xfs_error.c36
-rw-r--r--fs/xfs/xfs_error.h9
-rw-r--r--fs/xfs/xfs_extfree_item.c288
-rw-r--r--fs/xfs/xfs_filestream.c84
-rw-r--r--fs/xfs/xfs_filestream.h82
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c36
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_ialloc.c150
-rw-r--r--fs/xfs/xfs_ialloc_btree.c4
-rw-r--r--fs/xfs/xfs_iget.c147
-rw-r--r--fs/xfs/xfs_inode.c222
-rw-r--r--fs/xfs/xfs_inode.h10
-rw-r--r--fs/xfs/xfs_inode_item.c286
-rw-r--r--fs/xfs/xfs_inode_item.h12
-rw-r--r--fs/xfs/xfs_iomap.c195
-rw-r--r--fs/xfs/xfs_iomap.h67
-rw-r--r--fs/xfs/xfs_itable.c293
-rw-r--r--fs/xfs/xfs_itable.h17
-rw-r--r--fs/xfs/xfs_log.c811
-rw-r--r--fs/xfs/xfs_log.h36
-rw-r--r--fs/xfs/xfs_log_cil.c780
-rw-r--r--fs/xfs/xfs_log_priv.h144
-rw-r--r--fs/xfs/xfs_log_recover.c400
-rw-r--r--fs/xfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/xfs_mount.c80
-rw-r--r--fs/xfs/xfs_mount.h72
-rw-r--r--fs/xfs/xfs_quota.h3
-rw-r--r--fs/xfs/xfs_rename.c63
-rw-r--r--fs/xfs/xfs_rtalloc.c17
-rw-r--r--fs/xfs/xfs_rtalloc.h11
-rw-r--r--fs/xfs/xfs_rw.c15
-rw-r--r--fs/xfs/xfs_trans.c1382
-rw-r--r--fs/xfs/xfs_trans.h582
-rw-r--r--fs/xfs/xfs_trans_ail.c1
-rw-r--r--fs/xfs/xfs_trans_buf.c250
-rw-r--r--fs/xfs/xfs_trans_extfree.c23
-rw-r--r--fs/xfs/xfs_trans_inode.c76
-rw-r--r--fs/xfs/xfs_trans_item.c549
-rw-r--r--fs/xfs/xfs_trans_priv.h26
-rw-r--r--fs/xfs/xfs_types.h2
-rw-r--r--fs/xfs/xfs_utils.c87
-rw-r--r--fs/xfs/xfs_utils.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c346
834 files changed, 41646 insertions, 29632 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 1a940ec7af61..91fba025fcbe 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,6 +8,8 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	vfs_dir.o \
 	vfs_dentry.o \
 	v9fs.o \
-	fid.o
+	fid.o \
+	xattr.o \
+	xattr_user.o
 
 9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 7317b39b2815..6406f896bf95 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
 	return ret;
 }
 
+/*
+ * We need to hold v9ses->rename_sem as long as we hold references
+ * to returned path array. Array element contain pointers to
+ * dentry names.
+ */
+static int build_path_from_dentry(struct v9fs_session_info *v9ses,
+				struct dentry *dentry, char ***names)
+{
+	int n = 0, i;
+	char **wnames;
+	struct dentry *ds;
+
+	for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
+		n++;
+
+	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
+	if (!wnames)
+		goto err_out;
+
+	for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent)
+		wnames[i] = (char *)ds->d_name.name;
+
+	*names = wnames;
+	return n;
+err_out:
+	return -ENOMEM;
+}
+
 /**
  * v9fs_fid_lookup - lookup for a fid, try to walk if not found
  * @dentry: dentry to look for fid in
@@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	int i, n, l, clone, any, access;
 	u32 uid;
 	struct p9_fid *fid, *old_fid = NULL;
-	struct dentry *d, *ds;
+	struct dentry *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
 
@@ -139,49 +167,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	fid = v9fs_fid_find(dentry, uid, any);
 	if (fid)
 		return fid;
-
+	/*
+	 * we don't have a matching fid. To do a TWALK we need
+	 * parent fid. We need to prevent rename when we want to
+	 * look at the parent.
+	 */
+	down_read(&v9ses->rename_sem);
 	ds = dentry->d_parent;
 	fid = v9fs_fid_find(ds, uid, any);
-	if (!fid) { /* walk from the root */
-		n = 0;
-		for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
-			n++;
+	if (fid) {
+		/* Found the parent fid do a lookup with that */
+		fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1);
+		goto fid_out;
+	}
+	up_read(&v9ses->rename_sem);
 
-		fid = v9fs_fid_find(ds, uid, any);
-		if (!fid) { /* the user is not attached to the fs yet */
-			if (access == V9FS_ACCESS_SINGLE)
-				return ERR_PTR(-EPERM);
+	/* start from the root and try to do a lookup */
+	fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+	if (!fid) {
+		/* the user is not attached to the fs yet */
+		if (access == V9FS_ACCESS_SINGLE)
+			return ERR_PTR(-EPERM);
 
-			if (v9fs_proto_dotu(v9ses))
-				uname = NULL;
-			else
-				uname = v9ses->uname;
+		if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
+			uname = NULL;
+		else
+			uname = v9ses->uname;
 
-			fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
-								v9ses->aname);
-
-			if (IS_ERR(fid))
-				return fid;
-
-			v9fs_fid_add(ds, fid);
-		}
-	} else /* walk from the parent */
-		n = 1;
+		fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
+				       v9ses->aname);
+		if (IS_ERR(fid))
+			return fid;
 
-	if (ds == dentry)
+		v9fs_fid_add(dentry->d_sb->s_root, fid);
+	}
+	/* If we are root ourself just return that */
+	if (dentry->d_sb->s_root == dentry)
 		return fid;
-
-	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
-	if (!wnames)
-		return ERR_PTR(-ENOMEM);
-
-	for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent)
-		wnames[i] = (char *) d->d_name.name;
-
+	/*
+	 * Do a multipath walk with attached root.
+	 * When walking parent we need to make sure we
+	 * don't have a parallel rename happening
+	 */
+	down_read(&v9ses->rename_sem);
+	n = build_path_from_dentry(v9ses, dentry, &wnames);
+	if (n < 0) {
+		fid = ERR_PTR(n);
+		goto err_out;
+	}
 	clone = 1;
 	i = 0;
 	while (i < n) {
 		l = min(n - i, P9_MAXWELEM);
+		/*
+		 * We need to hold rename lock when doing a multipath
+		 * walk to ensure none of the patch component change
+		 */
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
 			if (old_fid) {
@@ -193,15 +234,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 				p9_client_clunk(old_fid);
 			}
 			kfree(wnames);
-			return fid;
+			goto err_out;
 		}
 		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
-
 	kfree(wnames);
-	v9fs_fid_add(dentry, fid);
+fid_out:
+	if (!IS_ERR(fid))
+		v9fs_fid_add(dentry, fid);
+err_out:
+	up_read(&v9ses->rename_sem);
 	return fid;
 }
 
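The fid.c changes above all revolve around v9ses->rename_sem, the reader/writer semaphore declared in v9fs.h and initialized in v9fs.c below: a path walk holds pointers into dentry names, so it is only safe while no rename runs concurrently. A minimal sketch of the locking protocol follows; the reader side mirrors v9fs_fid_lookup() above, while the writer side is an assumption about what the (not shown) v9fs rename path does, and demo_session/demo_walk/demo_rename are hypothetical names.

#include <linux/rwsem.h>

struct demo_session {
	struct rw_semaphore rename_sem;	/* mirrors v9fs_session_info */
};

/* Reader side: walks that dereference d_name pointers, as in fid.c. */
static void demo_walk(struct demo_session *s)
{
	down_read(&s->rename_sem);
	/* build_path_from_dentry() + the p9_client_walk() loop go here */
	up_read(&s->rename_sem);
}

/* Assumed writer side: a rename must exclude concurrent multipath walks. */
static void demo_rename(struct demo_session *s)
{
	down_write(&s->rename_sem);
	/* issue the 9P rename and fix up the dcache here */
	up_write(&s->rename_sem);
}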
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f8b86e92cd66..38dc0e067599 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -237,6 +237,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		__putname(v9ses->uname);
 		return ERR_PTR(-ENOMEM);
 	}
+	init_rwsem(&v9ses->rename_sem);
 
 	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
 	if (rc) {
@@ -278,7 +279,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
-	if (!v9fs_proto_dotu(v9ses) &&
+	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
 		((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
 
 		v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index bec4d0bcb458..4c963c9fc41f 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -104,6 +104,7 @@ struct v9fs_session_info {
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct backing_dev_info bdi;
+	struct rw_semaphore rename_sem;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..88418c419ea7 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
@@ -50,9 +52,10 @@ void v9fs_destroy_inode(struct inode *inode);
 #endif
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
-void v9fs_clear_inode(struct inode *inode);
+void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfcee..899f168fd19c 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf)
 }
 
 /**
- * v9fs_dir_readdir - read a directory
+ * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
  * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * @buflen: Length in bytes of buffer to allocate
  *
  */
 
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 {
-	int over;
-	struct p9_wstat st;
-	int err = 0;
-	struct p9_fid *fid;
-	int buflen;
-	int reclen = 0;
 	struct p9_rdir *rdir;
+	struct p9_fid *fid;
+	int err = 0;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
-
-	buflen = fid->clnt->msize - P9_IOHDRSZ;
-
-	/* allocate rdir on demand */
 	if (!fid->rdir) {
 		rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
 
@@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		spin_unlock(&filp->f_dentry->d_lock);
 		kfree(rdir);
 	}
+exit:
+	return err;
+}
+
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filp: opened file structure
+ * @dirent: directory structure ???
+ * @filldir: function to populate directory structure ???
+ *
+ */
+
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	int over;
+	struct p9_wstat st;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	int reclen = 0;
+	struct p9_rdir *rdir;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	fid = filp->private_data;
+
+	buflen = fid->clnt->msize - P9_IOHDRSZ;
+
+	err = v9fs_alloc_rdir_buf(filp, buflen);
+	if (err)
+		goto exit;
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
@@ -146,7 +166,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		while (rdir->head < rdir->tail) {
 			p9stat_init(&st);
 			err = p9stat_read(rdir->buf + rdir->head,
-				buflen - rdir->head, &st,
+				rdir->tail - rdir->head, &st,
 				fid->clnt->proto_version);
 			if (err) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -176,6 +196,88 @@ exit:
 	return err;
 }
 
+/**
+ * v9fs_dir_readdir_dotl - read a directory
+ * @filp: opened file structure
+ * @dirent: buffer to fill dirent structures
+ * @filldir: function to populate dirent structures
+ *
+ */
+static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
+						filldir_t filldir)
+{
+	int over;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	struct p9_rdir *rdir;
+	struct p9_dirent curdirent;
+	u64 oldoffset = 0;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	fid = filp->private_data;
+
+	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
+
+	err = v9fs_alloc_rdir_buf(filp, buflen);
+	if (err)
+		goto exit;
+	rdir = (struct p9_rdir *) fid->rdir;
+
+	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
+
+	while (err == 0) {
+		if (rdir->tail == rdir->head) {
+			err = p9_client_readdir(fid, rdir->buf, buflen,
+						filp->f_pos);
+			if (err <= 0)
+				goto unlock_and_exit;
+
+			rdir->head = 0;
+			rdir->tail = err;
+		}
+
+		while (rdir->head < rdir->tail) {
+
+			err = p9dirent_read(rdir->buf + rdir->head,
+					buflen - rdir->head, &curdirent,
+					fid->clnt->proto_version);
+			if (err < 0) {
+				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+				err = -EIO;
+				goto unlock_and_exit;
+			}
+
+			/* d_off in dirent structure tracks the offset into
+			 * the next dirent in the dir. However, filldir()
+			 * expects offset into the current dirent. Hence
+			 * while calling filldir send the offset from the
+			 * previous dirent structure.
+			 */
+			over = filldir(dirent, curdirent.d_name,
+					strlen(curdirent.d_name),
+					oldoffset, v9fs_qid2ino(&curdirent.qid),
+					curdirent.d_type);
+			oldoffset = curdirent.d_off;
+
+			if (over) {
+				err = 0;
+				goto unlock_and_exit;
+			}
+
+			filp->f_pos = curdirent.d_off;
+			rdir->head += err;
+		}
+	}
+
+unlock_and_exit:
+	mutex_unlock(&rdir->mutex);
+exit:
+	return err;
+}
+
 
 /**
  * v9fs_dir_release - close a directory
@@ -190,9 +292,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 
 	fid = filp->private_data;
 	P9_DPRINTK(P9_DEBUG_VFS,
-			"inode: %p filp: %p fid: %d\n", inode, filp, fid->fid);
+			"v9fs_dir_release: inode: %p filp: %p fid: %d\n",
+			inode, filp, fid ? fid->fid : -1);
 	filemap_write_and_wait(inode->i_mapping);
-	p9_client_clunk(fid);
+	if (fid)
+		p9_client_clunk(fid);
 	return 0;
 }
 
@@ -203,3 +307,11 @@ const struct file_operations v9fs_dir_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.readdir = v9fs_dir_readdir_dotl,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..e97c92bd6f16 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,9 +59,13 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	struct p9_fid *fid;
 	int omode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
+	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
 	v9ses = v9fs_inode2v9ses(inode);
-	omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
+	if (v9fs_proto_dotl(v9ses))
+		omode = file->f_flags;
+	else
+		omode = v9fs_uflags2omode(file->f_flags,
+					v9fs_proto_dotu(v9ses));
 	fid = file->private_data;
 	if (!fid) {
 		fid = v9fs_fid_clone(file->f_path.dentry);
@@ -73,11 +77,12 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			p9_client_clunk(fid);
 			return err;
 		}
-		if (omode & P9_OTRUNC) {
+		if (file->f_flags & O_TRUNC) {
 			i_size_write(inode, 0);
 			inode->i_blocks = 0;
 		}
-		if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
+		if ((file->f_flags & O_APPEND) &&
+			(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
 			generic_file_llseek(file, 0, SEEK_END);
 	}
 
@@ -139,7 +144,7 @@ ssize_t
 v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 	       u64 offset)
 {
-	int n, total;
+	int n, total, size;
 	struct p9_fid *fid = filp->private_data;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
@@ -147,6 +152,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 
 	n = 0;
 	total = 0;
+	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
 	do {
 		n = p9_client_read(fid, data, udata, offset, count);
 		if (n <= 0)
@@ -160,7 +166,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 		offset += n;
 		count -= n;
 		total += n;
-	} while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+	} while (count > 0 && n == size);
 
 	if (n < 0)
 		total = n;
@@ -183,11 +189,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 {
 	int ret;
 	struct p9_fid *fid;
+	size_t size;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
 	fid = filp->private_data;
 
-	if (count > (fid->clnt->msize - P9_IOHDRSZ))
+	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
+	if (count > size)
 		ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
 	else
 		ret = p9_client_read(fid, NULL, udata, *offset, count);
@@ -224,9 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	fid = filp->private_data;
 	clnt = fid->clnt;
 
-	rsize = fid->iounit;
-	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
-		rsize = clnt->msize - P9_IOHDRSZ;
+	rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ;
 
 	do {
 		if (count < rsize)
@@ -257,15 +263,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	return total;
 }
 
-static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
-					int datasync)
+static int v9fs_file_fsync(struct file *filp, int datasync)
 {
 	struct p9_fid *fid;
 	struct p9_wstat wstat;
 	int retval;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
-						dentry, datasync);
+	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
@@ -296,3 +300,14 @@ const struct file_operations v9fs_file_operations = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync,
 };
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f2434fc9d2c4..9e670d527646 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -35,6 +35,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -42,11 +43,15 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 #include "cache.h"
+#include "xattr.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -233,6 +238,41 @@ void v9fs_destroy_inode(struct inode *inode)
 #endif
 
 /**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+
+static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+	BUG_ON(dir_inode == NULL);
+
+	if (dir_inode->i_mode & S_ISGID) {
+		/* set_gid bit is set.*/
+		return dir_inode->i_gid;
+	}
+	return current_fsgid();
+}
+
+/**
+ * v9fs_dentry_from_dir_inode - helper function to get the dentry from
+ * dir inode.
+ *
+ */
+
+static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	spin_lock(&dcache_lock);
+	/* Directory should have only one entry. */
+	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
+	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+	spin_unlock(&dcache_lock);
+	return dentry;
+}
+
+/**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
  * @mode: mode to setup inode with
@@ -253,9 +293,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -266,7 +304,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	case S_IFBLK:
 	case S_IFCHR:
 	case S_IFSOCK:
-		if (!v9fs_proto_dotu(v9ses)) {
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else if (v9fs_proto_dotu(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		} else {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "special files without extended mode\n");
 			err = -EINVAL;
@@ -275,25 +319,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	case S_IFREG:
-		inode->i_op = &v9fs_file_inode_operations;
-		inode->i_fop = &v9fs_file_operations;
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		}
+
 		break;
+
 	case S_IFLNK:
-		if (!v9fs_proto_dotu(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "extended modes used w/o 9P2000.u\n");
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
+						"legacy protocol.\n");
 			err = -EINVAL;
 			goto error;
 		}
-		inode->i_op = &v9fs_symlink_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_proto_dotu(v9ses))
-			inode->i_op = &v9fs_dir_inode_operations_ext;
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
-		inode->i_fop = &v9fs_dir_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
 		break;
 	default:
 		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -367,8 +430,10 @@ error:
  * @inode: inode to release
  *
  */
-void v9fs_clear_inode(struct inode *inode)
+void v9fs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(inode->i_mapping, 0);
+	end_writeback(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
 #ifdef CONFIG_9P_FSCACHE
@@ -376,23 +441,14 @@ void v9fs_evict_inode(struct inode *inode)
 #endif
 }
 
-/**
- * v9fs_inode_from_fid - populate an inode by issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-
 static struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	struct super_block *sb)
 {
 	int err, umode;
-	struct inode *ret;
+	struct inode *ret = NULL;
 	struct p9_wstat *st;
 
-	ret = NULL;
 	st = p9_client_stat(fid);
 	if (IS_ERR(st))
 		return ERR_CAST(st);
@@ -413,15 +469,62 @@ v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 #endif
 	p9stat_free(st);
 	kfree(st);
-
 	return ret;
-
 error:
 	p9stat_free(st);
 	kfree(st);
 	return ERR_PTR(err);
 }
 
+static struct inode *
+v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+	struct super_block *sb)
+{
+	struct inode *ret = NULL;
+	int err;
+	struct p9_stat_dotl *st;
+
+	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+	if (IS_ERR(st))
+		return ERR_CAST(st);
+
+	ret = v9fs_get_inode(sb, st->st_mode);
+	if (IS_ERR(ret)) {
+		err = PTR_ERR(ret);
+		goto error;
+	}
+
+	v9fs_stat2inode_dotl(st, ret);
+	ret->i_ino = v9fs_qid2ino(&st->qid);
+#ifdef CONFIG_9P_FSCACHE
+	v9fs_vcookie_set_qid(ret, &st->qid);
+	v9fs_cache_inode_get_cookie(ret);
+#endif
+	kfree(st);
+	return ret;
+error:
+	kfree(st);
+	return ERR_PTR(err);
+}
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+	struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_dotl(v9ses, fid, sb);
+	else
+		return v9fs_inode(v9ses, fid, sb);
+}
+
 /**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
@@ -434,14 +537,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
434{ 537{
435 int retval; 538 int retval;
436 struct inode *file_inode; 539 struct inode *file_inode;
437 struct v9fs_session_info *v9ses;
438 struct p9_fid *v9fid; 540 struct p9_fid *v9fid;
439 541
440 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 542 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
441 rmdir); 543 rmdir);
442 544
443 file_inode = file->d_inode; 545 file_inode = file->d_inode;
444 v9ses = v9fs_inode2v9ses(file_inode);
445 v9fid = v9fs_fid_clone(file); 546 v9fid = v9fs_fid_clone(file);
446 if (IS_ERR(v9fid)) 547 if (IS_ERR(v9fid))
447 return PTR_ERR(v9fid); 548 return PTR_ERR(v9fid);
@@ -484,12 +585,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
484 ofid = NULL; 585 ofid = NULL;
485 fid = NULL; 586 fid = NULL;
486 name = (char *) dentry->d_name.name; 587 name = (char *) dentry->d_name.name;
487 dfid = v9fs_fid_clone(dentry->d_parent); 588 dfid = v9fs_fid_lookup(dentry->d_parent);
488 if (IS_ERR(dfid)) { 589 if (IS_ERR(dfid)) {
489 err = PTR_ERR(dfid); 590 err = PTR_ERR(dfid);
490 P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); 591 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
491 dfid = NULL; 592 return ERR_PTR(err);
492 goto error;
493 } 593 }
494 594
495 /* clone a fid to use for creation */ 595 /* clone a fid to use for creation */
@@ -497,8 +597,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
497 if (IS_ERR(ofid)) { 597 if (IS_ERR(ofid)) {
498 err = PTR_ERR(ofid); 598 err = PTR_ERR(ofid);
499 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 599 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
500 ofid = NULL; 600 return ERR_PTR(err);
501 goto error;
502 } 601 }
503 602
504 err = p9_client_fcreate(ofid, name, perm, mode, extension); 603 err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -508,14 +607,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
508 } 607 }
509 608
510 /* now walk from the parent so we can get unopened fid */ 609 /* now walk from the parent so we can get unopened fid */
511 fid = p9_client_walk(dfid, 1, &name, 0); 610 fid = p9_client_walk(dfid, 1, &name, 1);
512 if (IS_ERR(fid)) { 611 if (IS_ERR(fid)) {
513 err = PTR_ERR(fid); 612 err = PTR_ERR(fid);
514 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 613 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
515 fid = NULL; 614 fid = NULL;
516 goto error; 615 goto error;
517 } else 616 }
518 dfid = NULL;
519 617
520 /* instantiate inode and assign the unopened fid to the dentry */ 618 /* instantiate inode and assign the unopened fid to the dentry */
521 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 619 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -538,9 +636,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
538 return ofid; 636 return ofid;
539 637
540error: 638error:
541 if (dfid)
542 p9_client_clunk(dfid);
543
544 if (ofid) 639 if (ofid)
545 p9_client_clunk(ofid); 640 p9_client_clunk(ofid);
546 641
@@ -551,6 +646,121 @@ error:
551} 646}
552 647
553/** 648/**
649 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
 650 * @dir: inode of the parent directory
 651 * @dentry: dentry of the file being created
652 * @mode: create permissions
653 * @nd: path information
654 *
655 */
656
657static int
658v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
659 struct nameidata *nd)
660{
661 int err = 0;
662 char *name = NULL;
663 gid_t gid;
664 int flags;
665 struct v9fs_session_info *v9ses;
666 struct p9_fid *fid = NULL;
667 struct p9_fid *dfid, *ofid;
668 struct file *filp;
669 struct p9_qid qid;
670 struct inode *inode;
671
672 v9ses = v9fs_inode2v9ses(dir);
673 if (nd && nd->flags & LOOKUP_OPEN)
674 flags = nd->intent.open.flags - 1;
675 else
676 flags = O_RDWR;
677
678 name = (char *) dentry->d_name.name;
679 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
680 "mode:0x%x\n", name, flags, mode);
681
682 dfid = v9fs_fid_lookup(dentry->d_parent);
683 if (IS_ERR(dfid)) {
684 err = PTR_ERR(dfid);
685 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
686 return err;
687 }
688
689 /* clone a fid to use for creation */
690 ofid = p9_client_walk(dfid, 0, NULL, 1);
691 if (IS_ERR(ofid)) {
692 err = PTR_ERR(ofid);
693 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
694 return err;
695 }
696
697 gid = v9fs_get_fsgid_for_create(dir);
698 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
699 if (err < 0) {
700 P9_DPRINTK(P9_DEBUG_VFS,
701 "p9_client_open_dotl failed in creat %d\n",
702 err);
703 goto error;
704 }
705
706 /* No need to populate the inode if we are not opening the file AND
707 * not in cached mode.
708 */
709 if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
710 /* Not in cached mode. No need to populate inode with stat */
711 dentry->d_op = &v9fs_dentry_operations;
712 p9_client_clunk(ofid);
713 d_instantiate(dentry, NULL);
714 return 0;
715 }
716
717 /* Now walk from the parent so we can get an unopened fid. */
718 fid = p9_client_walk(dfid, 1, &name, 1);
719 if (IS_ERR(fid)) {
720 err = PTR_ERR(fid);
721 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
722 fid = NULL;
723 goto error;
724 }
725
726 /* instantiate inode and assign the unopened fid to dentry */
727 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
728 if (IS_ERR(inode)) {
729 err = PTR_ERR(inode);
730 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
731 goto error;
732 }
733 if (v9ses->cache)
734 dentry->d_op = &v9fs_cached_dentry_operations;
735 else
736 dentry->d_op = &v9fs_dentry_operations;
737 d_instantiate(dentry, inode);
738 err = v9fs_fid_add(dentry, fid);
739 if (err < 0)
740 goto error;
741
742 /* if we are opening a file, assign the open fid to the file */
743 if (nd && nd->flags & LOOKUP_OPEN) {
744 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
745 if (IS_ERR(filp)) {
746 p9_client_clunk(ofid);
747 return PTR_ERR(filp);
748 }
749 filp->private_data = ofid;
750 } else
751 p9_client_clunk(ofid);
752
753 return 0;
754
755error:
756 if (ofid)
757 p9_client_clunk(ofid);
758 if (fid)
759 p9_client_clunk(fid);
760 return err;
761}
762
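[Editorial note on the `flags = nd->intent.open.flags - 1` conversion near the top of v9fs_vfs_create_dotl(): on kernels of this vintage the VFS stores the open intent with the access mode offset by one (O_RDONLY/O_WRONLY/O_RDWR become 1/2/3 so they can be mask-tested), and subtracting one recovers the O_* value the 9P server expects. This is an assumption about the namei encoding, sketched as a hypothetical helper:

	/* hypothetical helper; assumes the pre-2.6.37 namei flag encoding */
	static inline int v9fs_open_flags_from_intent(struct nameidata *nd)
	{
		if (nd && (nd->flags & LOOKUP_OPEN))
			return nd->intent.open.flags - 1;	/* namei -> O_* */
		return O_RDWR;			/* same default as the code above */
	}
]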
763/**
554 * v9fs_vfs_create - VFS hook to create files 764 * v9fs_vfs_create - VFS hook to create files
555 * @dir: directory inode that is being created 765 * @dir: directory inode that is being created
556 * @dentry: dentry that is being deleted 766 * @dentry: dentry that is being deleted
@@ -640,6 +850,83 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
640 return err; 850 return err;
641} 851}
642 852
853
854/**
855 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
 856 * @dir: inode of the parent directory
 857 * @dentry: dentry of the directory being created
858 * @mode: mode for new directory
859 *
860 */
861
862static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
863 int mode)
864{
865 int err;
866 struct v9fs_session_info *v9ses;
867 struct p9_fid *fid = NULL, *dfid = NULL;
868 gid_t gid;
869 char *name;
870 struct inode *inode;
871 struct p9_qid qid;
872 struct dentry *dir_dentry;
873
874 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
875 err = 0;
876 v9ses = v9fs_inode2v9ses(dir);
877
878 mode |= S_IFDIR;
879 dir_dentry = v9fs_dentry_from_dir_inode(dir);
880 dfid = v9fs_fid_lookup(dir_dentry);
881 if (IS_ERR(dfid)) {
882 err = PTR_ERR(dfid);
883 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
884 dfid = NULL;
885 goto error;
886 }
887
888 gid = v9fs_get_fsgid_for_create(dir);
889 if (gid < 0) {
890 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
891 goto error;
892 }
893
894 name = (char *) dentry->d_name.name;
895 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
896 if (err < 0)
897 goto error;
898
899 /* instantiate inode and assign the unopened fid to the dentry */
900 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
901 fid = p9_client_walk(dfid, 1, &name, 1);
902 if (IS_ERR(fid)) {
903 err = PTR_ERR(fid);
904 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
905 err);
906 fid = NULL;
907 goto error;
908 }
909
910 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
911 if (IS_ERR(inode)) {
912 err = PTR_ERR(inode);
913 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
914 err);
915 goto error;
916 }
917 dentry->d_op = &v9fs_cached_dentry_operations;
918 d_instantiate(dentry, inode);
919 err = v9fs_fid_add(dentry, fid);
920 if (err < 0)
921 goto error;
922 fid = NULL;
923 }
924error:
925 if (fid)
926 p9_client_clunk(fid);
927 return err;
928}
929
643/** 930/**
644 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode 931 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
645 * @dir: inode that is being walked from 932 * @dir: inode that is being walked from
@@ -666,6 +953,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
666 953
667 sb = dir->i_sb; 954 sb = dir->i_sb;
668 v9ses = v9fs_inode2v9ses(dir); 955 v9ses = v9fs_inode2v9ses(dir);
956 /* We can walk d_parent because we hold the dir->i_mutex */
669 dfid = v9fs_fid_lookup(dentry->d_parent); 957 dfid = v9fs_fid_lookup(dentry->d_parent);
670 if (IS_ERR(dfid)) 958 if (IS_ERR(dfid))
671 return ERR_CAST(dfid); 959 return ERR_CAST(dfid);
@@ -675,8 +963,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
675 if (IS_ERR(fid)) { 963 if (IS_ERR(fid)) {
676 result = PTR_ERR(fid); 964 result = PTR_ERR(fid);
677 if (result == -ENOENT) { 965 if (result == -ENOENT) {
678 d_add(dentry, NULL); 966 inode = NULL;
679 return NULL; 967 goto inst_out;
680 } 968 }
681 969
682 return ERR_PTR(result); 970 return ERR_PTR(result);
@@ -693,7 +981,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
693 if (result < 0) 981 if (result < 0)
694 goto error; 982 goto error;
695 983
696 if ((fid->qid.version) && (v9ses->cache)) 984inst_out:
985 if (v9ses->cache)
697 dentry->d_op = &v9fs_cached_dentry_operations; 986 dentry->d_op = &v9fs_cached_dentry_operations;
698 else 987 else
699 dentry->d_op = &v9fs_dentry_operations; 988 dentry->d_op = &v9fs_dentry_operations;
@@ -772,20 +1061,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
772 goto clunk_olddir; 1061 goto clunk_olddir;
773 } 1062 }
774 1063
775 /* 9P can only handle file rename in the same directory */ 1064 down_write(&v9ses->rename_sem);
776 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { 1065 if (v9fs_proto_dotl(v9ses)) {
1066 retval = p9_client_rename(oldfid, newdirfid,
1067 (char *) new_dentry->d_name.name);
1068 if (retval != -ENOSYS)
1069 goto clunk_newdir;
1070 }
1071 if (old_dentry->d_parent != new_dentry->d_parent) {
1072 /*
1073 * 9P .u can only handle file rename in the same directory
1074 */
1075
777 P9_DPRINTK(P9_DEBUG_ERROR, 1076 P9_DPRINTK(P9_DEBUG_ERROR,
778 "old dir and new dir are different\n"); 1077 "old dir and new dir are different\n");
779 retval = -EXDEV; 1078 retval = -EXDEV;
780 goto clunk_newdir; 1079 goto clunk_newdir;
781 } 1080 }
782
783 v9fs_blank_wstat(&wstat); 1081 v9fs_blank_wstat(&wstat);
784 wstat.muid = v9ses->uname; 1082 wstat.muid = v9ses->uname;
785 wstat.name = (char *) new_dentry->d_name.name; 1083 wstat.name = (char *) new_dentry->d_name.name;
786 retval = p9_client_wstat(oldfid, &wstat); 1084 retval = p9_client_wstat(oldfid, &wstat);
787 1085
788clunk_newdir: 1086clunk_newdir:
1087 if (!retval)
1088 /* successful rename */
1089 d_move(old_dentry, new_dentry);
1090 up_write(&v9ses->rename_sem);
789 p9_client_clunk(newdirfid); 1091 p9_client_clunk(newdirfid);
790 1092
791clunk_olddir: 1093clunk_olddir:
@@ -829,6 +1131,43 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
829 v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); 1131 v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
830 generic_fillattr(dentry->d_inode, stat); 1132 generic_fillattr(dentry->d_inode, stat);
831 1133
1134 p9stat_free(st);
1135 kfree(st);
1136 return 0;
1137}
1138
1139static int
1140v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
1141 struct kstat *stat)
1142{
1143 int err;
1144 struct v9fs_session_info *v9ses;
1145 struct p9_fid *fid;
1146 struct p9_stat_dotl *st;
1147
1148 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
1149 err = -EPERM;
1150 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1151 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
1152 return simple_getattr(mnt, dentry, stat);
1153
1154 fid = v9fs_fid_lookup(dentry);
1155 if (IS_ERR(fid))
1156 return PTR_ERR(fid);
1157
1158 /* Ask for all the fields in the stat structure. The server will
1159 * return whatever it supports.
1160 */
1161
1162 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
1163 if (IS_ERR(st))
1164 return PTR_ERR(st);
1165
1166 v9fs_stat2inode_dotl(st, dentry->d_inode);
1167 generic_fillattr(dentry->d_inode, stat);
1168 /* Change block size to what the server returned */
1169 stat->blksize = st->st_blksize;
1170
832 kfree(st); 1171 kfree(st);
833 return 0; 1172 return 0;
834} 1173}
@@ -876,10 +1215,71 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
876 } 1215 }
877 1216
878 retval = p9_client_wstat(fid, &wstat); 1217 retval = p9_client_wstat(fid, &wstat);
879 if (retval >= 0) 1218 if (retval < 0)
880 retval = inode_setattr(dentry->d_inode, iattr); 1219 return retval;
1220
1221 if ((iattr->ia_valid & ATTR_SIZE) &&
1222 iattr->ia_size != i_size_read(dentry->d_inode)) {
1223 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
1224 if (retval)
1225 return retval;
1226 }
881 1227
882 return retval; 1228 setattr_copy(dentry->d_inode, iattr);
1229 mark_inode_dirty(dentry->d_inode);
1230 return 0;
1231}
1232
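[Editorial note: the rewrite above tracks the 2.6.36 removal of inode_setattr(). Truncation is now the filesystem's explicit job, after which setattr_copy() transfers the validated attributes and the inode is marked dirty. The same sequence reappears in v9fs_vfs_setattr_dotl() below and in the affs hunk later in this diff; roughly, what the old single call used to bundle:

	/* what inode_setattr() used to do implicitly, now spelled out */
	if ((iattr->ia_valid & ATTR_SIZE) &&
	    iattr->ia_size != i_size_read(inode))
		vmtruncate(inode, iattr->ia_size);	/* explicit truncation */
	setattr_copy(inode, iattr);		/* copy in-core attributes */
	mark_inode_dirty(inode);
]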
1233/**
1234 * v9fs_vfs_setattr_dotl - set file metadata
1235 * @dentry: file whose metadata to set
1236 * @iattr: metadata assignment structure
1237 *
1238 */
1239
1240static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
1241{
1242 int retval;
1243 struct v9fs_session_info *v9ses;
1244 struct p9_fid *fid;
1245 struct p9_iattr_dotl p9attr;
1246
1247 P9_DPRINTK(P9_DEBUG_VFS, "\n");
1248
1249 retval = inode_change_ok(dentry->d_inode, iattr);
1250 if (retval)
1251 return retval;
1252
1253 p9attr.valid = iattr->ia_valid;
1254 p9attr.mode = iattr->ia_mode;
1255 p9attr.uid = iattr->ia_uid;
1256 p9attr.gid = iattr->ia_gid;
1257 p9attr.size = iattr->ia_size;
1258 p9attr.atime_sec = iattr->ia_atime.tv_sec;
1259 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
1260 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
1261 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
1262
1263 retval = -EPERM;
1264 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1265 fid = v9fs_fid_lookup(dentry);
1266 if (IS_ERR(fid))
1267 return PTR_ERR(fid);
1268
1269 retval = p9_client_setattr(fid, &p9attr);
1270 if (retval < 0)
1271 return retval;
1272
1273 if ((iattr->ia_valid & ATTR_SIZE) &&
1274 iattr->ia_size != i_size_read(dentry->d_inode)) {
1275 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
1276 if (retval)
1277 return retval;
1278 }
1279
1280 setattr_copy(dentry->d_inode, iattr);
1281 mark_inode_dirty(dentry->d_inode);
1282 return 0;
883} 1283}
884 1284
885/** 1285/**
@@ -960,6 +1360,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
960} 1360}
961 1361
962/** 1362/**
1363 * v9fs_stat2inode_dotl - populate an inode structure with stat info
1364 * @stat: stat structure
1365 * @inode: inode to populate
1366 * Unlike v9fs_stat2inode(), no superblock argument is needed.
1367 *
1368 */
1369
1370void
1371v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
1372{
1373
1374 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
1375 inode->i_atime.tv_sec = stat->st_atime_sec;
1376 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1377 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1378 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1379 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1380 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1381 inode->i_uid = stat->st_uid;
1382 inode->i_gid = stat->st_gid;
1383 inode->i_nlink = stat->st_nlink;
1384 inode->i_mode = stat->st_mode;
1385 inode->i_rdev = new_decode_dev(stat->st_rdev);
1386
1387 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
1388 init_special_inode(inode, inode->i_mode, inode->i_rdev);
1389
1390 i_size_write(inode, stat->st_size);
1391 inode->i_blocks = stat->st_blocks;
1392 } else {
1393 if (stat->st_result_mask & P9_STATS_ATIME) {
1394 inode->i_atime.tv_sec = stat->st_atime_sec;
1395 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1396 }
1397 if (stat->st_result_mask & P9_STATS_MTIME) {
1398 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1399 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1400 }
1401 if (stat->st_result_mask & P9_STATS_CTIME) {
1402 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1403 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1404 }
1405 if (stat->st_result_mask & P9_STATS_UID)
1406 inode->i_uid = stat->st_uid;
1407 if (stat->st_result_mask & P9_STATS_GID)
1408 inode->i_gid = stat->st_gid;
1409 if (stat->st_result_mask & P9_STATS_NLINK)
1410 inode->i_nlink = stat->st_nlink;
1411 if (stat->st_result_mask & P9_STATS_MODE) {
1412 inode->i_mode = stat->st_mode;
1413 if ((S_ISBLK(inode->i_mode)) ||
1414 (S_ISCHR(inode->i_mode)))
1415 init_special_inode(inode, inode->i_mode,
1416 inode->i_rdev);
1417 }
1418 if (stat->st_result_mask & P9_STATS_RDEV)
1419 inode->i_rdev = new_decode_dev(stat->st_rdev);
1420 if (stat->st_result_mask & P9_STATS_SIZE)
1421 i_size_write(inode, stat->st_size);
1422 if (stat->st_result_mask & P9_STATS_BLOCKS)
1423 inode->i_blocks = stat->st_blocks;
1424 }
1425 if (stat->st_result_mask & P9_STATS_GEN)
1426 inode->i_generation = stat->st_gen;
1427
1428 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
1429 * because the inode structure does not have fields for them.
1430 */
1431}
1432
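[Editorial note: because v9fs_stat2inode_dotl() honors st_result_mask field by field, a caller may ask for less than P9_STATS_ALL and still apply the reply safely. A hedged usage sketch using only names that appear in this file:

	struct p9_stat_dotl *st;

	/* request only mode and size; the server may return even less */
	st = p9_client_getattr_dotl(fid, P9_STATS_MODE | P9_STATS_SIZE);
	if (!IS_ERR(st)) {
		v9fs_stat2inode_dotl(st, inode);  /* applies only returned fields */
		kfree(st);
	}
]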
1433/**
963 * v9fs_qid2ino - convert qid into inode number 1434 * v9fs_qid2ino - convert qid into inode number
964 * @qid: qid to hash 1435 * @qid: qid to hash
965 * 1436 *
@@ -1002,7 +1473,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1002 if (IS_ERR(fid)) 1473 if (IS_ERR(fid))
1003 return PTR_ERR(fid); 1474 return PTR_ERR(fid);
1004 1475
1005 if (!v9fs_proto_dotu(v9ses)) 1476 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))
1006 return -EBADF; 1477 return -EBADF;
1007 1478
1008 st = p9_client_stat(fid); 1479 st = p9_client_stat(fid);
@@ -1022,6 +1493,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1022 1493
1023 retval = strnlen(buffer, buflen); 1494 retval = strnlen(buffer, buflen);
1024done: 1495done:
1496 p9stat_free(st);
1025 kfree(st); 1497 kfree(st);
1026 return retval; 1498 return retval;
1027} 1499}
@@ -1108,6 +1580,99 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1108} 1580}
1109 1581
1110/** 1582/**
1583 * v9fs_vfs_symlink_dotl - helper function to create symlinks
1584 * @dir: directory inode containing symlink
1585 * @dentry: dentry for symlink
1586 * @symname: symlink data
1587 *
1588 * See Also: 9P2000.L RFC for more information
1589 *
1590 */
1591
1592static int
1593v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
1594 const char *symname)
1595{
1596 struct v9fs_session_info *v9ses;
1597 struct p9_fid *dfid;
1598 struct p9_fid *fid = NULL;
1599 struct inode *inode;
1600 struct p9_qid qid;
1601 char *name;
1602 int err;
1603 gid_t gid;
1604
1605 name = (char *) dentry->d_name.name;
1606 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
1607 dir->i_ino, name, symname);
1608 v9ses = v9fs_inode2v9ses(dir);
1609
1610 dfid = v9fs_fid_lookup(dentry->d_parent);
1611 if (IS_ERR(dfid)) {
1612 err = PTR_ERR(dfid);
1613 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1614 return err;
1615 }
1616
1617 gid = v9fs_get_fsgid_for_create(dir);
1618
1619 if (gid < 0) {
1620 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed %d\n", gid);
1621 goto error;
1622 }
1623
1624 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
1625 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
1626
1627 if (err < 0) {
1628 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
1629 goto error;
1630 }
1631
1632 if (v9ses->cache) {
1633 /* Now walk from the parent so we can get an unopened fid. */
1634 fid = p9_client_walk(dfid, 1, &name, 1);
1635 if (IS_ERR(fid)) {
1636 err = PTR_ERR(fid);
1637 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1638 err);
1639 fid = NULL;
1640 goto error;
1641 }
1642
1643 /* instantiate inode and assign the unopened fid to dentry */
1644 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1645 if (IS_ERR(inode)) {
1646 err = PTR_ERR(inode);
1647 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1648 err);
1649 goto error;
1650 }
1651 dentry->d_op = &v9fs_cached_dentry_operations;
1652 d_instantiate(dentry, inode);
1653 err = v9fs_fid_add(dentry, fid);
1654 if (err < 0)
1655 goto error;
1656 fid = NULL;
1657 } else {
1658 /* Not in cached mode. No need to populate inode with stat */
1659 inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
1660 if (IS_ERR(inode)) {
1661 err = PTR_ERR(inode);
1662 goto error;
1663 }
1664 dentry->d_op = &v9fs_dentry_operations;
1665 d_instantiate(dentry, inode);
1666 }
1667
1668error:
1669 if (fid)
1670 p9_client_clunk(fid);
1671
1672 return err;
1673}
1674
1675/**
1111 * v9fs_vfs_symlink - helper function to create symlinks 1676 * v9fs_vfs_symlink - helper function to create symlinks
1112 * @dir: directory inode containing symlink 1677 * @dir: directory inode containing symlink
1113 * @dentry: dentry for symlink 1678 * @dentry: dentry for symlink
@@ -1166,6 +1731,76 @@ clunk_fid:
1166} 1731}
1167 1732
1168/** 1733/**
1734 * v9fs_vfs_link_dotl - create a hardlink for dotl
1735 * @old_dentry: dentry for file to link to
1736 * @dir: inode destination for new link
1737 * @dentry: dentry for link
1738 *
1739 */
1740
1741static int
1742v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
1743 struct dentry *dentry)
1744{
1745 int err;
1746 struct p9_fid *dfid, *oldfid;
1747 char *name;
1748 struct v9fs_session_info *v9ses;
1749 struct dentry *dir_dentry;
1750
1751 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
1752 dir->i_ino, old_dentry->d_name.name,
1753 dentry->d_name.name);
1754
1755 v9ses = v9fs_inode2v9ses(dir);
1756 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1757 dfid = v9fs_fid_lookup(dir_dentry);
1758 if (IS_ERR(dfid))
1759 return PTR_ERR(dfid);
1760
1761 oldfid = v9fs_fid_lookup(old_dentry);
1762 if (IS_ERR(oldfid))
1763 return PTR_ERR(oldfid);
1764
1765 name = (char *) dentry->d_name.name;
1766
1767 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
1768
1769 if (err < 0) {
1770 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
1771 return err;
1772 }
1773
1774 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1775 /* Get the latest stat info from server. */
1776 struct p9_fid *fid;
1777 struct p9_stat_dotl *st;
1778
1779 fid = v9fs_fid_lookup(old_dentry);
1780 if (IS_ERR(fid))
1781 return PTR_ERR(fid);
1782
1783 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
1784 if (IS_ERR(st))
1785 return PTR_ERR(st);
1786
1787 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
1788
1789 kfree(st);
1790 } else {
1791 /* Caching disabled. No need to get up-to-date stat info.
1792 * This dentry will be released immediately. So, just i_count++
1793 */
1794 atomic_inc(&old_dentry->d_inode->i_count);
1795 }
1796
1797 dentry->d_op = old_dentry->d_op;
1798 d_instantiate(dentry, old_dentry->d_inode);
1799
1800 return err;
1801}
1802
1803/**
1169 * v9fs_vfs_mknod - create a special file 1804 * v9fs_vfs_mknod - create a special file
1170 * @dir: inode destination for new link 1805 * @dir: inode destination for new link
1171 * @dentry: dentry for file 1806 * @dentry: dentry for file
@@ -1197,6 +1832,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1197 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); 1832 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
1198 else if (S_ISFIFO(mode)) 1833 else if (S_ISFIFO(mode))
1199 *name = 0; 1834 *name = 0;
1835 else if (S_ISSOCK(mode))
1836 *name = 0;
1200 else { 1837 else {
1201 __putname(name); 1838 __putname(name);
1202 return -EINVAL; 1839 return -EINVAL;
@@ -1208,7 +1845,101 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1208 return retval; 1845 return retval;
1209} 1846}
1210 1847
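[Editorial note, for contrast with the dotl mknod below: legacy 9P2000.u has no mknod operation, so v9fs_vfs_mknod() above encodes the device in the create extension string ("b major minor" for block, "c major minor" for char, empty for FIFO and, with this hunk, socket). A sketch of the encoding, using the same format the function builds:

	char ext[32];

	if (S_ISBLK(mode))
		snprintf(ext, sizeof(ext), "b %u %u", MAJOR(rdev), MINOR(rdev));
	else if (S_ISCHR(mode))
		snprintf(ext, sizeof(ext), "c %u %u", MAJOR(rdev), MINOR(rdev));
	else
		ext[0] = '\0';	/* FIFO/socket: type is carried in perm bits */
]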
1211static const struct inode_operations v9fs_dir_inode_operations_ext = { 1848/**
1849 * v9fs_vfs_mknod_dotl - create a special file
1850 * @dir: inode of the parent directory
1851 * @dentry: dentry for file
1852 * @mode: mode for creation
1853 * @rdev: device associated with special file
1854 *
1855 */
1856static int
1857v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
1858 dev_t rdev)
1859{
1860 int err;
1861 char *name;
1862 struct v9fs_session_info *v9ses;
1863 struct p9_fid *fid = NULL, *dfid = NULL;
1864 struct inode *inode;
1865 gid_t gid;
1866 struct p9_qid qid;
1867 struct dentry *dir_dentry;
1868
1869 P9_DPRINTK(P9_DEBUG_VFS,
1870 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
1871 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
1872
1873 if (!new_valid_dev(rdev))
1874 return -EINVAL;
1875
1876 v9ses = v9fs_inode2v9ses(dir);
1877 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1878 dfid = v9fs_fid_lookup(dir_dentry);
1879 if (IS_ERR(dfid)) {
1880 err = PTR_ERR(dfid);
1881 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1882 dfid = NULL;
1883 goto error;
1884 }
1885
1886 gid = v9fs_get_fsgid_for_create(dir);
1887 if (gid < 0) {
1888 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
1889 goto error;
1890 }
1891
1892 name = (char *) dentry->d_name.name;
1893
1894 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
1895 if (err < 0)
1896 goto error;
1897
1898 /* instantiate inode and assign the unopened fid to the dentry */
1899 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1900 fid = p9_client_walk(dfid, 1, &name, 1);
1901 if (IS_ERR(fid)) {
1902 err = PTR_ERR(fid);
1903 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1904 err);
1905 fid = NULL;
1906 goto error;
1907 }
1908
1909 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1910 if (IS_ERR(inode)) {
1911 err = PTR_ERR(inode);
1912 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1913 err);
1914 goto error;
1915 }
1916 dentry->d_op = &v9fs_cached_dentry_operations;
1917 d_instantiate(dentry, inode);
1918 err = v9fs_fid_add(dentry, fid);
1919 if (err < 0)
1920 goto error;
1921 fid = NULL;
1922 } else {
1923 /*
1924 * Not in cached mode. No need to populate inode with stat.
1925 * the socket syscall returns an fd, so we still need to instantiate
1926 */
1927 inode = v9fs_get_inode(dir->i_sb, mode);
1928 if (IS_ERR(inode)) {
1929 err = PTR_ERR(inode);
1930 goto error;
1931 }
1932 dentry->d_op = &v9fs_dentry_operations;
1933 d_instantiate(dentry, inode);
1934 }
1935
1936error:
1937 if (fid)
1938 p9_client_clunk(fid);
1939 return err;
1940}
1941
1942static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1212 .create = v9fs_vfs_create, 1943 .create = v9fs_vfs_create,
1213 .lookup = v9fs_vfs_lookup, 1944 .lookup = v9fs_vfs_lookup,
1214 .symlink = v9fs_vfs_symlink, 1945 .symlink = v9fs_vfs_symlink,
@@ -1222,6 +1953,25 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = {
1222 .setattr = v9fs_vfs_setattr, 1953 .setattr = v9fs_vfs_setattr,
1223}; 1954};
1224 1955
1956static const struct inode_operations v9fs_dir_inode_operations_dotl = {
1957 .create = v9fs_vfs_create_dotl,
1958 .lookup = v9fs_vfs_lookup,
1959 .link = v9fs_vfs_link_dotl,
1960 .symlink = v9fs_vfs_symlink_dotl,
1961 .unlink = v9fs_vfs_unlink,
1962 .mkdir = v9fs_vfs_mkdir_dotl,
1963 .rmdir = v9fs_vfs_rmdir,
1964 .mknod = v9fs_vfs_mknod_dotl,
1965 .rename = v9fs_vfs_rename,
1966 .getattr = v9fs_vfs_getattr_dotl,
1967 .setattr = v9fs_vfs_setattr_dotl,
1968 .setxattr = generic_setxattr,
1969 .getxattr = generic_getxattr,
1970 .removexattr = generic_removexattr,
1971 .listxattr = v9fs_listxattr,
1972
1973};
1974
1225static const struct inode_operations v9fs_dir_inode_operations = { 1975static const struct inode_operations v9fs_dir_inode_operations = {
1226 .create = v9fs_vfs_create, 1976 .create = v9fs_vfs_create,
1227 .lookup = v9fs_vfs_lookup, 1977 .lookup = v9fs_vfs_lookup,
@@ -1239,6 +1989,15 @@ static const struct inode_operations v9fs_file_inode_operations = {
1239 .setattr = v9fs_vfs_setattr, 1989 .setattr = v9fs_vfs_setattr,
1240}; 1990};
1241 1991
1992static const struct inode_operations v9fs_file_inode_operations_dotl = {
1993 .getattr = v9fs_vfs_getattr_dotl,
1994 .setattr = v9fs_vfs_setattr_dotl,
1995 .setxattr = generic_setxattr,
1996 .getxattr = generic_getxattr,
1997 .removexattr = generic_removexattr,
1998 .listxattr = v9fs_listxattr,
1999};
2000
1242static const struct inode_operations v9fs_symlink_inode_operations = { 2001static const struct inode_operations v9fs_symlink_inode_operations = {
1243 .readlink = generic_readlink, 2002 .readlink = generic_readlink,
1244 .follow_link = v9fs_vfs_follow_link, 2003 .follow_link = v9fs_vfs_follow_link,
@@ -1246,3 +2005,15 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
1246 .getattr = v9fs_vfs_getattr, 2005 .getattr = v9fs_vfs_getattr,
1247 .setattr = v9fs_vfs_setattr, 2006 .setattr = v9fs_vfs_setattr,
1248}; 2007};
2008
2009static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
2010 .readlink = generic_readlink,
2011 .follow_link = v9fs_vfs_follow_link,
2012 .put_link = v9fs_vfs_put_link,
2013 .getattr = v9fs_vfs_getattr_dotl,
2014 .setattr = v9fs_vfs_setattr_dotl,
2015 .setxattr = generic_setxattr,
2016 .getxattr = generic_getxattr,
2017 .removexattr = generic_removexattr,
2018 .listxattr = v9fs_listxattr,
2019};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a0..1d12ba0ed3db 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,14 +38,16 @@
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/statfs.h>
41#include <net/9p/9p.h> 42#include <net/9p/9p.h>
42#include <net/9p/client.h> 43#include <net/9p/client.h>
43 44
44#include "v9fs.h" 45#include "v9fs.h"
45#include "v9fs_vfs.h" 46#include "v9fs_vfs.h"
46#include "fid.h" 47#include "fid.h"
48#include "xattr.h"
47 49
48static const struct super_operations v9fs_super_ops; 50static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
49 51
50/** 52/**
51 * v9fs_set_super - set the superblock 53 * v9fs_set_super - set the superblock
@@ -76,7 +78,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
76 sb->s_blocksize_bits = fls(v9ses->maxdata - 1); 78 sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
77 sb->s_blocksize = 1 << sb->s_blocksize_bits; 79 sb->s_blocksize = 1 << sb->s_blocksize_bits;
78 sb->s_magic = V9FS_MAGIC; 80 sb->s_magic = V9FS_MAGIC;
79 sb->s_op = &v9fs_super_ops; 81 if (v9fs_proto_dotl(v9ses)) {
82 sb->s_op = &v9fs_super_ops_dotl;
83 sb->s_xattr = v9fs_xattr_handlers;
84 } else
85 sb->s_op = &v9fs_super_ops;
80 sb->s_bdi = &v9ses->bdi; 86 sb->s_bdi = &v9ses->bdi;
81 87
82 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 88 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -103,7 +109,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
103 struct inode *inode = NULL; 109 struct inode *inode = NULL;
104 struct dentry *root = NULL; 110 struct dentry *root = NULL;
105 struct v9fs_session_info *v9ses = NULL; 111 struct v9fs_session_info *v9ses = NULL;
106 struct p9_wstat *st = NULL;
107 int mode = S_IRWXUGO | S_ISVTX; 112 int mode = S_IRWXUGO | S_ISVTX;
108 struct p9_fid *fid; 113 struct p9_fid *fid;
109 int retval = 0; 114 int retval = 0;
@@ -117,19 +122,17 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
117 fid = v9fs_session_init(v9ses, dev_name, data); 122 fid = v9fs_session_init(v9ses, dev_name, data);
118 if (IS_ERR(fid)) { 123 if (IS_ERR(fid)) {
119 retval = PTR_ERR(fid); 124 retval = PTR_ERR(fid);
125 /*
126 * we need to call session_close to tear down some
 127 * of the data structures set up by session_init
128 */
120 goto close_session; 129 goto close_session;
121 } 130 }
122 131
123 st = p9_client_stat(fid);
124 if (IS_ERR(st)) {
125 retval = PTR_ERR(st);
126 goto clunk_fid;
127 }
128
129 sb = sget(fs_type, NULL, v9fs_set_super, v9ses); 132 sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
130 if (IS_ERR(sb)) { 133 if (IS_ERR(sb)) {
131 retval = PTR_ERR(sb); 134 retval = PTR_ERR(sb);
132 goto free_stat; 135 goto clunk_fid;
133 } 136 }
134 v9fs_fill_super(sb, v9ses, flags, data); 137 v9fs_fill_super(sb, v9ses, flags, data);
135 138
@@ -145,35 +148,53 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
145 retval = -ENOMEM; 148 retval = -ENOMEM;
146 goto release_sb; 149 goto release_sb;
147 } 150 }
148
149 sb->s_root = root; 151 sb->s_root = root;
150 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
151 152
152 v9fs_stat2inode(st, root->d_inode, sb); 153 if (v9fs_proto_dotl(v9ses)) {
154 struct p9_stat_dotl *st = NULL;
155 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
156 if (IS_ERR(st)) {
157 retval = PTR_ERR(st);
158 goto release_sb;
159 }
160
161 v9fs_stat2inode_dotl(st, root->d_inode);
162 kfree(st);
163 } else {
164 struct p9_wstat *st = NULL;
165 st = p9_client_stat(fid);
166 if (IS_ERR(st)) {
167 retval = PTR_ERR(st);
168 goto release_sb;
169 }
170
171 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
172 v9fs_stat2inode(st, root->d_inode, sb);
173
174 p9stat_free(st);
175 kfree(st);
176 }
153 177
154 v9fs_fid_add(root, fid); 178 v9fs_fid_add(root, fid);
155 p9stat_free(st);
156 kfree(st);
157 179
158P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 180 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
159 simple_set_mnt(mnt, sb); 181 simple_set_mnt(mnt, sb);
160 return 0; 182 return 0;
161 183
162free_stat:
163 p9stat_free(st);
164 kfree(st);
165
166clunk_fid: 184clunk_fid:
167 p9_client_clunk(fid); 185 p9_client_clunk(fid);
168
169close_session: 186close_session:
170 v9fs_session_close(v9ses); 187 v9fs_session_close(v9ses);
171 kfree(v9ses); 188 kfree(v9ses);
172 return retval; 189 return retval;
173
174release_sb: 190release_sb:
175 p9stat_free(st); 191 /*
176 kfree(st); 192 * we will do the session_close and root dentry release
 193 * in the call below. But we need to clunk the fid, because we haven't
 194 * attached it to a dentry, so it won't get clunked
195 * automatically.
196 */
197 p9_client_clunk(fid);
177 deactivate_locked_super(sb); 198 deactivate_locked_super(sb);
178 return retval; 199 return retval;
179} 200}
@@ -211,13 +232,60 @@ v9fs_umount_begin(struct super_block *sb)
211 v9fs_session_begin_cancel(v9ses); 232 v9fs_session_begin_cancel(v9ses);
212} 233}
213 234
235static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
236{
237 struct v9fs_session_info *v9ses;
238 struct p9_fid *fid;
239 struct p9_rstatfs rs;
240 int res;
241
242 fid = v9fs_fid_lookup(dentry);
243 if (IS_ERR(fid)) {
244 res = PTR_ERR(fid);
245 goto done;
246 }
247
248 v9ses = v9fs_inode2v9ses(dentry->d_inode);
249 if (v9fs_proto_dotl(v9ses)) {
250 res = p9_client_statfs(fid, &rs);
251 if (res == 0) {
252 buf->f_type = rs.type;
253 buf->f_bsize = rs.bsize;
254 buf->f_blocks = rs.blocks;
255 buf->f_bfree = rs.bfree;
256 buf->f_bavail = rs.bavail;
257 buf->f_files = rs.files;
258 buf->f_ffree = rs.ffree;
259 buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
260 buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
261 buf->f_namelen = rs.namelen;
262 }
263 if (res != -ENOSYS)
264 goto done;
265 }
266 res = simple_statfs(dentry, buf);
267done:
268 return res;
269}
270
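[Editorial note: v9fs_statfs() splits the server's 64-bit fsid across the two 32-bit words of f_fsid. The inverse, for reading it back, would be the usual recombination (a sketch, not code from this commit):

	/* recombine the two 32-bit halves into the server's 64-bit fsid */
	u64 fsid = ((u64)buf->f_fsid.val[1] << 32) | buf->f_fsid.val[0];
]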
214static const struct super_operations v9fs_super_ops = { 271static const struct super_operations v9fs_super_ops = {
215#ifdef CONFIG_9P_FSCACHE 272#ifdef CONFIG_9P_FSCACHE
216 .alloc_inode = v9fs_alloc_inode, 273 .alloc_inode = v9fs_alloc_inode,
217 .destroy_inode = v9fs_destroy_inode, 274 .destroy_inode = v9fs_destroy_inode,
218#endif 275#endif
219 .statfs = simple_statfs, 276 .statfs = simple_statfs,
220 .clear_inode = v9fs_clear_inode, 277 .evict_inode = v9fs_evict_inode,
278 .show_options = generic_show_options,
279 .umount_begin = v9fs_umount_begin,
280};
281
282static const struct super_operations v9fs_super_ops_dotl = {
283#ifdef CONFIG_9P_FSCACHE
284 .alloc_inode = v9fs_alloc_inode,
285 .destroy_inode = v9fs_destroy_inode,
286#endif
287 .statfs = v9fs_statfs,
288 .evict_inode = v9fs_evict_inode,
221 .show_options = generic_show_options, 289 .show_options = generic_show_options,
222 .umount_begin = v9fs_umount_begin, 290 .umount_begin = v9fs_umount_begin,
223}; 291};
@@ -227,4 +295,5 @@ struct file_system_type v9fs_fs_type = {
227 .get_sb = v9fs_get_sb, 295 .get_sb = v9fs_get_sb,
228 .kill_sb = v9fs_kill_super, 296 .kill_sb = v9fs_kill_super,
229 .owner = THIS_MODULE, 297 .owner = THIS_MODULE,
298 .fs_flags = FS_RENAME_DOES_D_MOVE,
230}; 299};
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
new file mode 100644
index 000000000000..f88e5c2dc873
--- /dev/null
+++ b/fs/9p/xattr.c
@@ -0,0 +1,160 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/fs.h>
17#include <linux/sched.h>
18#include <net/9p/9p.h>
19#include <net/9p/client.h>
20
21#include "fid.h"
22#include "xattr.h"
23
24/*
25 * v9fs_xattr_get()
26 *
27 * Copy an extended attribute into the buffer
28 * provided, or compute the buffer size required.
 29 * Pass a NULL buffer to query the required size.
30 *
31 * Returns a negative error number on failure, or the number of bytes
32 * used / required on success.
33 */
34ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t buffer_size)
36{
37 ssize_t retval;
38 int msize, read_count;
39 u64 offset = 0, attr_size;
40 struct p9_fid *fid, *attr_fid;
41
42 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
43 __func__, name, buffer_size);
44
45 fid = v9fs_fid_lookup(dentry);
46 if (IS_ERR(fid))
47 return PTR_ERR(fid);
48
49 attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
50 if (IS_ERR(attr_fid)) {
51 retval = PTR_ERR(attr_fid);
52 P9_DPRINTK(P9_DEBUG_VFS,
53 "p9_client_attrwalk failed %zd\n", retval);
54 attr_fid = NULL;
55 goto error;
56 }
57 if (!buffer_size) {
58 /* request to get the attr_size */
59 retval = attr_size;
60 goto error;
61 }
62 if (attr_size > buffer_size) {
63 retval = -ERANGE;
64 goto error;
65 }
66 msize = attr_fid->clnt->msize;
67 while (attr_size) {
68 if (attr_size > (msize - P9_IOHDRSZ))
69 read_count = msize - P9_IOHDRSZ;
70 else
71 read_count = attr_size;
72 read_count = p9_client_read(attr_fid, ((char *)buffer)+offset,
73 NULL, offset, read_count);
74 if (read_count < 0) {
75 /* error in xattr read */
76 retval = read_count;
77 goto error;
78 }
79 offset += read_count;
80 attr_size -= read_count;
81 }
82 /* Total read xattr bytes */
83 retval = offset;
84error:
85 if (attr_fid)
86 p9_client_clunk(attr_fid);
87 return retval;
88
89}
90
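[Editorial note: the NULL-buffer convention above enables the usual two-call pattern: query the size, allocate, then fetch. A hedged sketch ("user.mime_type" is a hypothetical attribute name); note that both this read loop and the write loop in v9fs_xattr_set() below cap each request at msize - P9_IOHDRSZ, the payload limit of one 9P message:

	/* hypothetical caller; error handling abbreviated */
	ssize_t size = v9fs_xattr_get(dentry, "user.mime_type", NULL, 0);
	if (size > 0) {
		void *buf = kmalloc(size, GFP_KERNEL);
		if (buf) {
			size = v9fs_xattr_get(dentry, "user.mime_type",
					      buf, size);
			kfree(buf);
		}
	}
]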
91/*
92 * v9fs_xattr_set()
93 *
94 * Create, replace or remove an extended attribute for this inode. Buffer
95 * is NULL to remove an existing extended attribute, and non-NULL to
96 * either replace an existing extended attribute, or create a new extended
97 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
98 * specify that an extended attribute must exist and must not exist
 99 * prior to the call, respectively.
100 *
101 * Returns 0, or a negative error number on failure.
102 */
103int v9fs_xattr_set(struct dentry *dentry, const char *name,
104 const void *value, size_t value_len, int flags)
105{
106 u64 offset = 0;
107 int retval, msize, write_count;
108 struct p9_fid *fid = NULL;
109
110 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
111 __func__, name, value_len, flags);
112
113 fid = v9fs_fid_clone(dentry);
114 if (IS_ERR(fid)) {
115 retval = PTR_ERR(fid);
116 fid = NULL;
117 goto error;
118 }
119 /*
120 * On success fid points to xattr
121 */
122 retval = p9_client_xattrcreate(fid, name, value_len, flags);
123 if (retval < 0) {
124 P9_DPRINTK(P9_DEBUG_VFS,
125 "p9_client_xattrcreate failed %d\n", retval);
126 goto error;
127 }
 128 msize = fid->clnt->msize;
129 while (value_len) {
130 if (value_len > (msize - P9_IOHDRSZ))
131 write_count = msize - P9_IOHDRSZ;
132 else
133 write_count = value_len;
134 write_count = p9_client_write(fid, ((char *)value)+offset,
135 NULL, offset, write_count);
136 if (write_count < 0) {
137 /* error in xattr write */
138 retval = write_count;
139 goto error;
140 }
141 offset += write_count;
142 value_len -= write_count;
143 }
 144 /* success: the xattr interface reports 0, not bytes written */
 145 retval = 0;
 146error:
 147 if (fid)
 148 p9_client_clunk(fid); /* must not clobber retval with clunk's result */
149 return retval;
150}
151
152ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
153{
154 return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
155}
156
157const struct xattr_handler *v9fs_xattr_handlers[] = {
158 &v9fs_xattr_user_handler,
159 NULL
160};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
new file mode 100644
index 000000000000..9ddf672ae5c4
--- /dev/null
+++ b/fs/9p/xattr.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14#ifndef FS_9P_XATTR_H
15#define FS_9P_XATTR_H
16
17#include <linux/xattr.h>
18
19extern const struct xattr_handler *v9fs_xattr_handlers[];
20extern struct xattr_handler v9fs_xattr_user_handler;
21
22extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
23 void *, size_t);
24extern int v9fs_xattr_set(struct dentry *, const char *,
25 const void *, size_t, int);
26extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
27#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
new file mode 100644
index 000000000000..d0b701b72080
--- /dev/null
+++ b/fs/9p/xattr_user.c
@@ -0,0 +1,80 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/slab.h>
20#include "xattr.h"
21
22static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
23 void *buffer, size_t size, int type)
24{
25 int retval;
26 char *full_name;
27 size_t name_len;
28 size_t prefix_len = XATTR_USER_PREFIX_LEN;
29
30 if (name == NULL)
31 return -EINVAL;
32
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35
36 name_len = strlen(name);
 37 full_name = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
38 if (!full_name)
39 return -ENOMEM;
40 memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
41 memcpy(full_name+prefix_len, name, name_len);
42 full_name[prefix_len + name_len] = '\0';
43
44 retval = v9fs_xattr_get(dentry, full_name, buffer, size);
45 kfree(full_name);
46 return retval;
47}
48
49static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
50 const void *value, size_t size, int flags, int type)
51{
52 int retval;
53 char *full_name;
54 size_t name_len;
55 size_t prefix_len = XATTR_USER_PREFIX_LEN;
56
57 if (name == NULL)
58 return -EINVAL;
59
60 if (strcmp(name, "") == 0)
61 return -EINVAL;
62
63 name_len = strlen(name);
 64 full_name = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
65 if (!full_name)
66 return -ENOMEM;
67 memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
68 memcpy(full_name + prefix_len, name, name_len);
69 full_name[prefix_len + name_len] = '\0';
70
71 retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
72 kfree(full_name);
73 return retval;
74}
75
76struct xattr_handler v9fs_xattr_user_handler = {
77 .prefix = XATTR_USER_PREFIX,
78 .get = v9fs_xattr_user_get,
79 .set = v9fs_xattr_user_set,
80};
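[Editorial note: how the handler table connects: the VFS matches the "user." prefix of v9fs_xattr_user_handler, strips it, and invokes .get/.set with only the remainder; the handler then rebuilds the full name before calling v9fs_xattr_get()/v9fs_xattr_set(), so the server always sees the prefixed form. A sketch of the rebuild step (equivalent to the kmalloc/memcpy sequence above; "foo" is a hypothetical name):

	/* shorter, equivalent form of the name rebuild done in the handlers */
	char *full_name = kasprintf(GFP_KERNEL, "%s%s",
				    XATTR_USER_PREFIX, "foo");
	if (!full_name)
		return -ENOMEM;
	/* full_name == "user.foo", the name sent over the wire */
]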
diff --git a/fs/Kconfig b/fs/Kconfig
index 5f85b5947613..3d185308ec88 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -64,7 +64,7 @@ source "fs/autofs4/Kconfig"
64source "fs/fuse/Kconfig" 64source "fs/fuse/Kconfig"
65 65
66config CUSE 66config CUSE
67 tristate "Character device in Userpace support" 67 tristate "Character device in Userspace support"
68 depends on FUSE_FS 68 depends on FUSE_FS
69 help 69 help
70 This FUSE extension allows character devices to be 70 This FUSE extension allows character devices to be
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba2..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o \
13 pnode.o drop_caches.o splice.o sync.o utimes.o \ 13 pnode.o drop_caches.o splice.o sync.o utimes.o \
14 stack.o fs_struct.o 14 stack.o fs_struct.o statfs.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o 17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
197 .read = generic_read_dir, 197 .read = generic_read_dir,
198 .llseek = generic_file_llseek, 198 .llseek = generic_file_llseek,
199 .readdir = adfs_readdir, 199 .readdir = adfs_readdir,
200 .fsync = simple_fsync, 200 .fsync = generic_file_fsync,
201}; 201};
202 202
203static int 203static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
26 .read = do_sync_read, 26 .read = do_sync_read,
27 .aio_read = generic_file_aio_read, 27 .aio_read = generic_file_aio_read,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .write = do_sync_write, 30 .write = do_sync_write,
31 .aio_write = generic_file_aio_write, 31 .aio_write = generic_file_aio_write,
32 .splice_read = generic_file_splice_read, 32 .splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..65794b8fe79e 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -50,10 +50,19 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
50 loff_t pos, unsigned len, unsigned flags, 50 loff_t pos, unsigned len, unsigned flags,
51 struct page **pagep, void **fsdata) 51 struct page **pagep, void **fsdata)
52{ 52{
53 int ret;
54
53 *pagep = NULL; 55 *pagep = NULL;
54 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 56 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
55 adfs_get_block, 57 adfs_get_block,
56 &ADFS_I(mapping->host)->mmu_private); 58 &ADFS_I(mapping->host)->mmu_private);
59 if (unlikely(ret)) {
60 loff_t isize = mapping->host->i_size;
61 if (pos + len > isize)
62 vmtruncate(mapping->host, isize);
63 }
64
65 return ret;
57} 66}
58 67
59static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) 68static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
@@ -322,11 +331,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
322 if (error) 331 if (error)
323 goto out; 332 goto out;
324 333
334 /* XXX: this is missing some actual on-disk truncation.. */
325 if (ia_valid & ATTR_SIZE) 335 if (ia_valid & ATTR_SIZE)
326 error = vmtruncate(inode, attr->ia_size); 336 truncate_setsize(inode, attr->ia_size);
327
328 if (error)
329 goto out;
330 337
331 if (ia_valid & ATTR_MTIME) { 338 if (ia_valid & ATTR_MTIME) {
332 inode->i_mtime = attr->ia_mtime; 339 inode->i_mtime = attr->ia_mtime;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..a8cbdeb34025 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -171,8 +171,7 @@ extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry,
171extern unsigned long affs_parent_ino(struct inode *dir); 171extern unsigned long affs_parent_ino(struct inode *dir);
172extern struct inode *affs_new_inode(struct inode *dir); 172extern struct inode *affs_new_inode(struct inode *dir);
173extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); 173extern int affs_notify_change(struct dentry *dentry, struct iattr *attr);
174extern void affs_delete_inode(struct inode *inode); 174extern void affs_evict_inode(struct inode *inode);
175extern void affs_clear_inode(struct inode *inode);
176extern struct inode *affs_iget(struct super_block *sb, 175extern struct inode *affs_iget(struct super_block *sb,
177 unsigned long ino); 176 unsigned long ino);
178extern int affs_write_inode(struct inode *inode, 177extern int affs_write_inode(struct inode *inode,
@@ -183,7 +182,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
183 182
184void affs_free_prealloc(struct inode *inode); 183void affs_free_prealloc(struct inode *inode);
185extern void affs_truncate(struct inode *); 184extern void affs_truncate(struct inode *);
186int affs_file_fsync(struct file *, struct dentry *, int); 185int affs_file_fsync(struct file *, int);
187 186
188/* dir.c */ 187/* dir.c */
189 188
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..c4a9875bd1a6 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,10 +406,19 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
406 loff_t pos, unsigned len, unsigned flags, 406 loff_t pos, unsigned len, unsigned flags,
407 struct page **pagep, void **fsdata) 407 struct page **pagep, void **fsdata)
408{ 408{
409 int ret;
410
409 *pagep = NULL; 411 *pagep = NULL;
410 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 412 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
411 affs_get_block, 413 affs_get_block,
412 &AFFS_I(mapping->host)->mmu_private); 414 &AFFS_I(mapping->host)->mmu_private);
415 if (unlikely(ret)) {
416 loff_t isize = mapping->host->i_size;
417 if (pos + len > isize)
418 vmtruncate(mapping->host, isize);
419 }
420
421 return ret;
413} 422}
414 423
415static sector_t _affs_bmap(struct address_space *mapping, sector_t block) 424static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
@@ -916,9 +925,9 @@ affs_truncate(struct inode *inode)
916 affs_free_prealloc(inode); 925 affs_free_prealloc(inode);
917} 926}
918 927
919int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 928int affs_file_fsync(struct file *filp, int datasync)
920{ 929{
921 struct inode * inode = dentry->d_inode; 930 struct inode *inode = filp->f_mapping->host;
922 int ret, err; 931 int ret, err;
923 932
924 ret = write_inode_now(inode, 0); 933 ret = write_inode_now(inode, 0);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index f4b2a4ee4f91..3a0fdec175ba 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -235,31 +235,36 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
235 goto out; 235 goto out;
236 } 236 }
237 237
238 error = inode_setattr(inode, attr); 238 if ((attr->ia_valid & ATTR_SIZE) &&
239 if (!error && (attr->ia_valid & ATTR_MODE)) 239 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size);
241 if (error)
242 return error;
243 }
244
245 setattr_copy(inode, attr);
246 mark_inode_dirty(inode);
247
248 if (attr->ia_valid & ATTR_MODE)
240 mode_to_prot(inode); 249 mode_to_prot(inode);
241out: 250out:
242 return error; 251 return error;
243} 252}
244 253
245void 254void
246affs_delete_inode(struct inode *inode) 255affs_evict_inode(struct inode *inode)
247{
248 pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
249 truncate_inode_pages(&inode->i_data, 0);
250 inode->i_size = 0;
251 affs_truncate(inode);
252 clear_inode(inode);
253 affs_free_block(inode->i_sb, inode->i_ino);
254}
255
256void
257affs_clear_inode(struct inode *inode)
258{ 256{
259 unsigned long cache_page; 257 unsigned long cache_page;
258 pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
259 truncate_inode_pages(&inode->i_data, 0);
260 260
261 pr_debug("AFFS: clear_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); 261 if (!inode->i_nlink) {
262 inode->i_size = 0;
263 affs_truncate(inode);
264 }
262 265
266 invalidate_inode_buffers(inode);
267 end_writeback(inode);
263 affs_free_prealloc(inode); 268 affs_free_prealloc(inode);
264 cache_page = (unsigned long)AFFS_I(inode)->i_lc; 269 cache_page = (unsigned long)AFFS_I(inode)->i_lc;
265 if (cache_page) { 270 if (cache_page) {
@@ -271,6 +276,9 @@ affs_clear_inode(struct inode *inode)
271 affs_brelse(AFFS_I(inode)->i_ext_bh); 276 affs_brelse(AFFS_I(inode)->i_ext_bh);
272 AFFS_I(inode)->i_ext_last = ~1; 277 AFFS_I(inode)->i_ext_last = ~1;
273 AFFS_I(inode)->i_ext_bh = NULL; 278 AFFS_I(inode)->i_ext_bh = NULL;
279
280 if (!inode->i_nlink)
281 affs_free_block(inode->i_sb, inode->i_ino);
274} 282}
275 283
276struct inode * 284struct inode *
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
224 affs_brelse(bh); 224 affs_brelse(bh);
225 inode = affs_iget(sb, ino); 225 inode = affs_iget(sb, ino);
226 if (IS_ERR(inode)) 226 if (IS_ERR(inode))
227 return ERR_PTR(PTR_ERR(inode)); 227 return ERR_CAST(inode);
228 } 228 }
229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations; 229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
230 d_add(dentry, inode); 230 d_add(dentry, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 16a3e4765f68..33c4e7eef470 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -26,7 +26,7 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
26static int affs_remount (struct super_block *sb, int *flags, char *data); 26static int affs_remount (struct super_block *sb, int *flags, char *data);
27 27
28static void 28static void
29affs_commit_super(struct super_block *sb, int clean) 29affs_commit_super(struct super_block *sb, int wait, int clean)
30{ 30{
31 struct affs_sb_info *sbi = AFFS_SB(sb); 31 struct affs_sb_info *sbi = AFFS_SB(sb);
32 struct buffer_head *bh = sbi->s_root_bh; 32 struct buffer_head *bh = sbi->s_root_bh;
@@ -36,6 +36,8 @@ affs_commit_super(struct super_block *sb, int clean)
36 secs_to_datestamp(get_seconds(), &tail->disk_change); 36 secs_to_datestamp(get_seconds(), &tail->disk_change);
37 affs_fix_checksum(sb, bh); 37 affs_fix_checksum(sb, bh);
38 mark_buffer_dirty(bh); 38 mark_buffer_dirty(bh);
39 if (wait)
40 sync_dirty_buffer(bh);
39} 41}
40 42
41static void 43static void
@@ -46,8 +48,8 @@ affs_put_super(struct super_block *sb)
46 48
47 lock_kernel(); 49 lock_kernel();
48 50
49 if (!(sb->s_flags & MS_RDONLY)) 51 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
50 affs_commit_super(sb, 1); 52 affs_commit_super(sb, 1, 1);
51 53
52 kfree(sbi->s_prefix); 54 kfree(sbi->s_prefix);
53 affs_free_bitmap(sb); 55 affs_free_bitmap(sb);
@@ -61,27 +63,20 @@ affs_put_super(struct super_block *sb)
61static void 63static void
62affs_write_super(struct super_block *sb) 64affs_write_super(struct super_block *sb)
63{ 65{
64 int clean = 2;
65
66 lock_super(sb); 66 lock_super(sb);
67 if (!(sb->s_flags & MS_RDONLY)) { 67 if (!(sb->s_flags & MS_RDONLY))
68 // if (sbi->s_bitmap[i].bm_bh) { 68 affs_commit_super(sb, 1, 2);
69 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { 69 sb->s_dirt = 0;
70 // clean = 0;
71 affs_commit_super(sb, clean);
72 sb->s_dirt = !clean; /* redo until bitmap synced */
73 } else
74 sb->s_dirt = 0;
75 unlock_super(sb); 70 unlock_super(sb);
76 71
77 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); 72 pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds());
78} 73}
79 74
80static int 75static int
81affs_sync_fs(struct super_block *sb, int wait) 76affs_sync_fs(struct super_block *sb, int wait)
82{ 77{
83 lock_super(sb); 78 lock_super(sb);
84 affs_commit_super(sb, 2); 79 affs_commit_super(sb, wait, 2);
85 sb->s_dirt = 0; 80 sb->s_dirt = 0;
86 unlock_super(sb); 81 unlock_super(sb);
87 return 0; 82 return 0;
@@ -140,8 +135,7 @@ static const struct super_operations affs_sops = {
140 .alloc_inode = affs_alloc_inode, 135 .alloc_inode = affs_alloc_inode,
141 .destroy_inode = affs_destroy_inode, 136 .destroy_inode = affs_destroy_inode,
142 .write_inode = affs_write_inode, 137 .write_inode = affs_write_inode,
143 .delete_inode = affs_delete_inode, 138 .evict_inode = affs_evict_inode,
144 .clear_inode = affs_clear_inode,
145 .put_super = affs_put_super, 139 .put_super = affs_put_super,
146 .write_super = affs_write_super, 140 .write_super = affs_write_super,
147 .sync_fs = affs_sync_fs, 141 .sync_fs = affs_sync_fs,
@@ -554,9 +548,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
554 return 0; 548 return 0;
555 } 549 }
556 if (*flags & MS_RDONLY) { 550 if (*flags & MS_RDONLY) {
557 sb->s_dirt = 1; 551 affs_write_super(sb);
558 while (sb->s_dirt)
559 affs_write_super(sb);
560 affs_free_bitmap(sb); 552 affs_free_bitmap(sb);
561 } else 553 } else
562 res = affs_init_bitmap(sb, flags); 554 res = affs_init_bitmap(sb, flags);
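
The new wait argument threads the ->sync_fs() wait semantics down to the root-block write. A condensed sketch of how affs_commit_super() behaves after this change (timestamping and checksumming elided):

    static void example_commit_super(struct super_block *sb, int wait)
    {
            struct buffer_head *bh = AFFS_SB(sb)->s_root_bh;

            /* ... stamp disk_change and fix the checksum, as above ... */
            mark_buffer_dirty(bh);
            if (wait)
                    sync_dirty_buffer(bh);  /* block until the write completes */
    }
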
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 5c4e61d3c772..8f975f25b486 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -2,6 +2,7 @@ config AFS_FS
2 tristate "Andrew File System support (AFS) (EXPERIMENTAL)" 2 tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select AF_RXRPC 4 select AF_RXRPC
5 select DNS_RESOLVER
5 help 6 help
6 If you say Y here, you will get an experimental Andrew File System 7 If you say Y here, you will get an experimental Andrew File System
7 driver. It currently only supports unsecured read-only AFS access. 8 driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index e19c13f059ed..0d5eeadf6121 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/key.h> 14#include <linux/key.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/dns_resolver.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <keys/rxrpc-type.h> 18#include <keys/rxrpc-type.h>
18#include "internal.h" 19#include "internal.h"
@@ -30,21 +31,24 @@ static struct afs_cell *afs_cell_root;
30 * allocate a cell record and fill in its name, VL server address list and 31 * allocate a cell record and fill in its name, VL server address list and
31 * allocate an anonymous key 32 * allocate an anonymous key
32 */ 33 */
33static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) 34static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
35 char *vllist)
34{ 36{
35 struct afs_cell *cell; 37 struct afs_cell *cell;
36 struct key *key; 38 struct key *key;
37 size_t namelen;
38 char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; 39 char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
40 char *dvllist = NULL, *_vllist = NULL;
41 char delimiter = ':';
39 int ret; 42 int ret;
40 43
41 _enter("%s,%s", name, vllist); 44 _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
42 45
43 BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ 46 BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
44 47
45 namelen = strlen(name); 48 if (namelen > AFS_MAXCELLNAME) {
46 if (namelen > AFS_MAXCELLNAME) 49 _leave(" = -ENAMETOOLONG");
47 return ERR_PTR(-ENAMETOOLONG); 50 return ERR_PTR(-ENAMETOOLONG);
51 }
48 52
49 /* allocate and initialise a cell record */ 53 /* allocate and initialise a cell record */
50 cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); 54 cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
@@ -64,15 +68,35 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
64 INIT_LIST_HEAD(&cell->vl_list); 68 INIT_LIST_HEAD(&cell->vl_list);
65 spin_lock_init(&cell->vl_lock); 69 spin_lock_init(&cell->vl_lock);
66 70
71 /* if the ip address is invalid, try dns query */
72 if (!vllist || strlen(vllist) < 7) {
73 ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
74 if (ret < 0) {
75 if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
76 /* translate these errors into something
77 * userspace might understand */
78 ret = -EDESTADDRREQ;
79 _leave(" = %d", ret);
80 return ERR_PTR(ret);
81 }
82 _vllist = dvllist;
83
84 /* change the delimiter for user-space reply */
85 delimiter = ',';
86
87 } else {
88 _vllist = vllist;
89 }
90
67 /* fill in the VL server list from the rest of the string */ 91 /* fill in the VL server list from the rest of the string */
68 do { 92 do {
69 unsigned a, b, c, d; 93 unsigned a, b, c, d;
70 94
71 next = strchr(vllist, ':'); 95 next = strchr(_vllist, delimiter);
72 if (next) 96 if (next)
73 *next++ = 0; 97 *next++ = 0;
74 98
75 if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) 99 if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
76 goto bad_address; 100 goto bad_address;
77 101
78 if (a > 255 || b > 255 || c > 255 || d > 255) 102 if (a > 255 || b > 255 || c > 255 || d > 255)
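
dns_query() is the upcall into the userspace DNS resolver; here it fetches the cell's AFSDB records whenever no usable VL address list was supplied. A minimal sketch of the call pattern (example_lookup_vl() is hypothetical; the result is a kmalloc'd, comma-separated address list the caller must kfree(), hence the delimiter switch above):

    #include <linux/dns_resolver.h>

    static int example_lookup_vl(const char *name, size_t namelen)
    {
            char *result = NULL;
            int ret;

            ret = dns_query("afsdb", name, namelen, "ipv4", &result, NULL);
            if (ret < 0)
                    return ret;  /* -ENODATA/-EAGAIN/-ENOKEY become -EDESTADDRREQ above */

            /* parse "a.b.c.d,a.b.c.d,..." as in the loop above, then: */
            kfree(result);
            return 0;
    }
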
@@ -81,7 +105,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
81 cell->vl_addrs[cell->vl_naddrs++].s_addr = 105 cell->vl_addrs[cell->vl_naddrs++].s_addr =
82 htonl((a << 24) | (b << 16) | (c << 8) | d); 106 htonl((a << 24) | (b << 16) | (c << 8) | d);
83 107
84 } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next)); 108 } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next));
85 109
86 /* create a key to represent an anonymous user */ 110 /* create a key to represent an anonymous user */
87 memcpy(keyname, "afs@", 4); 111 memcpy(keyname, "afs@", 4);
@@ -110,32 +134,36 @@ bad_address:
110 ret = -EINVAL; 134 ret = -EINVAL;
111error: 135error:
112 key_put(cell->anonymous_key); 136 key_put(cell->anonymous_key);
137 kfree(dvllist);
113 kfree(cell); 138 kfree(cell);
114 _leave(" = %d", ret); 139 _leave(" = %d", ret);
115 return ERR_PTR(ret); 140 return ERR_PTR(ret);
116} 141}
117 142
118/* 143/*
119 * create a cell record 144 * afs_cell_create() - create a cell record
120 * - "name" is the name of the cell 145 * @name: is the name of the cell.
121 * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format 146 * @namesz: is the strlen of the cell name.
147 * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format.
148 * @retref: is T to return the cell reference when the cell exists.
122 */ 149 */
123struct afs_cell *afs_cell_create(const char *name, char *vllist) 150struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
151 char *vllist, bool retref)
124{ 152{
125 struct afs_cell *cell; 153 struct afs_cell *cell;
126 int ret; 154 int ret;
127 155
128 _enter("%s,%s", name, vllist); 156 _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
129 157
130 down_write(&afs_cells_sem); 158 down_write(&afs_cells_sem);
131 read_lock(&afs_cells_lock); 159 read_lock(&afs_cells_lock);
132 list_for_each_entry(cell, &afs_cells, link) { 160 list_for_each_entry(cell, &afs_cells, link) {
133 if (strcasecmp(cell->name, name) == 0) 161 if (strncasecmp(cell->name, name, namesz) == 0)
134 goto duplicate_name; 162 goto duplicate_name;
135 } 163 }
136 read_unlock(&afs_cells_lock); 164 read_unlock(&afs_cells_lock);
137 165
138 cell = afs_cell_alloc(name, vllist); 166 cell = afs_cell_alloc(name, namesz, vllist);
139 if (IS_ERR(cell)) { 167 if (IS_ERR(cell)) {
140 _leave(" = %ld", PTR_ERR(cell)); 168 _leave(" = %ld", PTR_ERR(cell));
141 up_write(&afs_cells_sem); 169 up_write(&afs_cells_sem);
@@ -175,8 +203,18 @@ error:
175 return ERR_PTR(ret); 203 return ERR_PTR(ret);
176 204
177duplicate_name: 205duplicate_name:
206 if (retref && !IS_ERR(cell))
207 afs_get_cell(cell);
208
178 read_unlock(&afs_cells_lock); 209 read_unlock(&afs_cells_lock);
179 up_write(&afs_cells_sem); 210 up_write(&afs_cells_sem);
211
212 if (retref) {
213 _leave(" = %p", cell);
214 return cell;
215 }
216
217 _leave(" = -EEXIST");
180 return ERR_PTR(-EEXIST); 218 return ERR_PTR(-EEXIST);
181} 219}
182 220
@@ -201,15 +239,13 @@ int afs_cell_init(char *rootcell)
201 } 239 }
202 240
203 cp = strchr(rootcell, ':'); 241 cp = strchr(rootcell, ':');
204 if (!cp) { 242 if (!cp)
205 printk(KERN_ERR "kAFS: no VL server IP addresses specified\n"); 243 _debug("kAFS: no VL server IP addresses specified");
206 _leave(" = -EINVAL"); 244 else
207 return -EINVAL; 245 *cp++ = 0;
208 }
209 246
210 /* allocate a cell record for the root cell */ 247 /* allocate a cell record for the root cell */
211 *cp++ = 0; 248 new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
212 new_root = afs_cell_create(rootcell, cp);
213 if (IS_ERR(new_root)) { 249 if (IS_ERR(new_root)) {
214 _leave(" = %ld", PTR_ERR(new_root)); 250 _leave(" = %ld", PTR_ERR(new_root));
215 return PTR_ERR(new_root); 251 return PTR_ERR(new_root);
@@ -229,11 +265,12 @@ int afs_cell_init(char *rootcell)
229/* 265/*
230 * lookup a cell record 266 * lookup a cell record
231 */ 267 */
232struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz) 268struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
269 bool dns_cell)
233{ 270{
234 struct afs_cell *cell; 271 struct afs_cell *cell;
235 272
236 _enter("\"%*.*s\",", namesz, namesz, name ? name : ""); 273 _enter("\"%*.*s\",", namesz, namesz, name ?: "");
237 274
238 down_read(&afs_cells_sem); 275 down_read(&afs_cells_sem);
239 read_lock(&afs_cells_lock); 276 read_lock(&afs_cells_lock);
@@ -247,6 +284,8 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
247 } 284 }
248 } 285 }
249 cell = ERR_PTR(-ENOENT); 286 cell = ERR_PTR(-ENOENT);
287 if (dns_cell)
288 goto create_cell;
250 found: 289 found:
251 ; 290 ;
252 } else { 291 } else {
@@ -269,6 +308,15 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
269 up_read(&afs_cells_sem); 308 up_read(&afs_cells_sem);
270 _leave(" = %p", cell); 309 _leave(" = %p", cell);
271 return cell; 310 return cell;
311
312create_cell:
313 read_unlock(&afs_cells_lock);
314 up_read(&afs_cells_sem);
315
316 cell = afs_cell_create(name, namesz, NULL, true);
317
318 _leave(" = %p", cell);
319 return cell;
272} 320}
273 321
274#if 0 322#if 0
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index adc1cb771b57..0d38c09bd55e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
189 struct key *key) 189 struct key *key)
190{ 190{
191 struct page *page; 191 struct page *page;
192 struct file file = {
193 .private_data = key,
194 };
195
196 _enter("{%lu},%lu", dir->i_ino, index); 192 _enter("{%lu},%lu", dir->i_ino, index);
197 193
198 page = read_mapping_page(dir->i_mapping, index, &file); 194 page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
199 if (!IS_ERR(page)) { 195 if (!IS_ERR(page)) {
200 kmap(page); 196 kmap(page);
201 if (!PageChecked(page)) 197 if (!PageChecked(page))
@@ -481,6 +477,40 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
481} 477}
482 478
483/* 479/*
480 * Try to auto-mount the mountpoint with a pseudo directory if the autocell
481 * operation is set.
482 */
483static struct inode *afs_try_auto_mntpt(
484 int ret, struct dentry *dentry, struct inode *dir, struct key *key,
485 struct afs_fid *fid)
486{
487 const char *devname = dentry->d_name.name;
488 struct afs_vnode *vnode = AFS_FS_I(dir);
489 struct inode *inode;
490
491 _enter("%d, %p{%s}, {%x:%u}, %p",
492 ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key);
493
494 if (ret != -ENOENT ||
495 !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
496 goto out;
497
498 inode = afs_iget_autocell(dir, devname, strlen(devname), key);
499 if (IS_ERR(inode)) {
500 ret = PTR_ERR(inode);
501 goto out;
502 }
503
504 *fid = AFS_FS_I(inode)->fid;
505 _leave("= %p", inode);
506 return inode;
507
508out:
509 _leave("= %d", ret);
510 return ERR_PTR(ret);
511}
512
513/*
484 * look up an entry in a directory 514 * look up an entry in a directory
485 */ 515 */
486static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, 516static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -524,6 +554,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
524 554
525 ret = afs_do_lookup(dir, dentry, &fid, key); 555 ret = afs_do_lookup(dir, dentry, &fid, key);
526 if (ret < 0) { 556 if (ret < 0) {
557 inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid);
558 if (!IS_ERR(inode)) {
559 key_put(key);
560 goto success;
561 }
562
563 ret = PTR_ERR(inode);
527 key_put(key); 564 key_put(key);
528 if (ret == -ENOENT) { 565 if (ret == -ENOENT) {
529 d_add(dentry, NULL); 566 d_add(dentry, NULL);
@@ -543,6 +580,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
543 return ERR_CAST(inode); 580 return ERR_CAST(inode);
544 } 581 }
545 582
583success:
546 dentry->d_op = &afs_fs_dentry_operations; 584 dentry->d_op = &afs_fs_dentry_operations;
547 585
548 d_add(dentry, inode); 586 d_add(dentry, inode);
@@ -700,8 +738,9 @@ static int afs_d_delete(struct dentry *dentry)
700 goto zap; 738 goto zap;
701 739
702 if (dentry->d_inode && 740 if (dentry->d_inode &&
703 test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags)) 741 (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) ||
704 goto zap; 742 test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags)))
743 goto zap;
705 744
706 _leave(" = 0 [keep]"); 745 _leave(" = 0 [keep]");
707 return 0; 746 return 0;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0df9bc2b724d..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
121#endif 121#endif
122 122
123/* 123/*
124 * AFS read page from file, directory or symlink 124 * read page from file, directory or symlink, given a key to use
125 */ 125 */
126static int afs_readpage(struct file *file, struct page *page) 126int afs_page_filler(void *data, struct page *page)
127{ 127{
128 struct afs_vnode *vnode; 128 struct inode *inode = page->mapping->host;
129 struct inode *inode; 129 struct afs_vnode *vnode = AFS_FS_I(inode);
130 struct key *key; 130 struct key *key = data;
131 size_t len; 131 size_t len;
132 off_t offset; 132 off_t offset;
133 int ret; 133 int ret;
134 134
135 inode = page->mapping->host;
136
137 if (file) {
138 key = file->private_data;
139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
147
148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 135 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
149 136
150 vnode = AFS_FS_I(inode);
151
152 BUG_ON(!PageLocked(page)); 137 BUG_ON(!PageLocked(page));
153 138
154 ret = -ESTALE; 139 ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
214 unlock_page(page); 199 unlock_page(page);
215 } 200 }
216 201
217 if (!file)
218 key_put(key);
219 _leave(" = 0"); 202 _leave(" = 0");
220 return 0; 203 return 0;
221 204
222error: 205error:
223 SetPageError(page); 206 SetPageError(page);
224 unlock_page(page); 207 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
228 _leave(" = %d", ret); 208 _leave(" = %d", ret);
229 return ret; 209 return ret;
230} 210}
231 211
232/* 212/*
213 * read page from file, directory or symlink, given a file to nominate the key
214 * to be used
215 */
216static int afs_readpage(struct file *file, struct page *page)
217{
218 struct key *key;
219 int ret;
220
221 if (file) {
222 key = file->private_data;
223 ASSERT(key != NULL);
224 ret = afs_page_filler(key, page);
225 } else {
226 struct inode *inode = page->mapping->host;
227 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
228 if (IS_ERR(key)) {
229 ret = PTR_ERR(key);
230 } else {
231 ret = afs_page_filler(key, page);
232 key_put(key);
233 }
234 }
235 return ret;
236}
237
238/*
233 * read a set of pages 239 * read a set of pages
234 */ 240 */
235static int afs_readpages(struct file *file, struct address_space *mapping, 241static int afs_readpages(struct file *file, struct address_space *mapping,
236 struct list_head *pages, unsigned nr_pages) 242 struct list_head *pages, unsigned nr_pages)
237{ 243{
244 struct key *key = file->private_data;
238 struct afs_vnode *vnode; 245 struct afs_vnode *vnode;
239 int ret = 0; 246 int ret = 0;
240 247
241 _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); 248 _enter("{%d},{%lu},,%d",
249 key_serial(key), mapping->host->i_ino, nr_pages);
250
251 ASSERT(key != NULL);
242 252
243 vnode = AFS_FS_I(mapping->host); 253 vnode = AFS_FS_I(mapping->host);
244 if (vnode->flags & AFS_VNODE_DELETED) { 254 if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
279 } 289 }
280 290
281 /* load the missing pages from the network */ 291 /* load the missing pages from the network */
282 ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); 292 ret = read_cache_pages(mapping, pages, afs_page_filler, key);
283 293
284 _leave(" = %d [netting]", ret); 294 _leave(" = %d [netting]", ret);
285 return ret; 295 return ret;
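
Splitting the filler out of afs_readpage() lets any caller that holds a key read pages through the pagecache without faking up a struct file. Sketch of the two call shapes this enables, using the names from the hunks above:

    /* single page (directory or symlink contents) */
    page = read_cache_page(mapping, index, afs_page_filler, key);
    if (IS_ERR(page))
            return PTR_ERR(page);

    /* readahead over a list of pages */
    ret = read_cache_pages(mapping, pages, afs_page_filler, key);
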
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d00b312e3110..0747339011c3 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/mount.h>
23#include <linux/namei.h>
22#include "internal.h" 24#include "internal.h"
23 25
24struct afs_iget_data { 26struct afs_iget_data {
@@ -102,6 +104,16 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
102} 104}
103 105
104/* 106/*
107 * iget5() comparator for inode created by autocell operations
108 *
109 * These pseudo inodes don't match anything.
110 */
111static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
112{
113 return 0;
114}
115
116/*
105 * iget5() inode initialiser 117 * iget5() inode initialiser
106 */ 118 */
107static int afs_iget5_set(struct inode *inode, void *opaque) 119static int afs_iget5_set(struct inode *inode, void *opaque)
@@ -118,6 +130,67 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
118} 130}
119 131
120/* 132/*
133 * inode retrieval for autocell
134 */
135struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
136 int namesz, struct key *key)
137{
138 struct afs_iget_data data;
139 struct afs_super_info *as;
140 struct afs_vnode *vnode;
141 struct super_block *sb;
142 struct inode *inode;
143 static atomic_t afs_autocell_ino;
144
145 _enter("{%x:%u},%*.*s,",
146 AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode,
147 namesz, namesz, dev_name ?: "");
148
149 sb = dir->i_sb;
150 as = sb->s_fs_info;
151 data.volume = as->volume;
152 data.fid.vid = as->volume->vid;
153 data.fid.unique = 0;
154 data.fid.vnode = 0;
155
156 inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
157 afs_iget5_autocell_test, afs_iget5_set,
158 &data);
159 if (!inode) {
160 _leave(" = -ENOMEM");
161 return ERR_PTR(-ENOMEM);
162 }
163
164 _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
165 inode, inode->i_ino, data.fid.vid, data.fid.vnode,
166 data.fid.unique);
167
168 vnode = AFS_FS_I(inode);
169
170 /* there shouldn't be an existing inode */
171 BUG_ON(!(inode->i_state & I_NEW));
172
173 inode->i_size = 0;
174 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
175 inode->i_op = &afs_autocell_inode_operations;
176 inode->i_nlink = 2;
177 inode->i_uid = 0;
178 inode->i_gid = 0;
179 inode->i_ctime.tv_sec = get_seconds();
180 inode->i_ctime.tv_nsec = 0;
181 inode->i_atime = inode->i_mtime = inode->i_ctime;
182 inode->i_blocks = 0;
183 inode->i_version = 0;
184 inode->i_generation = 0;
185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME;
188 unlock_new_inode(inode);
189 _leave(" = %p", inode);
190 return inode;
191}
192
193/*
121 * inode retrieval 194 * inode retrieval
122 */ 195 */
123struct inode *afs_iget(struct super_block *sb, struct key *key, 196struct inode *afs_iget(struct super_block *sb, struct key *key,
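
The always-failing iget5_locked() comparator is deliberate: no existing inode can ever match, so every call allocates a fresh in-core inode, keyed here by a private increasing counter. The pattern, condensed from the hunk above:

    static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
    {
            return 0;       /* never match: force a new inode */
    }

    inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
                         afs_iget5_autocell_test, afs_iget5_set, &data);
    BUG_ON(!(inode->i_state & I_NEW));      /* so it cannot be pre-existing */
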
@@ -314,9 +387,22 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
314} 387}
315 388
316/* 389/*
390 * discard an AFS inode
391 */
392int afs_drop_inode(struct inode *inode)
393{
394 _enter("");
395
396 if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
397 return generic_delete_inode(inode);
398 else
399 return generic_drop_inode(inode);
400}
401
402/*
317 * clear an AFS inode 403 * clear an AFS inode
318 */ 404 */
319void afs_clear_inode(struct inode *inode) 405void afs_evict_inode(struct inode *inode)
320{ 406{
321 struct afs_permits *permits; 407 struct afs_permits *permits;
322 struct afs_vnode *vnode; 408 struct afs_vnode *vnode;
@@ -335,6 +421,9 @@ void afs_clear_inode(struct inode *inode)
335 421
336 ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); 422 ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
337 423
424 truncate_inode_pages(&inode->i_data, 0);
425 end_writeback(inode);
426
338 afs_give_up_callback(vnode); 427 afs_give_up_callback(vnode);
339 428
340 if (vnode->server) { 429 if (vnode->server) {
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a10f2582844f..cca8eef736fc 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -42,6 +42,7 @@ typedef enum {
42struct afs_mount_params { 42struct afs_mount_params {
43 bool rwpath; /* T if the parent should be considered R/W */ 43 bool rwpath; /* T if the parent should be considered R/W */
44 bool force; /* T to force cell type */ 44 bool force; /* T to force cell type */
45 bool autocell; /* T if set auto mount operation */
45 afs_voltype_t type; /* type of volume requested */ 46 afs_voltype_t type; /* type of volume requested */
46 int volnamesz; /* size of volume name */ 47 int volnamesz; /* size of volume name */
47 const char *volname; /* name of volume to mount */ 48 const char *volname; /* name of volume to mount */
@@ -358,6 +359,8 @@ struct afs_vnode {
358#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ 359#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */
359#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ 360#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */
360#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ 361#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */
362#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */
363#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */
361 364
362 long acl_order; /* ACL check count (callback break count) */ 365 long acl_order; /* ACL check count (callback break count) */
363 366
@@ -468,8 +471,8 @@ extern struct list_head afs_proc_cells;
468 471
469#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) 472#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
470extern int afs_cell_init(char *); 473extern int afs_cell_init(char *);
471extern struct afs_cell *afs_cell_create(const char *, char *); 474extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
472extern struct afs_cell *afs_cell_lookup(const char *, unsigned); 475extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
473extern struct afs_cell *afs_grab_cell(struct afs_cell *); 476extern struct afs_cell *afs_grab_cell(struct afs_cell *);
474extern void afs_put_cell(struct afs_cell *); 477extern void afs_put_cell(struct afs_cell *);
475extern void afs_cell_purge(void); 478extern void afs_cell_purge(void);
@@ -494,6 +497,7 @@ extern const struct file_operations afs_file_operations;
494 497
495extern int afs_open(struct inode *, struct file *); 498extern int afs_open(struct inode *, struct file *);
496extern int afs_release(struct inode *, struct file *); 499extern int afs_release(struct inode *, struct file *);
500extern int afs_page_filler(void *, struct page *);
497 501
498/* 502/*
499 * flock.c 503 * flock.c
@@ -557,6 +561,8 @@ extern int afs_fs_release_lock(struct afs_server *, struct key *,
557/* 561/*
558 * inode.c 562 * inode.c
559 */ 563 */
564extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
565 struct key *);
560extern struct inode *afs_iget(struct super_block *, struct key *, 566extern struct inode *afs_iget(struct super_block *, struct key *,
561 struct afs_fid *, struct afs_file_status *, 567 struct afs_fid *, struct afs_file_status *,
562 struct afs_callback *); 568 struct afs_callback *);
@@ -564,7 +570,8 @@ extern void afs_zap_data(struct afs_vnode *);
564extern int afs_validate(struct afs_vnode *, struct key *); 570extern int afs_validate(struct afs_vnode *, struct key *);
565extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 571extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
566extern int afs_setattr(struct dentry *, struct iattr *); 572extern int afs_setattr(struct dentry *, struct iattr *);
567extern void afs_clear_inode(struct inode *); 573extern void afs_evict_inode(struct inode *);
574extern int afs_drop_inode(struct inode *);
568 575
569/* 576/*
570 * main.c 577 * main.c
@@ -580,6 +587,7 @@ extern int afs_abort_to_error(u32);
580 * mntpt.c 587 * mntpt.c
581 */ 588 */
582extern const struct inode_operations afs_mntpt_inode_operations; 589extern const struct inode_operations afs_mntpt_inode_operations;
590extern const struct inode_operations afs_autocell_inode_operations;
583extern const struct file_operations afs_mntpt_file_operations; 591extern const struct file_operations afs_mntpt_file_operations;
584 592
585extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); 593extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
@@ -739,7 +747,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
739extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 747extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
740 unsigned long, loff_t); 748 unsigned long, loff_t);
741extern int afs_writeback_all(struct afs_vnode *); 749extern int afs_writeback_all(struct afs_vnode *);
742extern int afs_fsync(struct file *, struct dentry *, int); 750extern int afs_fsync(struct file *, int);
743 751
744 752
745/*****************************************************************************/ 753/*****************************************************************************/
@@ -751,12 +759,6 @@ extern unsigned afs_debug;
751#define dbgprintk(FMT,...) \ 759#define dbgprintk(FMT,...) \
752 printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) 760 printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
753 761
754/* make sure we maintain the format strings, even when debugging is disabled */
755static inline __attribute__((format(printf,1,2)))
756void _dbprintk(const char *fmt, ...)
757{
758}
759
760#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) 762#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
761#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) 763#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
762#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) 764#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
@@ -791,9 +793,9 @@ do { \
791} while (0) 793} while (0)
792 794
793#else 795#else
794#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) 796#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
795#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) 797#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
796#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__) 798#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__)
797#endif 799#endif
798 800
799/* 801/*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 66d54d348c55..cfd1cbe25b22 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -111,6 +111,8 @@ static int __init afs_init(void)
111 111
112 /* initialise the callback update process */ 112 /* initialise the callback update process */
113 ret = afs_callback_update_init(); 113 ret = afs_callback_update_init();
114 if (ret < 0)
115 goto error_callback_update_init;
114 116
115 /* create the RxRPC transport */ 117 /* create the RxRPC transport */
116 ret = afs_open_socket(); 118 ret = afs_open_socket();
@@ -127,15 +129,16 @@ static int __init afs_init(void)
127error_fs: 129error_fs:
128 afs_close_socket(); 130 afs_close_socket();
129error_open_socket: 131error_open_socket:
132 afs_callback_update_kill();
133error_callback_update_init:
134 afs_vlocation_purge();
130error_vl_update_init: 135error_vl_update_init:
136 afs_cell_purge();
131error_cell_init: 137error_cell_init:
132#ifdef CONFIG_AFS_FSCACHE 138#ifdef CONFIG_AFS_FSCACHE
133 fscache_unregister_netfs(&afs_cache_netfs); 139 fscache_unregister_netfs(&afs_cache_netfs);
134error_cache: 140error_cache:
135#endif 141#endif
136 afs_callback_update_kill();
137 afs_vlocation_purge();
138 afs_cell_purge();
139 afs_proc_cleanup(); 142 afs_proc_cleanup();
140 rcu_barrier(); 143 rcu_barrier();
141 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 144 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
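
The relocated labels restore the invariant that the error path unwinds exactly what has been initialised so far, in reverse order, instead of also tearing down subsystems that never came up. The general shape, sketched with hypothetical steps:

    ret = step_a();
    if (ret < 0)
            goto error_a;
    ret = step_b();         /* e.g. afs_callback_update_init() */
    if (ret < 0)
            goto error_b;
    return 0;

    error_b:
            undo_a();       /* unwind strictly in reverse of init order */
    error_a:
            return ret;
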
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b3feddc4f7d6..6d552686c498 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -38,6 +38,11 @@ const struct inode_operations afs_mntpt_inode_operations = {
38 .getattr = afs_getattr, 38 .getattr = afs_getattr,
39}; 39};
40 40
41const struct inode_operations afs_autocell_inode_operations = {
42 .follow_link = afs_mntpt_follow_link,
43 .getattr = afs_getattr,
44};
45
41static LIST_HEAD(afs_vfsmounts); 46static LIST_HEAD(afs_vfsmounts);
42static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); 47static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
43 48
@@ -49,9 +54,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
49 */ 54 */
50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) 55int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
51{ 56{
52 struct file file = {
53 .private_data = key,
54 };
55 struct page *page; 57 struct page *page;
56 size_t size; 58 size_t size;
57 char *buf; 59 char *buf;
@@ -61,7 +63,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
61 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); 63 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
62 64
63 /* read the contents of the symlink into the pagecache */ 65 /* read the contents of the symlink into the pagecache */
64 page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); 66 page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
67 afs_page_filler, key);
65 if (IS_ERR(page)) { 68 if (IS_ERR(page)) {
66 ret = PTR_ERR(page); 69 ret = PTR_ERR(page);
67 goto out; 70 goto out;
@@ -138,20 +141,16 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
138{ 141{
139 struct afs_super_info *super; 142 struct afs_super_info *super;
140 struct vfsmount *mnt; 143 struct vfsmount *mnt;
144 struct afs_vnode *vnode;
141 struct page *page; 145 struct page *page;
142 size_t size; 146 char *devname, *options;
143 char *buf, *devname, *options; 147 bool rwpath = false;
144 int ret; 148 int ret;
145 149
146 _enter("{%s}", mntpt->d_name.name); 150 _enter("{%s}", mntpt->d_name.name);
147 151
148 BUG_ON(!mntpt->d_inode); 152 BUG_ON(!mntpt->d_inode);
149 153
150 ret = -EINVAL;
151 size = mntpt->d_inode->i_size;
152 if (size > PAGE_SIZE - 1)
153 goto error_no_devname;
154
155 ret = -ENOMEM; 154 ret = -ENOMEM;
156 devname = (char *) get_zeroed_page(GFP_KERNEL); 155 devname = (char *) get_zeroed_page(GFP_KERNEL);
157 if (!devname) 156 if (!devname)
@@ -161,28 +160,59 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
161 if (!options) 160 if (!options)
162 goto error_no_options; 161 goto error_no_options;
163 162
164 /* read the contents of the AFS special symlink */ 163 vnode = AFS_FS_I(mntpt->d_inode);
165 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); 164 if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
166 if (IS_ERR(page)) { 165 /* if the directory is a pseudo directory, use the d_name */
167 ret = PTR_ERR(page); 166 static const char afs_root_cell[] = ":root.cell.";
168 goto error_no_page; 167 unsigned size = mntpt->d_name.len;
168
169 ret = -ENOENT;
170 if (size < 2 || size > AFS_MAXCELLNAME)
171 goto error_no_page;
172
173 if (mntpt->d_name.name[0] == '.') {
174 devname[0] = '#';
175 memcpy(devname + 1, mntpt->d_name.name, size - 1);
176 memcpy(devname + size, afs_root_cell,
177 sizeof(afs_root_cell));
178 rwpath = true;
179 } else {
180 devname[0] = '%';
181 memcpy(devname + 1, mntpt->d_name.name, size);
182 memcpy(devname + size + 1, afs_root_cell,
183 sizeof(afs_root_cell));
184 }
185 } else {
186 /* read the contents of the AFS special symlink */
187 loff_t size = i_size_read(mntpt->d_inode);
188 char *buf;
189
190 ret = -EINVAL;
191 if (size > PAGE_SIZE - 1)
192 goto error_no_page;
193
194 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
195 if (IS_ERR(page)) {
196 ret = PTR_ERR(page);
197 goto error_no_page;
198 }
199
200 ret = -EIO;
201 if (PageError(page))
202 goto error;
203
204 buf = kmap_atomic(page, KM_USER0);
205 memcpy(devname, buf, size);
206 kunmap_atomic(buf, KM_USER0);
207 page_cache_release(page);
208 page = NULL;
169 } 209 }
170 210
171 ret = -EIO;
172 if (PageError(page))
173 goto error;
174
175 buf = kmap_atomic(page, KM_USER0);
176 memcpy(devname, buf, size);
177 kunmap_atomic(buf, KM_USER0);
178 page_cache_release(page);
179 page = NULL;
180
181 /* work out what options we want */ 211 /* work out what options we want */
182 super = AFS_FS_S(mntpt->d_sb); 212 super = AFS_FS_S(mntpt->d_sb);
183 memcpy(options, "cell=", 5); 213 memcpy(options, "cell=", 5);
184 strcpy(options + 5, super->volume->cell->name); 214 strcpy(options + 5, super->volume->cell->name);
185 if (super->volume->type == AFSVL_RWVOL) 215 if (super->volume->type == AFSVL_RWVOL || rwpath)
186 strcat(options, ",rwpath"); 216 strcat(options, ",rwpath");
187 217
188 /* try and do the mount */ 218 /* try and do the mount */
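
For pseudo directories the device name is now synthesised from the dentry name instead of being read from a mountpoint symlink: a name with a leading '.' produces a '#'-prefixed device name and sets rwpath, any other name a '%'-prefixed one, both suffixed with ":root.cell.". A worked example for the non-dot branch (illustrative cell name):

    /* lookup of "example.org" under an autocell root builds, per the
     * code above: devname = "%example.org:root.cell." */
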
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 852739d262a9..096b23f821a1 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -294,7 +294,7 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
294 if (strcmp(kbuf, "add") == 0) { 294 if (strcmp(kbuf, "add") == 0) {
295 struct afs_cell *cell; 295 struct afs_cell *cell;
296 296
297 cell = afs_cell_create(name, args); 297 cell = afs_cell_create(name, strlen(name), args, false);
298 if (IS_ERR(cell)) { 298 if (IS_ERR(cell)) {
299 ret = PTR_ERR(cell); 299 ret = PTR_ERR(cell);
300 goto done; 300 goto done;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 67cf810e0fd6..654d8fdbf01f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -100,6 +100,7 @@ int afs_open_socket(void)
100 ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); 100 ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
101 if (ret < 0) { 101 if (ret < 0) {
102 sock_release(socket); 102 sock_release(socket);
103 destroy_workqueue(afs_async_calls);
103 _leave(" = %d [bind]", ret); 104 _leave(" = %d [bind]", ret);
104 return ret; 105 return ret;
105 } 106 }
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f49099516675..9fdc7fe3a7bc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
91 91
92 memcpy(&server->addr, addr, sizeof(struct in_addr)); 92 memcpy(&server->addr, addr, sizeof(struct in_addr));
93 server->addr.s_addr = addr->s_addr; 93 server->addr.s_addr = addr->s_addr;
94 _leave(" = %p{%d}", server, atomic_read(&server->usage));
95 } else {
96 _leave(" = NULL [nomem]");
94 } 97 }
95
96 _leave(" = %p{%d}", server, atomic_read(&server->usage));
97 return server; 98 return server;
98} 99}
99 100
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e932e5a3a0c1..77e1e5a61154 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -16,6 +16,7 @@
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/mount.h>
19#include <linux/init.h> 20#include <linux/init.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
21#include <linux/smp_lock.h> 22#include <linux/smp_lock.h>
@@ -48,8 +49,9 @@ struct file_system_type afs_fs_type = {
48static const struct super_operations afs_super_ops = { 49static const struct super_operations afs_super_ops = {
49 .statfs = afs_statfs, 50 .statfs = afs_statfs,
50 .alloc_inode = afs_alloc_inode, 51 .alloc_inode = afs_alloc_inode,
52 .drop_inode = afs_drop_inode,
51 .destroy_inode = afs_destroy_inode, 53 .destroy_inode = afs_destroy_inode,
52 .clear_inode = afs_clear_inode, 54 .evict_inode = afs_evict_inode,
53 .put_super = afs_put_super, 55 .put_super = afs_put_super,
54 .show_options = generic_show_options, 56 .show_options = generic_show_options,
55}; 57};
@@ -62,12 +64,14 @@ enum {
62 afs_opt_cell, 64 afs_opt_cell,
63 afs_opt_rwpath, 65 afs_opt_rwpath,
64 afs_opt_vol, 66 afs_opt_vol,
67 afs_opt_autocell,
65}; 68};
66 69
67static const match_table_t afs_options_list = { 70static const match_table_t afs_options_list = {
68 { afs_opt_cell, "cell=%s" }, 71 { afs_opt_cell, "cell=%s" },
69 { afs_opt_rwpath, "rwpath" }, 72 { afs_opt_rwpath, "rwpath" },
70 { afs_opt_vol, "vol=%s" }, 73 { afs_opt_vol, "vol=%s" },
74 { afs_opt_autocell, "autocell" },
71 { afs_no_opt, NULL }, 75 { afs_no_opt, NULL },
72}; 76};
73 77
@@ -151,7 +155,8 @@ static int afs_parse_options(struct afs_mount_params *params,
151 switch (token) { 155 switch (token) {
152 case afs_opt_cell: 156 case afs_opt_cell:
153 cell = afs_cell_lookup(args[0].from, 157 cell = afs_cell_lookup(args[0].from,
154 args[0].to - args[0].from); 158 args[0].to - args[0].from,
159 false);
155 if (IS_ERR(cell)) 160 if (IS_ERR(cell))
156 return PTR_ERR(cell); 161 return PTR_ERR(cell);
157 afs_put_cell(params->cell); 162 afs_put_cell(params->cell);
@@ -166,6 +171,10 @@ static int afs_parse_options(struct afs_mount_params *params,
166 *devname = args[0].from; 171 *devname = args[0].from;
167 break; 172 break;
168 173
174 case afs_opt_autocell:
175 params->autocell = 1;
176 break;
177
169 default: 178 default:
170 printk(KERN_ERR "kAFS:" 179 printk(KERN_ERR "kAFS:"
171 " Unknown or invalid mount option: '%s'\n", p); 180 " Unknown or invalid mount option: '%s'\n", p);
@@ -252,10 +261,10 @@ static int afs_parse_device_name(struct afs_mount_params *params,
252 261
253 /* lookup the cell record */ 262 /* lookup the cell record */
254 if (cellname || !params->cell) { 263 if (cellname || !params->cell) {
255 cell = afs_cell_lookup(cellname, cellnamesz); 264 cell = afs_cell_lookup(cellname, cellnamesz, true);
256 if (IS_ERR(cell)) { 265 if (IS_ERR(cell)) {
257 printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n", 266 printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
258 cellname ?: ""); 267 cellnamesz, cellnamesz, cellname ?: "");
259 return PTR_ERR(cell); 268 return PTR_ERR(cell);
260 } 269 }
261 afs_put_cell(params->cell); 270 afs_put_cell(params->cell);
@@ -321,6 +330,9 @@ static int afs_fill_super(struct super_block *sb, void *data)
321 if (IS_ERR(inode)) 330 if (IS_ERR(inode))
322 goto error_inode; 331 goto error_inode;
323 332
333 if (params->autocell)
334 set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
335
324 ret = -ENOMEM; 336 ret = -ENOMEM;
325 root = d_alloc_root(inode); 337 root = d_alloc_root(inode);
326 if (!root) 338 if (!root)
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..722743b152d8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
680{ 680{
681 struct address_space *mapping = vnode->vfs_inode.i_mapping; 681 struct address_space *mapping = vnode->vfs_inode.i_mapping;
682 struct writeback_control wbc = { 682 struct writeback_control wbc = {
683 .bdi = mapping->backing_dev_info,
684 .sync_mode = WB_SYNC_ALL, 683 .sync_mode = WB_SYNC_ALL,
685 .nr_to_write = LONG_MAX, 684 .nr_to_write = LONG_MAX,
686 .range_cyclic = 1, 685 .range_cyclic = 1,
@@ -701,8 +700,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
701 * - the return status from this call provides a reliable indication of 700 * - the return status from this call provides a reliable indication of
702 * whether any write errors occurred for this process. 701 * whether any write errors occurred for this process.
703 */ 702 */
704int afs_fsync(struct file *file, struct dentry *dentry, int datasync) 703int afs_fsync(struct file *file, int datasync)
705{ 704{
705 struct dentry *dentry = file->f_path.dentry;
706 struct afs_writeback *wb, *xwb; 706 struct afs_writeback *wb, *xwb;
707 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 707 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
708 int ret; 708 int ret;
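
The prototype change reflects this cycle's VFS-wide removal of the dentry argument from ->fsync(); implementations now derive it from the file, as the hunk above does. Minimal sketch of the new shape (hypothetical filesystem):

    static int example_fsync(struct file *file, int datasync)
    {
            struct dentry *dentry = file->f_path.dentry;
            struct inode *inode = dentry->d_inode;

            /* ... write back dirty pages and metadata for inode ... */
            return 0;
    }
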
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..250b0a73c8a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h> 37#include <linux/mempool.h>
38#include <linux/hash.h> 38#include <linux/hash.h>
39#include <linux/compat.h>
39 40
40#include <asm/kmap_types.h> 41#include <asm/kmap_types.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
526 527
527 /* Complete the fput(s) */ 528 /* Complete the fput(s) */
528 if (req->ki_filp != NULL) 529 if (req->ki_filp != NULL)
529 __fput(req->ki_filp); 530 fput(req->ki_filp);
530 531
531 /* Link the iocb into the context's free list */ 532 /* Link the iocb into the context's free list */
532 spin_lock_irq(&ctx->ctx_lock); 533 spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
559 560
560 /* 561 /*
561 * Try to optimize the aio and eventfd file* puts, by avoiding to 562 * Try to optimize the aio and eventfd file* puts, by avoiding to
562 * schedule work in case it is not __fput() time. In normal cases, 563 * schedule work in case it is not final fput() time. In normal cases,
563 * we would not be holding the last reference to the file*, so 564 * we would not be holding the last reference to the file*, so
564 * this function will be executed w/out any aio kthread wakeup. 565 * this function will be executed w/out any aio kthread wakeup.
565 */ 566 */
566 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { 567 if (unlikely(!fput_atomic(req->ki_filp))) {
567 get_ioctx(ctx); 568 get_ioctx(ctx);
568 spin_lock(&fput_lock); 569 spin_lock(&fput_lock);
569 list_add(&req->ki_list, &fput_head); 570 list_add(&req->ki_list, &fput_head);
@@ -711,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
711 */ 712 */
712 ret = retry(iocb); 713 ret = retry(iocb);
713 714
714 if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) 715 if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
716 /*
 717 * There's no easy way to restart the syscall since other AIOs
 718 * may already be running. Just fail this IO with EINTR.
719 */
720 if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
721 ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
722 ret = -EINTR;
715 aio_complete(iocb, ret, 0); 723 aio_complete(iocb, ret, 0);
724 }
716out: 725out:
717 spin_lock_irq(&ctx->ctx_lock); 726 spin_lock_irq(&ctx->ctx_lock);
718 727
@@ -1276,7 +1285,7 @@ out:
1276/* sys_io_destroy: 1285/* sys_io_destroy:
1277 * Destroy the aio_context specified. May cancel any outstanding 1286 * Destroy the aio_context specified. May cancel any outstanding
1278 * AIOs and block on completion. Will fail with -ENOSYS if not 1287 * AIOs and block on completion. Will fail with -ENOSYS if not
1279 * implemented. May fail with -EFAULT if the context pointed to 1288 * implemented. May fail with -EINVAL if the context pointed to
1280 * is invalid. 1289 * is invalid.
1281 */ 1290 */
1282SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) 1291SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
@@ -1384,13 +1393,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
1384 return ret; 1393 return ret;
1385} 1394}
1386 1395
1387static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) 1396static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1388{ 1397{
1389 ssize_t ret; 1398 ssize_t ret;
1390 1399
1391 ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, 1400#ifdef CONFIG_COMPAT
1392 kiocb->ki_nbytes, 1, 1401 if (compat)
1393 &kiocb->ki_inline_vec, &kiocb->ki_iovec); 1402 ret = compat_rw_copy_check_uvector(type,
1403 (struct compat_iovec __user *)kiocb->ki_buf,
1404 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1405 &kiocb->ki_iovec);
1406 else
1407#endif
1408 ret = rw_copy_check_uvector(type,
1409 (struct iovec __user *)kiocb->ki_buf,
1410 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1411 &kiocb->ki_iovec);
1394 if (ret < 0) 1412 if (ret < 0)
1395 goto out; 1413 goto out;
1396 1414
@@ -1420,7 +1438,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1420 * Performs the initial checks and aio retry method 1438 * Performs the initial checks and aio retry method
1421 * setup for the kiocb at the time of io submission. 1439 * setup for the kiocb at the time of io submission.
1422 */ 1440 */
1423static ssize_t aio_setup_iocb(struct kiocb *kiocb) 1441static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1424{ 1442{
1425 struct file *file = kiocb->ki_filp; 1443 struct file *file = kiocb->ki_filp;
1426 ssize_t ret = 0; 1444 ssize_t ret = 0;
@@ -1469,7 +1487,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1469 ret = security_file_permission(file, MAY_READ); 1487 ret = security_file_permission(file, MAY_READ);
1470 if (unlikely(ret)) 1488 if (unlikely(ret))
1471 break; 1489 break;
1472 ret = aio_setup_vectored_rw(READ, kiocb); 1490 ret = aio_setup_vectored_rw(READ, kiocb, compat);
1473 if (ret) 1491 if (ret)
1474 break; 1492 break;
1475 ret = -EINVAL; 1493 ret = -EINVAL;
@@ -1483,7 +1501,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1483 ret = security_file_permission(file, MAY_WRITE); 1501 ret = security_file_permission(file, MAY_WRITE);
1484 if (unlikely(ret)) 1502 if (unlikely(ret))
1485 break; 1503 break;
1486 ret = aio_setup_vectored_rw(WRITE, kiocb); 1504 ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
1487 if (ret) 1505 if (ret)
1488 break; 1506 break;
1489 ret = -EINVAL; 1507 ret = -EINVAL;
@@ -1548,7 +1566,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
1548} 1566}
1549 1567
1550static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1568static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1551 struct iocb *iocb, struct hlist_head *batch_hash) 1569 struct iocb *iocb, struct hlist_head *batch_hash,
1570 bool compat)
1552{ 1571{
1553 struct kiocb *req; 1572 struct kiocb *req;
1554 struct file *file; 1573 struct file *file;
@@ -1609,7 +1628,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1609 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1628 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1610 req->ki_opcode = iocb->aio_lio_opcode; 1629 req->ki_opcode = iocb->aio_lio_opcode;
1611 1630
1612 ret = aio_setup_iocb(req); 1631 ret = aio_setup_iocb(req, compat);
1613 1632
1614 if (ret) 1633 if (ret)
1615 goto out_put_req; 1634 goto out_put_req;
@@ -1637,20 +1656,8 @@ out_put_req:
1637 return ret; 1656 return ret;
1638} 1657}
1639 1658
1640/* sys_io_submit: 1659long do_io_submit(aio_context_t ctx_id, long nr,
1641 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1660 struct iocb __user *__user *iocbpp, bool compat)
1642 * the number of iocbs queued. May return -EINVAL if the aio_context
1643 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1644 * *iocbpp[0] is not properly initialized, if the operation specified
1645 * is invalid for the file descriptor in the iocb. May fail with
1646 * -EFAULT if any of the data structures point to invalid data. May
1647 * fail with -EBADF if the file descriptor specified in the first
1648 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1649 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1650 * fail with -ENOSYS if not implemented.
1651 */
1652SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1653 struct iocb __user * __user *, iocbpp)
1654{ 1661{
1655 struct kioctx *ctx; 1662 struct kioctx *ctx;
1656 long ret = 0; 1663 long ret = 0;
@@ -1660,6 +1667,9 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1660 if (unlikely(nr < 0)) 1667 if (unlikely(nr < 0))
1661 return -EINVAL; 1668 return -EINVAL;
1662 1669
1670 if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
1671 nr = LONG_MAX/sizeof(*iocbpp);
1672
1663 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) 1673 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
1664 return -EFAULT; 1674 return -EFAULT;
1665 1675
@@ -1687,7 +1697,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1687 break; 1697 break;
1688 } 1698 }
1689 1699
1690 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); 1700 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
1691 if (ret) 1701 if (ret)
1692 break; 1702 break;
1693 } 1703 }
@@ -1697,6 +1707,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1697 return i ? i : ret; 1707 return i ? i : ret;
1698} 1708}
1699 1709
1710/* sys_io_submit:
1711 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
1712 * the number of iocbs queued. May return -EINVAL if the aio_context
1713 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1714 * *iocbpp[0] is not properly initialized, if the operation specified
1715 * is invalid for the file descriptor in the iocb. May fail with
1716 * -EFAULT if any of the data structures point to invalid data. May
1717 * fail with -EBADF if the file descriptor specified in the first
1718 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1719 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1720 * fail with -ENOSYS if not implemented.
1721 */
1722SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1723 struct iocb __user * __user *, iocbpp)
1724{
1725 return do_io_submit(ctx_id, nr, iocbpp, 0);
1726}
1727
1700/* lookup_kiocb 1728/* lookup_kiocb
1701 * Finds a given iocb for cancellation. 1729 * Finds a given iocb for cancellation.
1702 */ 1730 */
@@ -1778,15 +1806,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
1778 1806
1779/* io_getevents: 1807/* io_getevents:
1780 * Attempts to read at least min_nr events and up to nr events from 1808 * Attempts to read at least min_nr events and up to nr events from
1781 * the completion queue for the aio_context specified by ctx_id. May 1809 * the completion queue for the aio_context specified by ctx_id. If
1782 * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, 1810 * it succeeds, the number of read events is returned. May fail with
1783 * if nr is out of range, if when is out of range. May fail with 1811 * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
1784 * -EFAULT if any of the memory specified to is invalid. May return 1812 * out of range, if timeout is out of range. May fail with -EFAULT
1785 * 0 or < min_nr if no events are available and the timeout specified 1813 * if any of the memory specified is invalid. May return 0 or
1786 * by when has elapsed, where when == NULL specifies an infinite 1814 * < min_nr if the timeout specified by timeout has elapsed
1787 * timeout. Note that the timeout pointed to by when is relative and 1815 * before sufficient events are available, where timeout == NULL
1788 * will be updated if not NULL and the operation blocks. Will fail 1816 * specifies an infinite timeout. Note that the timeout pointed to by
1789 * with -ENOSYS if not implemented. 1817 * timeout is relative and will be updated if not NULL and the
1818 * operation blocks. Will fail with -ENOSYS if not implemented.
1790 */ 1819 */
1791SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, 1820SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1792 long, min_nr, 1821 long, min_nr,
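
The io_submit()/io_getevents() comments above describe the userspace contract; a minimal userspace sketch exercising it through the raw syscalls (error handling pared down):

    #define _GNU_SOURCE
    #include <linux/aio_abi.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
            aio_context_t ctx = 0;
            struct iocb cb, *cbs[1] = { &cb };
            struct io_event ev;
            char buf[4096];
            int fd = open("/etc/hostname", O_RDONLY);       /* any readable file */

            if (fd < 0 || syscall(__NR_io_setup, 1, &ctx) < 0)
                    return 1;

            memset(&cb, 0, sizeof(cb));
            cb.aio_fildes = fd;
            cb.aio_lio_opcode = IOCB_CMD_PREAD;
            cb.aio_buf = (unsigned long)buf;
            cb.aio_nbytes = sizeof(buf);

            /* returns the number of iocbs queued: 1 here on success */
            if (syscall(__NR_io_submit, ctx, 1, cbs) != 1)
                    return 1;

            /* NULL timeout: block until at least min_nr events arrive */
            if (syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
                    printf("read %lld bytes\n", (long long)ev.res);

            syscall(__NR_io_destroy, ctx);
            return 0;
    }
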
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..7ca41811afa1 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -14,35 +14,53 @@
14#include <linux/fcntl.h> 14#include <linux/fcntl.h>
15#include <linux/security.h> 15#include <linux/security.h>
16 16
17/* Taken over from the old code... */ 17/**
18 18 * inode_change_ok - check if attribute changes to an inode are allowed
19/* POSIX UID/GID verification for setting inode attributes. */ 19 * @inode: inode to check
20 * @attr: attributes to change
21 *
22 * Check if we are allowed to change the attributes contained in @attr
23 * in the given inode. This includes the normal unix access permission
24 * checks, as well as checks for rlimits and others.
25 *
26 * Should be called as the first thing in ->setattr implementations,
27 * possibly after taking additional locks.
28 */
20int inode_change_ok(const struct inode *inode, struct iattr *attr) 29int inode_change_ok(const struct inode *inode, struct iattr *attr)
21{ 30{
22 int retval = -EPERM;
23 unsigned int ia_valid = attr->ia_valid; 31 unsigned int ia_valid = attr->ia_valid;
24 32
33 /*
 34 * First check size constraints. These can't be overridden using
35 * ATTR_FORCE.
36 */
37 if (ia_valid & ATTR_SIZE) {
38 int error = inode_newsize_ok(inode, attr->ia_size);
39 if (error)
40 return error;
41 }
42
25 /* If force is set do it anyway. */ 43 /* If force is set do it anyway. */
26 if (ia_valid & ATTR_FORCE) 44 if (ia_valid & ATTR_FORCE)
27 goto fine; 45 return 0;
28 46
29 /* Make sure a caller can chown. */ 47 /* Make sure a caller can chown. */
30 if ((ia_valid & ATTR_UID) && 48 if ((ia_valid & ATTR_UID) &&
31 (current_fsuid() != inode->i_uid || 49 (current_fsuid() != inode->i_uid ||
32 attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) 50 attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
33 goto error; 51 return -EPERM;
34 52
35 /* Make sure caller can chgrp. */ 53 /* Make sure caller can chgrp. */
36 if ((ia_valid & ATTR_GID) && 54 if ((ia_valid & ATTR_GID) &&
37 (current_fsuid() != inode->i_uid || 55 (current_fsuid() != inode->i_uid ||
38 (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) && 56 (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
39 !capable(CAP_CHOWN)) 57 !capable(CAP_CHOWN))
40 goto error; 58 return -EPERM;
41 59
42 /* Make sure a caller can chmod. */ 60 /* Make sure a caller can chmod. */
43 if (ia_valid & ATTR_MODE) { 61 if (ia_valid & ATTR_MODE) {
44 if (!is_owner_or_cap(inode)) 62 if (!is_owner_or_cap(inode))
45 goto error; 63 return -EPERM;
46 /* Also check the setgid bit! */ 64 /* Also check the setgid bit! */
47 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
48 inode->i_gid) && !capable(CAP_FSETID)) 66 inode->i_gid) && !capable(CAP_FSETID))
@@ -52,12 +70,10 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
52 /* Check for setting the inode time. */ 70 /* Check for setting the inode time. */
53 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { 71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
54 if (!is_owner_or_cap(inode)) 72 if (!is_owner_or_cap(inode))
55 goto error; 73 return -EPERM;
56 } 74 }
57fine: 75
58 retval = 0; 76 return 0;
59error:
60 return retval;
61} 77}
62EXPORT_SYMBOL(inode_change_ok); 78EXPORT_SYMBOL(inode_change_ok);
63 79
@@ -67,14 +83,14 @@ EXPORT_SYMBOL(inode_change_ok);
67 * @offset: the new size to assign to the inode 83 * @offset: the new size to assign to the inode
68 * @Returns: 0 on success, -ve errno on failure 84 * @Returns: 0 on success, -ve errno on failure
69 * 85 *
86 * inode_newsize_ok must be called with i_mutex held.
87 *
70 * inode_newsize_ok will check filesystem limits and ulimits to check that the 88 * inode_newsize_ok will check filesystem limits and ulimits to check that the
71 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ 89 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
72 * when necessary. Caller must not proceed with inode size change if failure is 90 * when necessary. Caller must not proceed with inode size change if failure is
73 * returned. @inode must be a file (not directory), with appropriate 91 * returned. @inode must be a file (not directory), with appropriate
74 * permissions to allow truncate (inode_newsize_ok does NOT check these 92 * permissions to allow truncate (inode_newsize_ok does NOT check these
75 * conditions). 93 * conditions).
76 *
77 * inode_newsize_ok must be called with i_mutex held.
78 */ 94 */
79int inode_newsize_ok(const struct inode *inode, loff_t offset) 95int inode_newsize_ok(const struct inode *inode, loff_t offset)
80{ 96{
@@ -104,17 +120,25 @@ out_big:
104} 120}
105EXPORT_SYMBOL(inode_newsize_ok); 121EXPORT_SYMBOL(inode_newsize_ok);
106 122
107int inode_setattr(struct inode * inode, struct iattr * attr) 123/**
124 * setattr_copy - copy simple metadata updates into the generic inode
125 * @inode: the inode to be updated
126 * @attr: the new attributes
127 *
128 * setattr_copy must be called with i_mutex held.
129 *
130 * setattr_copy updates the inode's metadata with that specified
 131 * in attr. Noticeably missing is inode size update, which is more complex
132 * as it requires pagecache updates.
133 *
134 * The inode is not marked as dirty after this operation. The rationale is
135 * that for "simple" filesystems, the struct inode is the inode storage.
136 * The caller is free to mark the inode dirty afterwards if needed.
137 */
138void setattr_copy(struct inode *inode, const struct iattr *attr)
108{ 139{
109 unsigned int ia_valid = attr->ia_valid; 140 unsigned int ia_valid = attr->ia_valid;
110 141
111 if (ia_valid & ATTR_SIZE &&
112 attr->ia_size != i_size_read(inode)) {
113 int error = vmtruncate(inode, attr->ia_size);
114 if (error)
115 return error;
116 }
117
118 if (ia_valid & ATTR_UID) 142 if (ia_valid & ATTR_UID)
119 inode->i_uid = attr->ia_uid; 143 inode->i_uid = attr->ia_uid;
120 if (ia_valid & ATTR_GID) 144 if (ia_valid & ATTR_GID)
@@ -135,11 +159,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
135 mode &= ~S_ISGID; 159 mode &= ~S_ISGID;
136 inode->i_mode = mode; 160 inode->i_mode = mode;
137 } 161 }
138 mark_inode_dirty(inode);
139
140 return 0;
141} 162}
142EXPORT_SYMBOL(inode_setattr); 163EXPORT_SYMBOL(setattr_copy);
143 164
144int notify_change(struct dentry * dentry, struct iattr * attr) 165int notify_change(struct dentry * dentry, struct iattr * attr)
145{ 166{
@@ -207,13 +228,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
207 if (ia_valid & ATTR_SIZE) 228 if (ia_valid & ATTR_SIZE)
208 down_write(&dentry->d_inode->i_alloc_sem); 229 down_write(&dentry->d_inode->i_alloc_sem);
209 230
210 if (inode->i_op && inode->i_op->setattr) { 231 if (inode->i_op->setattr)
211 error = inode->i_op->setattr(dentry, attr); 232 error = inode->i_op->setattr(dentry, attr);
212 } else { 233 else
213 error = inode_change_ok(inode, attr); 234 error = simple_setattr(dentry, attr);
214 if (!error)
215 error = inode_setattr(inode, attr);
216 }
217 235
218 if (ia_valid & ATTR_SIZE) 236 if (ia_valid & ATTR_SIZE)
219 up_write(&dentry->d_inode->i_alloc_sem); 237 up_write(&dentry->d_inode->i_alloc_sem);
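Editor's note: the net effect of the fs/attr.c rework is a three-step ->setattr recipe: check, truncate, copy. A minimal sketch of a filesystem ->setattr under the new helpers (hypothetical example, not part of this diff; it mirrors what simple_setattr does for in-core filesystems):

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		/* permission and limit checks, now including ATTR_SIZE */
		error = inode_change_ok(inode, attr);
		if (error)
			return error;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			error = vmtruncate(inode, attr->ia_size);
			if (error)
				return error;
		}

		/* copies uid/gid/mode/times; deliberately does not dirty the inode */
		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}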
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..11b1ea786d00 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/param.h> 17#include <linux/param.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/compat.h>
19#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
20#include "autofs_i.h" 21#include "autofs_i.h"
21 22
@@ -25,12 +26,17 @@ static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
25static int autofs_root_unlink(struct inode *,struct dentry *); 26static int autofs_root_unlink(struct inode *,struct dentry *);
26static int autofs_root_rmdir(struct inode *,struct dentry *); 27static int autofs_root_rmdir(struct inode *,struct dentry *);
27static int autofs_root_mkdir(struct inode *,struct dentry *,int); 28static int autofs_root_mkdir(struct inode *,struct dentry *,int);
28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 29static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
30static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
29 31
30const struct file_operations autofs_root_operations = { 32const struct file_operations autofs_root_operations = {
33 .llseek = generic_file_llseek,
31 .read = generic_read_dir, 34 .read = generic_read_dir,
32 .readdir = autofs_root_readdir, 35 .readdir = autofs_root_readdir,
33 .ioctl = autofs_root_ioctl, 36 .unlocked_ioctl = autofs_root_ioctl,
37#ifdef CONFIG_COMPAT
38 .compat_ioctl = autofs_root_compat_ioctl,
39#endif
34}; 40};
35 41
36const struct inode_operations autofs_root_inode_operations = { 42const struct inode_operations autofs_root_inode_operations = {
@@ -491,6 +497,25 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
491} 497}
492 498
493/* Get/set timeout ioctl() operation */ 499/* Get/set timeout ioctl() operation */
500#ifdef CONFIG_COMPAT
501static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
502 unsigned int __user *p)
503{
504 unsigned long ntimeout;
505
506 if (get_user(ntimeout, p) ||
507 put_user(sbi->exp_timeout / HZ, p))
508 return -EFAULT;
509
510 if (ntimeout > UINT_MAX/HZ)
511 sbi->exp_timeout = 0;
512 else
513 sbi->exp_timeout = ntimeout * HZ;
514
515 return 0;
516}
517#endif
518
494static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, 519static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
495 unsigned long __user *p) 520 unsigned long __user *p)
496{ 521{
@@ -545,7 +570,7 @@ static inline int autofs_expire_run(struct super_block *sb,
 545 * ioctl()'s on the root directory are the chief method for the daemon to 570 * ioctl()'s on the root directory are the chief method for the daemon to
546 * generate kernel reactions 571 * generate kernel reactions
547 */ 572 */
548static int autofs_root_ioctl(struct inode *inode, struct file *filp, 573static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
549 unsigned int cmd, unsigned long arg) 574 unsigned int cmd, unsigned long arg)
550{ 575{
551 struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); 576 struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
@@ -570,6 +595,10 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
570 return 0; 595 return 0;
571 case AUTOFS_IOC_PROTOVER: /* Get protocol version */ 596 case AUTOFS_IOC_PROTOVER: /* Get protocol version */
572 return autofs_get_protover(argp); 597 return autofs_get_protover(argp);
598#ifdef CONFIG_COMPAT
599 case AUTOFS_IOC_SETTIMEOUT32:
600 return autofs_compat_get_set_timeout(sbi, argp);
601#endif
573 case AUTOFS_IOC_SETTIMEOUT: 602 case AUTOFS_IOC_SETTIMEOUT:
574 return autofs_get_set_timeout(sbi, argp); 603 return autofs_get_set_timeout(sbi, argp);
575 case AUTOFS_IOC_EXPIRE: 604 case AUTOFS_IOC_EXPIRE:
@@ -578,4 +607,37 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
578 default: 607 default:
579 return -ENOSYS; 608 return -ENOSYS;
580 } 609 }
610
611}
612
613static long autofs_root_ioctl(struct file *filp,
614 unsigned int cmd, unsigned long arg)
615{
616 int ret;
617
618 lock_kernel();
619 ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
620 filp, cmd, arg);
621 unlock_kernel();
622
623 return ret;
624}
625
626#ifdef CONFIG_COMPAT
627static long autofs_root_compat_ioctl(struct file *filp,
628 unsigned int cmd, unsigned long arg)
629{
630 struct inode *inode = filp->f_path.dentry->d_inode;
631 int ret;
632
633 lock_kernel();
634 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
635 ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
636 else
637 ret = autofs_do_root_ioctl(inode, filp, cmd,
638 (unsigned long)compat_ptr(arg));
639 unlock_kernel();
640
641 return ret;
581} 642}
643#endif
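Editor's note: the conversion above follows the standard .ioctl to .unlocked_ioctl recipe of this era: the inode argument disappears, so it is recovered from the file, and the BKL, previously taken implicitly by the VFS, is now taken explicitly around the old handler. A generic sketch of the pattern (names illustrative):

	static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
					   unsigned long arg)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;
		long ret;

		lock_kernel();		/* keep the old implicit-BKL semantics */
		ret = example_do_ioctl(inode, filp, cmd, arg);
		unlock_kernel();

		return ret;
	}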
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df862..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
95 */ 95 */
96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) 96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
97{ 97{
98 struct autofs_dev_ioctl tmp, *ads; 98 struct autofs_dev_ioctl tmp;
99 99
100 if (copy_from_user(&tmp, in, sizeof(tmp))) 100 if (copy_from_user(&tmp, in, sizeof(tmp)))
101 return ERR_PTR(-EFAULT); 101 return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
103 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
104 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
105 105
106 ads = kmalloc(tmp.size, GFP_KERNEL); 106 return memdup_user(in, tmp.size);
107 if (!ads)
108 return ERR_PTR(-ENOMEM);
109
110 if (copy_from_user(ads, in, tmp.size)) {
111 kfree(ads);
112 return ERR_PTR(-EFAULT);
113 }
114
115 return ads;
116} 107}
117 108
118static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) 109static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -736,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
736}; 727};
737 728
738static struct miscdevice _autofs_dev_ioctl_misc = { 729static struct miscdevice _autofs_dev_ioctl_misc = {
739 .minor = MISC_DYNAMIC_MINOR, 730 .minor = AUTOFS_MINOR,
740 .name = AUTOFS_DEVICE_NAME, 731 .name = AUTOFS_DEVICE_NAME,
741 .fops = &_dev_ioctl_fops 732 .fops = &_dev_ioctl_fops
742}; 733};
743 734
735MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
736MODULE_ALIAS("devname:autofs");
737
744/* Register/deregister misc character device */ 738/* Register/deregister misc character device */
745int autofs_dev_ioctl_init(void) 739int autofs_dev_ioctl_init(void)
746{ 740{
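Editor's note: memdup_user() collapses the open-coded kmalloc/copy_from_user/kfree sequence removed above, and it returns an ERR_PTR on failure, which matches this function's existing error convention. A minimal sketch of the idiom (uptr and size are illustrative):

	void *buf = memdup_user(uptr, size); /* ERR_PTR(-ENOMEM/-EFAULT) on failure */
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	/* ... use buf ... */
	kfree(buf);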
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac950..cb1bd38dc08c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,17 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/param.h> 19#include <linux/param.h>
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/compat.h>
22#include <linux/smp_lock.h>
23
21#include "autofs_i.h" 24#include "autofs_i.h"
22 25
23static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
24static int autofs4_dir_unlink(struct inode *,struct dentry *); 27static int autofs4_dir_unlink(struct inode *,struct dentry *);
25static int autofs4_dir_rmdir(struct inode *,struct dentry *); 28static int autofs4_dir_rmdir(struct inode *,struct dentry *);
26static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 29static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
27static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 30static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
31static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
28static int autofs4_dir_open(struct inode *inode, struct file *file); 32static int autofs4_dir_open(struct inode *inode, struct file *file);
29static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 33static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
30static void *autofs4_follow_link(struct dentry *, struct nameidata *); 34static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +42,10 @@ const struct file_operations autofs4_root_operations = {
38 .read = generic_read_dir, 42 .read = generic_read_dir,
39 .readdir = dcache_readdir, 43 .readdir = dcache_readdir,
40 .llseek = dcache_dir_lseek, 44 .llseek = dcache_dir_lseek,
41 .ioctl = autofs4_root_ioctl, 45 .unlocked_ioctl = autofs4_root_ioctl,
46#ifdef CONFIG_COMPAT
47 .compat_ioctl = autofs4_root_compat_ioctl,
48#endif
42}; 49};
43 50
44const struct file_operations autofs4_dir_operations = { 51const struct file_operations autofs4_dir_operations = {
@@ -197,8 +204,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
197 } 204 }
198 205
199 /* Initialize expiry counter after successful mount */ 206 /* Initialize expiry counter after successful mount */
200 if (ino) 207 ino->last_used = jiffies;
201 ino->last_used = jiffies;
202 208
203 spin_lock(&sbi->fs_lock); 209 spin_lock(&sbi->fs_lock);
204 ino->flags &= ~AUTOFS_INF_PENDING; 210 ino->flags &= ~AUTOFS_INF_PENDING;
@@ -839,6 +845,26 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
839} 845}
840 846
841/* Get/set timeout ioctl() operation */ 847/* Get/set timeout ioctl() operation */
848#ifdef CONFIG_COMPAT
849static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
850 compat_ulong_t __user *p)
851{
852 int rv;
853 unsigned long ntimeout;
854
855 if ((rv = get_user(ntimeout, p)) ||
856 (rv = put_user(sbi->exp_timeout/HZ, p)))
857 return rv;
858
859 if (ntimeout > UINT_MAX/HZ)
860 sbi->exp_timeout = 0;
861 else
862 sbi->exp_timeout = ntimeout * HZ;
863
864 return 0;
865}
866#endif
867
842static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, 868static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
843 unsigned long __user *p) 869 unsigned long __user *p)
844{ 870{
@@ -902,8 +928,8 @@ int is_autofs4_dentry(struct dentry *dentry)
 902 * ioctl()'s on the root directory are the chief method for the daemon to 928 * ioctl()'s on the root directory are the chief method for the daemon to
903 * generate kernel reactions 929 * generate kernel reactions
904 */ 930 */
905static int autofs4_root_ioctl(struct inode *inode, struct file *filp, 931static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
906 unsigned int cmd, unsigned long arg) 932 unsigned int cmd, unsigned long arg)
907{ 933{
908 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 934 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
909 void __user *p = (void __user *)arg; 935 void __user *p = (void __user *)arg;
@@ -932,6 +958,10 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
932 return autofs4_get_protosubver(sbi, p); 958 return autofs4_get_protosubver(sbi, p);
933 case AUTOFS_IOC_SETTIMEOUT: 959 case AUTOFS_IOC_SETTIMEOUT:
934 return autofs4_get_set_timeout(sbi, p); 960 return autofs4_get_set_timeout(sbi, p);
961#ifdef CONFIG_COMPAT
962 case AUTOFS_IOC_SETTIMEOUT32:
963 return autofs4_compat_get_set_timeout(sbi, p);
964#endif
935 965
936 case AUTOFS_IOC_ASKUMOUNT: 966 case AUTOFS_IOC_ASKUMOUNT:
937 return autofs4_ask_umount(filp->f_path.mnt, p); 967 return autofs4_ask_umount(filp->f_path.mnt, p);
@@ -947,3 +977,35 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
947 return -ENOSYS; 977 return -ENOSYS;
948 } 978 }
949} 979}
980
981static long autofs4_root_ioctl(struct file *filp,
982 unsigned int cmd, unsigned long arg)
983{
984 long ret;
985 struct inode *inode = filp->f_dentry->d_inode;
986
987 lock_kernel();
988 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
989 unlock_kernel();
990
991 return ret;
992}
993
994#ifdef CONFIG_COMPAT
995static long autofs4_root_compat_ioctl(struct file *filp,
996 unsigned int cmd, unsigned long arg)
997{
998 struct inode *inode = filp->f_path.dentry->d_inode;
999 int ret;
1000
1001 lock_kernel();
1002 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
1003 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
1004 else
1005 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
1006 (unsigned long)compat_ptr(arg));
1007 unlock_kernel();
1008
1009 return ret;
1010}
1011#endif
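Editor's note on why the compat handler above rewrites arg for most commands: a 32-bit process passes pointer arguments as 32-bit values, and compat_ptr() widens them correctly before the native handler dereferences them, while integer-carrying commands (AUTOFS_IOC_READY/AUTOFS_IOC_FAIL carry a wait-queue token) must pass through untouched. A condensed sketch of the dispatch (EXAMPLE_IOC_TOKEN is a hypothetical integer-argument command):

	#ifdef CONFIG_COMPAT
	static long example_compat_ioctl(struct file *filp, unsigned int cmd,
					 unsigned long arg)
	{
		/* pointer-taking commands need the 32->64 bit fixup */
		if (cmd != EXAMPLE_IOC_TOKEN)
			arg = (unsigned long)compat_ptr(arg);
		return example_unlocked_ioctl(filp, cmd, arg);
	}
	#endif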
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..f024d8aaddef 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -55,12 +55,6 @@ static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
55 return POLLERR; 55 return POLLERR;
56} 56}
57 57
58static int bad_file_ioctl (struct inode *inode, struct file *filp,
59 unsigned int cmd, unsigned long arg)
60{
61 return -EIO;
62}
63
64static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, 58static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
65 unsigned long arg) 59 unsigned long arg)
66{ 60{
@@ -93,8 +87,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
93 return -EIO; 87 return -EIO;
94} 88}
95 89
96static int bad_file_fsync(struct file *file, struct dentry *dentry, 90static int bad_file_fsync(struct file *file, int datasync)
97 int datasync)
98{ 91{
99 return -EIO; 92 return -EIO;
100} 93}
@@ -160,7 +153,6 @@ static const struct file_operations bad_file_ops =
160 .aio_write = bad_file_aio_write, 153 .aio_write = bad_file_aio_write,
161 .readdir = bad_file_readdir, 154 .readdir = bad_file_readdir,
162 .poll = bad_file_poll, 155 .poll = bad_file_poll,
163 .ioctl = bad_file_ioctl,
164 .unlocked_ioctl = bad_file_unlocked_ioctl, 156 .unlocked_ioctl = bad_file_unlocked_ioctl,
165 .compat_ioctl = bad_file_compat_ioctl, 157 .compat_ioctl = bad_file_compat_ioctl,
166 .mmap = bad_file_mmap, 158 .mmap = bad_file_mmap,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 34ddda888e63..dc39d2824885 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -436,7 +436,7 @@ befs_init_inodecache(void)
436 init_once); 436 init_once);
437 if (befs_inode_cachep == NULL) { 437 if (befs_inode_cachep == NULL) {
438 printk(KERN_ERR "befs_init_inodecache: " 438 printk(KERN_ERR "befs_init_inodecache: "
439 "Couldn't initalize inode slabcache\n"); 439 "Couldn't initialize inode slabcache\n");
440 return -ENOMEM; 440 return -ENOMEM;
441 } 441 }
442 442
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 7109e451abf7..f7f87e233dd9 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -17,7 +17,6 @@ struct bfs_sb_info {
17 unsigned long si_lf_eblk; 17 unsigned long si_lf_eblk;
18 unsigned long si_lasti; 18 unsigned long si_lasti;
19 unsigned long *si_imap; 19 unsigned long *si_imap;
20 struct buffer_head *si_sbh; /* buffer header w/superblock */
21 struct mutex bfs_lock; 20 struct mutex bfs_lock;
22}; 21};
23 22
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
78const struct file_operations bfs_dir_operations = { 78const struct file_operations bfs_dir_operations = {
79 .read = generic_read_dir, 79 .read = generic_read_dir,
80 .readdir = bfs_readdir, 80 .readdir = bfs_readdir,
81 .fsync = simple_fsync, 81 .fsync = generic_file_fsync,
82 .llseek = generic_file_llseek, 82 .llseek = generic_file_llseek,
83}; 83};
84 84
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
105 } 105 }
106 set_bit(ino, info->si_imap); 106 set_bit(ino, info->si_imap);
107 info->si_freei--; 107 info->si_freei--;
108 inode->i_uid = current_fsuid(); 108 inode_init_owner(inode, dir, mode);
109 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
110 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 109 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
111 inode->i_blocks = 0; 110 inode->i_blocks = 0;
112 inode->i_op = &bfs_file_inops; 111 inode->i_op = &bfs_file_inops;
113 inode->i_fop = &bfs_file_operations; 112 inode->i_fop = &bfs_file_operations;
114 inode->i_mapping->a_ops = &bfs_aops; 113 inode->i_mapping->a_ops = &bfs_aops;
115 inode->i_mode = mode;
116 inode->i_ino = ino; 114 inode->i_ino = ino;
117 BFS_I(inode)->i_dsk_ino = ino; 115 BFS_I(inode)->i_dsk_ino = ino;
118 BFS_I(inode)->i_sblock = 0; 116 BFS_I(inode)->i_sblock = 0;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 88b9a3ff44e4..eb67edd0f8ea 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -70,7 +70,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
70 struct super_block *sb = inode->i_sb; 70 struct super_block *sb = inode->i_sb;
71 struct bfs_sb_info *info = BFS_SB(sb); 71 struct bfs_sb_info *info = BFS_SB(sb);
72 struct bfs_inode_info *bi = BFS_I(inode); 72 struct bfs_inode_info *bi = BFS_I(inode);
73 struct buffer_head *sbh = info->si_sbh;
74 73
75 phys = bi->i_sblock + block; 74 phys = bi->i_sblock + block;
76 if (!create) { 75 if (!create) {
@@ -112,7 +111,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
112 info->si_freeb -= phys - bi->i_eblock; 111 info->si_freeb -= phys - bi->i_eblock;
113 info->si_lf_eblk = bi->i_eblock = phys; 112 info->si_lf_eblk = bi->i_eblock = phys;
114 mark_inode_dirty(inode); 113 mark_inode_dirty(inode);
115 mark_buffer_dirty(sbh);
116 err = 0; 114 err = 0;
117 goto out; 115 goto out;
118 } 116 }
@@ -147,7 +145,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
147 */ 145 */
148 info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks; 146 info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks;
149 mark_inode_dirty(inode); 147 mark_inode_dirty(inode);
150 mark_buffer_dirty(sbh);
151 map_bh(bh_result, sb, phys); 148 map_bh(bh_result, sb, phys);
152out: 149out:
153 mutex_unlock(&info->bfs_lock); 150 mutex_unlock(&info->bfs_lock);
@@ -168,9 +165,17 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
168 loff_t pos, unsigned len, unsigned flags, 165 loff_t pos, unsigned len, unsigned flags,
169 struct page **pagep, void **fsdata) 166 struct page **pagep, void **fsdata)
170{ 167{
171 *pagep = NULL; 168 int ret;
172 return block_write_begin(file, mapping, pos, len, flags, 169
173 pagep, fsdata, bfs_get_block); 170 ret = block_write_begin(mapping, pos, len, flags, pagep,
171 bfs_get_block);
172 if (unlikely(ret)) {
173 loff_t isize = mapping->host->i_size;
174 if (pos + len > isize)
175 vmtruncate(mapping->host, isize);
176 }
177
178 return ret;
174} 179}
175 180
176static sector_t bfs_bmap(struct address_space *mapping, sector_t block) 181static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f22a7d3dc362..c4daf0f5fc02 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -31,7 +31,6 @@ MODULE_LICENSE("GPL");
31#define dprintf(x...) 31#define dprintf(x...)
32#endif 32#endif
33 33
34static void bfs_write_super(struct super_block *s);
35void dump_imap(const char *prefix, struct super_block *s); 34void dump_imap(const char *prefix, struct super_block *s);
36 35
37struct inode *bfs_iget(struct super_block *sb, unsigned long ino) 36struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -99,6 +98,24 @@ error:
99 return ERR_PTR(-EIO); 98 return ERR_PTR(-EIO);
100} 99}
101 100
101static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p)
102{
103 if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) {
104 printf("Bad inode number %s:%08x\n", sb->s_id, ino);
105 return ERR_PTR(-EIO);
106 }
107
108 ino -= BFS_ROOT_INO;
109
110 *p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK);
111 if (!*p) {
112 printf("Unable to read inode %s:%08x\n", sb->s_id, ino);
113 return ERR_PTR(-EIO);
114 }
115
116 return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK;
117}
118
102static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) 119static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
103{ 120{
104 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 121 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
@@ -106,28 +123,15 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
106 unsigned long i_sblock; 123 unsigned long i_sblock;
107 struct bfs_inode *di; 124 struct bfs_inode *di;
108 struct buffer_head *bh; 125 struct buffer_head *bh;
109 int block, off;
110 int err = 0; 126 int err = 0;
111 127
112 dprintf("ino=%08x\n", ino); 128 dprintf("ino=%08x\n", ino);
113 129
114 if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { 130 di = find_inode(inode->i_sb, ino, &bh);
115 printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino); 131 if (IS_ERR(di))
116 return -EIO; 132 return PTR_ERR(di);
117 }
118 133
119 mutex_lock(&info->bfs_lock); 134 mutex_lock(&info->bfs_lock);
120 block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
121 bh = sb_bread(inode->i_sb, block);
122 if (!bh) {
123 printf("Unable to read inode %s:%08x\n",
124 inode->i_sb->s_id, ino);
125 mutex_unlock(&info->bfs_lock);
126 return -EIO;
127 }
128
129 off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
130 di = (struct bfs_inode *)bh->b_data + off;
131 135
132 if (ino == BFS_ROOT_INO) 136 if (ino == BFS_ROOT_INO)
133 di->i_vtype = cpu_to_le32(BFS_VDIR); 137 di->i_vtype = cpu_to_le32(BFS_VDIR);
@@ -158,12 +162,11 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
158 return err; 162 return err;
159} 163}
160 164
161static void bfs_delete_inode(struct inode *inode) 165static void bfs_evict_inode(struct inode *inode)
162{ 166{
163 unsigned long ino = inode->i_ino; 167 unsigned long ino = inode->i_ino;
164 struct bfs_inode *di; 168 struct bfs_inode *di;
165 struct buffer_head *bh; 169 struct buffer_head *bh;
166 int block, off;
167 struct super_block *s = inode->i_sb; 170 struct super_block *s = inode->i_sb;
168 struct bfs_sb_info *info = BFS_SB(s); 171 struct bfs_sb_info *info = BFS_SB(s);
169 struct bfs_inode_info *bi = BFS_I(inode); 172 struct bfs_inode_info *bi = BFS_I(inode);
@@ -171,28 +174,19 @@ static void bfs_delete_inode(struct inode *inode)
171 dprintf("ino=%08lx\n", ino); 174 dprintf("ino=%08lx\n", ino);
172 175
173 truncate_inode_pages(&inode->i_data, 0); 176 truncate_inode_pages(&inode->i_data, 0);
177 invalidate_inode_buffers(inode);
178 end_writeback(inode);
174 179
175 if ((ino < BFS_ROOT_INO) || (ino > info->si_lasti)) { 180 if (inode->i_nlink)
176 printf("invalid ino=%08lx\n", ino);
177 return; 181 return;
178 }
179
180 inode->i_size = 0;
181 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
182 mutex_lock(&info->bfs_lock);
183 mark_inode_dirty(inode);
184 182
185 block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; 183 di = find_inode(s, inode->i_ino, &bh);
186 bh = sb_bread(s, block); 184 if (IS_ERR(di))
187 if (!bh) {
188 printf("Unable to read inode %s:%08lx\n",
189 inode->i_sb->s_id, ino);
190 mutex_unlock(&info->bfs_lock);
191 return; 185 return;
192 } 186
193 off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; 187 mutex_lock(&info->bfs_lock);
194 di = (struct bfs_inode *)bh->b_data + off; 188 /* clear on-disk inode */
195 memset((void *)di, 0, sizeof(struct bfs_inode)); 189 memset(di, 0, sizeof(struct bfs_inode));
196 mark_buffer_dirty(bh); 190 mark_buffer_dirty(bh);
197 brelse(bh); 191 brelse(bh);
198 192
@@ -209,32 +203,9 @@ static void bfs_delete_inode(struct inode *inode)
209 * "last block of the last file" even if there is no 203 * "last block of the last file" even if there is no
210 * real file there, saves us 1 gap. 204 * real file there, saves us 1 gap.
211 */ 205 */
212 if (info->si_lf_eblk == bi->i_eblock) { 206 if (info->si_lf_eblk == bi->i_eblock)
213 info->si_lf_eblk = bi->i_sblock - 1; 207 info->si_lf_eblk = bi->i_sblock - 1;
214 mark_buffer_dirty(info->si_sbh);
215 }
216 mutex_unlock(&info->bfs_lock); 208 mutex_unlock(&info->bfs_lock);
217 clear_inode(inode);
218}
219
220static int bfs_sync_fs(struct super_block *sb, int wait)
221{
222 struct bfs_sb_info *info = BFS_SB(sb);
223
224 mutex_lock(&info->bfs_lock);
225 mark_buffer_dirty(info->si_sbh);
226 sb->s_dirt = 0;
227 mutex_unlock(&info->bfs_lock);
228
229 return 0;
230}
231
232static void bfs_write_super(struct super_block *sb)
233{
234 if (!(sb->s_flags & MS_RDONLY))
235 bfs_sync_fs(sb, 1);
236 else
237 sb->s_dirt = 0;
238} 209}
239 210
240static void bfs_put_super(struct super_block *s) 211static void bfs_put_super(struct super_block *s)
@@ -246,10 +217,6 @@ static void bfs_put_super(struct super_block *s)
246 217
247 lock_kernel(); 218 lock_kernel();
248 219
249 if (s->s_dirt)
250 bfs_write_super(s);
251
252 brelse(info->si_sbh);
253 mutex_destroy(&info->bfs_lock); 220 mutex_destroy(&info->bfs_lock);
254 kfree(info->si_imap); 221 kfree(info->si_imap);
255 kfree(info); 222 kfree(info);
@@ -319,10 +286,8 @@ static const struct super_operations bfs_sops = {
319 .alloc_inode = bfs_alloc_inode, 286 .alloc_inode = bfs_alloc_inode,
320 .destroy_inode = bfs_destroy_inode, 287 .destroy_inode = bfs_destroy_inode,
321 .write_inode = bfs_write_inode, 288 .write_inode = bfs_write_inode,
322 .delete_inode = bfs_delete_inode, 289 .evict_inode = bfs_evict_inode,
323 .put_super = bfs_put_super, 290 .put_super = bfs_put_super,
324 .write_super = bfs_write_super,
325 .sync_fs = bfs_sync_fs,
326 .statfs = bfs_statfs, 291 .statfs = bfs_statfs,
327}; 292};
328 293
@@ -349,7 +314,7 @@ void dump_imap(const char *prefix, struct super_block *s)
349 314
350static int bfs_fill_super(struct super_block *s, void *data, int silent) 315static int bfs_fill_super(struct super_block *s, void *data, int silent)
351{ 316{
352 struct buffer_head *bh; 317 struct buffer_head *bh, *sbh;
353 struct bfs_super_block *bfs_sb; 318 struct bfs_super_block *bfs_sb;
354 struct inode *inode; 319 struct inode *inode;
355 unsigned i, imap_len; 320 unsigned i, imap_len;
@@ -365,10 +330,10 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
365 330
366 sb_set_blocksize(s, BFS_BSIZE); 331 sb_set_blocksize(s, BFS_BSIZE);
367 332
368 info->si_sbh = sb_bread(s, 0); 333 sbh = sb_bread(s, 0);
369 if (!info->si_sbh) 334 if (!sbh)
370 goto out; 335 goto out;
371 bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data; 336 bfs_sb = (struct bfs_super_block *)sbh->b_data;
372 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { 337 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
373 if (!silent) 338 if (!silent)
374 printf("No BFS filesystem on %s (magic=%08x)\n", 339 printf("No BFS filesystem on %s (magic=%08x)\n",
@@ -472,10 +437,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
472 info->si_lf_eblk = eblock; 437 info->si_lf_eblk = eblock;
473 } 438 }
474 brelse(bh); 439 brelse(bh);
475 if (!(s->s_flags & MS_RDONLY)) { 440 brelse(sbh);
476 mark_buffer_dirty(info->si_sbh);
477 s->s_dirt = 1;
478 }
479 dump_imap("read_super", s); 441 dump_imap("read_super", s);
480 return 0; 442 return 0;
481 443
@@ -485,7 +447,7 @@ out3:
485out2: 447out2:
486 kfree(info->si_imap); 448 kfree(info->si_imap);
487out1: 449out1:
488 brelse(info->si_sbh); 450 brelse(sbh);
489out: 451out:
490 mutex_destroy(&info->bfs_lock); 452 mutex_destroy(&info->bfs_lock);
491 kfree(info); 453 kfree(info);
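Editor's note: the bfs changes above are one instance of the tree-wide delete_inode/clear_inode to evict_inode conversion in this merge. The single callback now serves both the final-unlink and plain-eviction paths, distinguishes them via i_nlink, and must call end_writeback() itself. A generic skeleton of the pattern (illustrative):

	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);
		invalidate_inode_buffers(inode);
		end_writeback(inode);	/* previously done for us by the VFS */

		if (inode->i_nlink)
			return;		/* plain eviction: nothing on disk to free */

		/* ... old delete_inode body: free on-disk inode and blocks ... */
	}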
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11a..a6395bdb26ae 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
134 if (!dump_write(file, dump_start, dump_size)) 134 if (!dump_write(file, dump_start, dump_size))
135 goto end_coredump; 135 goto end_coredump;
136 } 136 }
137/* Finally dump the task struct. Not be used by gdb, but could be useful */
138 set_fs(KERNEL_DS);
139 if (!dump_write(file, current, sizeof(*current)))
140 goto end_coredump;
141end_coredump: 137end_coredump:
142 set_fs(fs); 138 set_fs(fs);
143 return has_dumped; 139 return has_dumped;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c5f9a0e5d72..63039ed9576f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
990 990
991 /* clear any space allocated but not loaded */ 991 /* clear any space allocated but not loaded */
992 if (phdr->p_filesz < phdr->p_memsz) { 992 if (phdr->p_filesz < phdr->p_memsz) {
993 ret = clear_user((void *) (seg->addr + phdr->p_filesz), 993 if (clear_user((void *) (seg->addr + phdr->p_filesz),
994 phdr->p_memsz - phdr->p_filesz); 994 phdr->p_memsz - phdr->p_filesz))
995 if (ret) 995 return -EFAULT;
996 return ret;
997 } 996 }
998 997
999 if (mm) { 998 if (mm) {
@@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1027 struct elf32_fdpic_loadseg *seg; 1026 struct elf32_fdpic_loadseg *seg;
1028 struct elf32_phdr *phdr; 1027 struct elf32_phdr *phdr;
1029 unsigned long load_addr, delta_vaddr; 1028 unsigned long load_addr, delta_vaddr;
1030 int loop, dvset, ret; 1029 int loop, dvset;
1031 1030
1032 load_addr = params->load_addr; 1031 load_addr = params->load_addr;
1033 delta_vaddr = 0; 1032 delta_vaddr = 0;
@@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1127 * PT_LOAD */ 1126 * PT_LOAD */
1128 if (prot & PROT_WRITE && disp > 0) { 1127 if (prot & PROT_WRITE && disp > 0) {
1129 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); 1128 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
1130 ret = clear_user((void __user *) maddr, disp); 1129 if (clear_user((void __user *) maddr, disp))
1131 if (ret) 1130 return -EFAULT;
1132 return ret;
1133 maddr += disp; 1131 maddr += disp;
1134 } 1132 }
1135 1133
@@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1164 if (prot & PROT_WRITE && excess1 > 0) { 1162 if (prot & PROT_WRITE && excess1 > 0) {
1165 kdebug("clear[%d] ad=%lx sz=%lx", 1163 kdebug("clear[%d] ad=%lx sz=%lx",
1166 loop, maddr + phdr->p_filesz, excess1); 1164 loop, maddr + phdr->p_filesz, excess1);
1167 ret = clear_user((void __user *) maddr + phdr->p_filesz, 1165 if (clear_user((void __user *) maddr + phdr->p_filesz,
1168 excess1); 1166 excess1))
1169 if (ret) 1167 return -EFAULT;
1170 return ret;
1171 } 1168 }
1172 1169
1173#else 1170#else
1174 if (excess > 0) { 1171 if (excess > 0) {
1175 kdebug("clear[%d] ad=%lx sz=%lx", 1172 kdebug("clear[%d] ad=%lx sz=%lx",
1176 loop, maddr + phdr->p_filesz, excess); 1173 loop, maddr + phdr->p_filesz, excess);
1177 ret = clear_user((void *) maddr + phdr->p_filesz, excess); 1174 if (clear_user((void *) maddr + phdr->p_filesz, excess))
1178 if (ret) 1175 return -EFAULT;
1179 return ret;
1180 } 1176 }
1181#endif 1177#endif
1182 1178
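Editor's note: the binfmt_elf_fdpic fixes above all correct the same bug. clear_user() returns the number of bytes it failed to clear, not a -errno, so propagating its return value handed callers a positive byte count. The corrected idiom (uaddr/len illustrative):

	if (clear_user(uaddr, len))	/* non-zero means bytes left uncleared */
		return -EFAULT;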
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 49566c1687d8..811384bec8de 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,16 +56,19 @@
56#endif 56#endif
57 57
58/* 58/*
59 * User data (stack, data section and bss) needs to be aligned 59 * User data (data section and bss) needs to be aligned.
60 * for the same reasons as SLAB memory is, and to the same amount. 60 * We pick 0x20 here because it is the max value elf2flt has always
61 * Avoid duplicating architecture specific code by using the same 61 * used in producing FLAT files, and because it seems to be large
62 * macro as with SLAB allocation: 62 * enough to make all the gcc alignment related tests happy.
63 */ 63 */
64#ifdef ARCH_SLAB_MINALIGN 64#define FLAT_DATA_ALIGN (0x20)
65#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN) 65
66#else 66/*
67#define FLAT_DATA_ALIGN (sizeof(void *)) 67 * User data (stack) also needs to be aligned.
68#endif 68 * Here we can be a bit looser than the data sections since this
69 * needs to only meet arch ABI requirements.
70 */
71#define FLAT_STACK_ALIGN max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN)
69 72
70#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ 73#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */
71#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ 74#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */
@@ -129,7 +132,7 @@ static unsigned long create_flat_tables(
129 132
130 sp = (unsigned long *)p; 133 sp = (unsigned long *)p;
131 sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); 134 sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
132 sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN); 135 sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
133 argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); 136 argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
134 envp = argv + (argc + 1); 137 envp = argv + (argc + 1);
135 138
@@ -589,7 +592,7 @@ static int load_flat_file(struct linux_binprm * bprm,
589 if (IS_ERR_VALUE(result)) { 592 if (IS_ERR_VALUE(result)) {
590 printk("Unable to read data+bss, errno %d\n", (int)-result); 593 printk("Unable to read data+bss, errno %d\n", (int)-result);
591 do_munmap(current->mm, textpos, text_len); 594 do_munmap(current->mm, textpos, text_len);
592 do_munmap(current->mm, realdatastart, data_len + extra); 595 do_munmap(current->mm, realdatastart, len);
593 ret = result; 596 ret = result;
594 goto err; 597 goto err;
595 } 598 }
@@ -876,7 +879,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
876 stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ 879 stack_len = TOP_OF_ARGS - bprm->p; /* the strings */
877 stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ 880 stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
878 stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ 881 stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
879 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ 882 stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */
880 883
881 res = load_flat_file(bprm, &libinfo, 0, &stack_len); 884 res = load_flat_file(bprm, &libinfo, 0, &stack_len);
882 if (IS_ERR_VALUE(res)) 885 if (IS_ERR_VALUE(res))
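Editor's note: both rounding sites above use the usual align-down idiom, which relies on the alignment being a power of two. A worked example with an assumed alignment of 32:

	unsigned long sp = 0x1003f;
	sp &= -32UL;	/* same as sp & ~31UL -> 0x10020, aligned down to 32 bytes */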
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index c4e83537ead7..fd0cc0bf9a40 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
108 Node *fmt; 108 Node *fmt;
109 struct file * interp_file = NULL; 109 struct file * interp_file = NULL;
110 char iname[BINPRM_BUF_SIZE]; 110 char iname[BINPRM_BUF_SIZE];
111 char *iname_addr = iname; 111 const char *iname_addr = iname;
112 int retval; 112 int retval;
113 int fd_binary = -1; 113 int fd_binary = -1;
114 114
@@ -502,8 +502,9 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
502 return inode; 502 return inode;
503} 503}
504 504
505static void bm_clear_inode(struct inode *inode) 505static void bm_evict_inode(struct inode *inode)
506{ 506{
507 end_writeback(inode);
507 kfree(inode->i_private); 508 kfree(inode->i_private);
508} 509}
509 510
@@ -685,7 +686,7 @@ static const struct file_operations bm_status_operations = {
685 686
686static const struct super_operations s_ops = { 687static const struct super_operations s_ops = {
687 .statfs = simple_statfs, 688 .statfs = simple_statfs,
688 .clear_inode = bm_clear_inode, 689 .evict_inode = bm_evict_inode,
689}; 690};
690 691
691static int bm_fill_super(struct super_block * sb, void * data, int silent) 692static int bm_fill_super(struct super_block * sb, void * data, int silent)
@@ -723,7 +724,7 @@ static int __init init_misc_binfmt(void)
723{ 724{
724 int err = register_filesystem(&bm_fs_type); 725 int err = register_filesystem(&bm_fs_type);
725 if (!err) { 726 if (!err) {
726 err = register_binfmt(&misc_format); 727 err = insert_binfmt(&misc_format);
727 if (err) 728 if (err)
728 unregister_filesystem(&bm_fs_type); 729 unregister_filesystem(&bm_fs_type);
729 } 730 }
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index aca9d55afb22..396a9884591f 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -16,7 +16,8 @@
16 16
17static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) 17static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
18{ 18{
19 char *cp, *i_name, *i_arg; 19 const char *i_arg, *i_name;
20 char *cp;
20 struct file *file; 21 struct file *file;
21 char interp[BINPRM_BUF_SIZE]; 22 char interp[BINPRM_BUF_SIZE];
22 int retval; 23 int retval;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 612a5c38d3c1..4d0ff5ee27b8 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio)
413 413
414 /* Allocate kernel buffer for protection data */ 414 /* Allocate kernel buffer for protection data */
415 len = sectors * blk_integrity_tuple_size(bi); 415 len = sectors * blk_integrity_tuple_size(bi);
416 buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); 416 buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
417 if (unlikely(buf == NULL)) { 417 if (unlikely(buf == NULL)) {
418 printk(KERN_ERR "could not allocate integrity buffer\n"); 418 printk(KERN_ERR "could not allocate integrity buffer\n");
419 return -EIO; 419 return -ENOMEM;
420 } 420 }
421 421
422 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 422 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
diff --git a/fs/bio.c b/fs/bio.c
index e7bf6ca64dcf..8abb2dfb2e7c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -843,7 +843,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
843 if (!bio) 843 if (!bio)
844 goto out_bmd; 844 goto out_bmd;
845 845
846 bio->bi_rw |= (!write_to_vm << BIO_RW); 846 if (!write_to_vm)
847 bio->bi_rw |= REQ_WRITE;
847 848
848 ret = 0; 849 ret = 0;
849 850
@@ -1024,7 +1025,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
1024 * set data direction, and check if mapped pages need bouncing 1025 * set data direction, and check if mapped pages need bouncing
1025 */ 1026 */
1026 if (!write_to_vm) 1027 if (!write_to_vm)
1027 bio->bi_rw |= (1 << BIO_RW); 1028 bio->bi_rw |= REQ_WRITE;
1028 1029
1029 bio->bi_bdev = bdev; 1030 bio->bi_bdev = bdev;
1030 bio->bi_flags |= (1 << BIO_USER_MAPPED); 1031 bio->bi_flags |= (1 << BIO_USER_MAPPED);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..50e8c8582faa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,8 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
172 struct file *file = iocb->ki_filp; 172 struct file *file = iocb->ki_filp;
173 struct inode *inode = file->f_mapping->host; 173 struct inode *inode = file->f_mapping->host;
174 174
175 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 175 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
176 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 nr_segs, blkdev_get_blocks, NULL, NULL, 0);
177} 177}
178 178
179int __sync_blockdev(struct block_device *bdev, int wait) 179int __sync_blockdev(struct block_device *bdev, int wait)
@@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
245 sb = get_active_super(bdev); 245 sb = get_active_super(bdev);
246 if (!sb) 246 if (!sb)
247 goto out; 247 goto out;
248 if (sb->s_flags & MS_RDONLY) { 248 error = freeze_super(sb);
249 sb->s_frozen = SB_FREEZE_TRANS; 249 if (error) {
250 up_write(&sb->s_umount); 250 deactivate_super(sb);
251 bdev->bd_fsfreeze_count--;
251 mutex_unlock(&bdev->bd_fsfreeze_mutex); 252 mutex_unlock(&bdev->bd_fsfreeze_mutex);
252 return sb; 253 return ERR_PTR(error);
253 }
254
255 sb->s_frozen = SB_FREEZE_WRITE;
256 smp_wmb();
257
258 sync_filesystem(sb);
259
260 sb->s_frozen = SB_FREEZE_TRANS;
261 smp_wmb();
262
263 sync_blockdev(sb->s_bdev);
264
265 if (sb->s_op->freeze_fs) {
266 error = sb->s_op->freeze_fs(sb);
267 if (error) {
268 printk(KERN_ERR
269 "VFS:Filesystem freeze failed\n");
270 sb->s_frozen = SB_UNFROZEN;
271 deactivate_locked_super(sb);
272 bdev->bd_fsfreeze_count--;
273 mutex_unlock(&bdev->bd_fsfreeze_mutex);
274 return ERR_PTR(error);
275 }
276 } 254 }
277 up_write(&sb->s_umount); 255 deactivate_super(sb);
278
279 out: 256 out:
280 sync_blockdev(bdev); 257 sync_blockdev(bdev);
281 mutex_unlock(&bdev->bd_fsfreeze_mutex); 258 mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
296 273
297 mutex_lock(&bdev->bd_fsfreeze_mutex); 274 mutex_lock(&bdev->bd_fsfreeze_mutex);
298 if (!bdev->bd_fsfreeze_count) 275 if (!bdev->bd_fsfreeze_count)
299 goto out_unlock; 276 goto out;
300 277
301 error = 0; 278 error = 0;
302 if (--bdev->bd_fsfreeze_count > 0) 279 if (--bdev->bd_fsfreeze_count > 0)
303 goto out_unlock; 280 goto out;
304 281
305 if (!sb) 282 if (!sb)
306 goto out_unlock; 283 goto out;
307
308 BUG_ON(sb->s_bdev != bdev);
309 down_write(&sb->s_umount);
310 if (sb->s_flags & MS_RDONLY)
311 goto out_unfrozen;
312
313 if (sb->s_op->unfreeze_fs) {
314 error = sb->s_op->unfreeze_fs(sb);
315 if (error) {
316 printk(KERN_ERR
317 "VFS:Filesystem thaw failed\n");
318 sb->s_frozen = SB_FREEZE_TRANS;
319 bdev->bd_fsfreeze_count++;
320 mutex_unlock(&bdev->bd_fsfreeze_mutex);
321 return error;
322 }
323 }
324
325out_unfrozen:
326 sb->s_frozen = SB_UNFROZEN;
327 smp_wmb();
328 wake_up(&sb->s_wait_unfrozen);
329 284
330 if (sb) 285 error = thaw_super(sb);
331 deactivate_locked_super(sb); 286 if (error) {
332out_unlock: 287 bdev->bd_fsfreeze_count++;
288 mutex_unlock(&bdev->bd_fsfreeze_mutex);
289 return error;
290 }
291out:
333 mutex_unlock(&bdev->bd_fsfreeze_mutex); 292 mutex_unlock(&bdev->bd_fsfreeze_mutex);
334 return 0; 293 return 0;
335} 294}
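Editor's note: after this change freeze_bdev()/thaw_bdev() are thin wrappers that delegate the s_frozen state machine to freeze_super()/thaw_super() instead of driving it by hand. Caller-side usage is unchanged; a minimal sketch (illustrative, e.g. a snapshot path; do_snapshot is hypothetical):

	static int example_snapshot(struct block_device *bdev)
	{
		struct super_block *sb;
		int err;

		sb = freeze_bdev(bdev);	/* quiesce writes; NULL if nothing mounted */
		if (IS_ERR(sb))
			return PTR_ERR(sb);

		err = do_snapshot(bdev);	/* hypothetical device-level copy */

		thaw_bdev(bdev, sb);	/* resumes writes; handles sb == NULL */
		return err;
	}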
@@ -349,9 +308,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
349 loff_t pos, unsigned len, unsigned flags, 308 loff_t pos, unsigned len, unsigned flags,
350 struct page **pagep, void **fsdata) 309 struct page **pagep, void **fsdata)
351{ 310{
352 *pagep = NULL; 311 return block_write_begin(mapping, pos, len, flags, pagep,
353 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 312 blkdev_get_block);
354 blkdev_get_block);
355} 313}
356 314
357static int blkdev_write_end(struct file *file, struct address_space *mapping, 315static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -399,12 +357,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
399 return retval; 357 return retval;
400} 358}
401 359
402/* 360int blkdev_fsync(struct file *filp, int datasync)
403 * Filp is never NULL; the only case when ->fsync() is called with
404 * NULL first argument is nfsd_sync_dir() and that's not a directory.
405 */
406
407int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
408{ 361{
409 struct inode *bd_inode = filp->f_mapping->host; 362 struct inode *bd_inode = filp->f_mapping->host;
410 struct block_device *bdev = I_BDEV(bd_inode); 363 struct block_device *bdev = I_BDEV(bd_inode);
@@ -417,7 +370,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
417 */ 370 */
418 mutex_unlock(&bd_inode->i_mutex); 371 mutex_unlock(&bd_inode->i_mutex);
419 372
420 error = blkdev_issue_flush(bdev, NULL); 373 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
421 if (error == -EOPNOTSUPP) 374 if (error == -EOPNOTSUPP)
422 error = 0; 375 error = 0;
423 376
@@ -473,10 +426,13 @@ static inline void __bd_forget(struct inode *inode)
473 inode->i_mapping = &inode->i_data; 426 inode->i_mapping = &inode->i_data;
474} 427}
475 428
476static void bdev_clear_inode(struct inode *inode) 429static void bdev_evict_inode(struct inode *inode)
477{ 430{
478 struct block_device *bdev = &BDEV_I(inode)->bdev; 431 struct block_device *bdev = &BDEV_I(inode)->bdev;
479 struct list_head *p; 432 struct list_head *p;
433 truncate_inode_pages(&inode->i_data, 0);
434 invalidate_inode_buffers(inode); /* is it needed here? */
435 end_writeback(inode);
480 spin_lock(&bdev_lock); 436 spin_lock(&bdev_lock);
481 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { 437 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
482 __bd_forget(list_entry(p, struct inode, i_devices)); 438 __bd_forget(list_entry(p, struct inode, i_devices));
@@ -490,7 +446,7 @@ static const struct super_operations bdev_sops = {
490 .alloc_inode = bdev_alloc_inode, 446 .alloc_inode = bdev_alloc_inode,
491 .destroy_inode = bdev_destroy_inode, 447 .destroy_inode = bdev_destroy_inode,
492 .drop_inode = generic_delete_inode, 448 .drop_inode = generic_delete_inode,
493 .clear_inode = bdev_clear_inode, 449 .evict_inode = bdev_evict_inode,
494}; 450};
495 451
496static int bd_get_sb(struct file_system_type *fs_type, 452static int bd_get_sb(struct file_system_type *fs_type,
@@ -668,41 +624,233 @@ void bd_forget(struct inode *inode)
668 iput(bdev->bd_inode); 624 iput(bdev->bd_inode);
669} 625}
670 626
671int bd_claim(struct block_device *bdev, void *holder) 627/**
628 * bd_may_claim - test whether a block device can be claimed
629 * @bdev: block device of interest
630 * @whole: whole block device containing @bdev, may equal @bdev
631 * @holder: holder trying to claim @bdev
632 *
 633 * Test whether @bdev can be claimed by @holder.
634 *
635 * CONTEXT:
636 * spin_lock(&bdev_lock).
637 *
638 * RETURNS:
639 * %true if @bdev can be claimed, %false otherwise.
640 */
641static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
642 void *holder)
672{ 643{
673 int res;
674 spin_lock(&bdev_lock);
675
676 /* first decide result */
677 if (bdev->bd_holder == holder) 644 if (bdev->bd_holder == holder)
678 res = 0; /* already a holder */ 645 return true; /* already a holder */
679 else if (bdev->bd_holder != NULL) 646 else if (bdev->bd_holder != NULL)
680 res = -EBUSY; /* held by someone else */ 647 return false; /* held by someone else */
681 else if (bdev->bd_contains == bdev) 648 else if (bdev->bd_contains == bdev)
682 res = 0; /* is a whole device which isn't held */ 649 return true; /* is a whole device which isn't held */
683 650
684 else if (bdev->bd_contains->bd_holder == bd_claim) 651 else if (whole->bd_holder == bd_claim)
685 res = 0; /* is a partition of a device that is being partitioned */ 652 return true; /* is a partition of a device that is being partitioned */
686 else if (bdev->bd_contains->bd_holder != NULL) 653 else if (whole->bd_holder != NULL)
687 res = -EBUSY; /* is a partition of a held device */ 654 return false; /* is a partition of a held device */
688 else 655 else
689 res = 0; /* is a partition of an un-held device */ 656 return true; /* is a partition of an un-held device */
657}
690 658
691 /* now impose change */ 659/**
692 if (res==0) { 660 * bd_prepare_to_claim - prepare to claim a block device
693 /* note that for a whole device bd_holders 661 * @bdev: block device of interest
694 * will be incremented twice, and bd_holder will 662 * @whole: the whole device containing @bdev, may equal @bdev
695 * be set to bd_claim before being set to holder 663 * @holder: holder trying to claim @bdev
696 */ 664 *
697 bdev->bd_contains->bd_holders ++; 665 * Prepare to claim @bdev. This function fails if @bdev is already
698 bdev->bd_contains->bd_holder = bd_claim; 666 * claimed by another holder and waits if another claiming is in
699 bdev->bd_holders++; 667 * progress. This function doesn't actually claim. On successful
700 bdev->bd_holder = holder; 668 * return, the caller has ownership of bd_claiming and bd_holder[s].
669 *
670 * CONTEXT:
671 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
672 * it multiple times.
673 *
674 * RETURNS:
675 * 0 if @bdev can be claimed, -EBUSY otherwise.
676 */
677static int bd_prepare_to_claim(struct block_device *bdev,
678 struct block_device *whole, void *holder)
679{
680retry:
681 /* if someone else claimed, fail */
682 if (!bd_may_claim(bdev, whole, holder))
683 return -EBUSY;
684
685 /* if claiming is already in progress, wait for it to finish */
686 if (whole->bd_claiming) {
687 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
688 DEFINE_WAIT(wait);
689
690 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
691 spin_unlock(&bdev_lock);
692 schedule();
693 finish_wait(wq, &wait);
694 spin_lock(&bdev_lock);
695 goto retry;
696 }
697
698 /* yay, all mine */
699 return 0;
700}
701
702/**
703 * bd_start_claiming - start claiming a block device
704 * @bdev: block device of interest
705 * @holder: holder trying to claim @bdev
706 *
707 * @bdev is about to be opened exclusively. Check @bdev can be opened
708 * exclusively and mark that an exclusive open is in progress. Each
709 * successful call to this function must be matched with a call to
710 * either bd_finish_claiming() or bd_abort_claiming() (which do not
711 * fail).
712 *
713 * This function is used to gain exclusive access to the block device
714 * without actually causing other exclusive open attempts to fail. It
715 * should be used when the open sequence itself requires exclusive
716 * access but may subsequently fail.
717 *
718 * CONTEXT:
719 * Might sleep.
720 *
721 * RETURNS:
722 * Pointer to the block device containing @bdev on success, ERR_PTR()
723 * value on failure.
724 */
725static struct block_device *bd_start_claiming(struct block_device *bdev,
726 void *holder)
727{
728 struct gendisk *disk;
729 struct block_device *whole;
730 int partno, err;
731
732 might_sleep();
733
734 /*
735 * @bdev might not have been initialized properly yet, look up
736 * and grab the outer block device the hard way.
737 */
738 disk = get_gendisk(bdev->bd_dev, &partno);
739 if (!disk)
740 return ERR_PTR(-ENXIO);
741
742 whole = bdget_disk(disk, 0);
743 module_put(disk->fops->owner);
744 put_disk(disk);
745 if (!whole)
746 return ERR_PTR(-ENOMEM);
747
748 /* prepare to claim, if successful, mark claiming in progress */
749 spin_lock(&bdev_lock);
750
751 err = bd_prepare_to_claim(bdev, whole, holder);
752 if (err == 0) {
753 whole->bd_claiming = holder;
754 spin_unlock(&bdev_lock);
755 return whole;
756 } else {
757 spin_unlock(&bdev_lock);
758 bdput(whole);
759 return ERR_PTR(err);
701 } 760 }
761}
762
763/* releases bdev_lock */
764static void __bd_abort_claiming(struct block_device *whole, void *holder)
765{
766 BUG_ON(whole->bd_claiming != holder);
767 whole->bd_claiming = NULL;
768 wake_up_bit(&whole->bd_claiming, 0);
769
702 spin_unlock(&bdev_lock); 770 spin_unlock(&bdev_lock);
703 return res; 771 bdput(whole);
772}
773
774/**
775 * bd_abort_claiming - abort claiming a block device
776 * @whole: whole block device returned by bd_start_claiming()
777 * @holder: holder trying to claim @bdev
778 *
 779 * Abort a block device claiming started by bd_start_claiming(). Note that
780 * @whole is not the block device to be claimed but the whole device
781 * returned by bd_start_claiming().
782 *
783 * CONTEXT:
784 * Grabs and releases bdev_lock.
785 */
786static void bd_abort_claiming(struct block_device *whole, void *holder)
787{
788 spin_lock(&bdev_lock);
789 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
790}
791
792/* increment holders when we have a legitimate claim. requires bdev_lock */
793static void __bd_claim(struct block_device *bdev, struct block_device *whole,
794 void *holder)
795{
796 /* note that for a whole device bd_holders
797 * will be incremented twice, and bd_holder will
798 * be set to bd_claim before being set to holder
799 */
800 whole->bd_holders++;
801 whole->bd_holder = bd_claim;
802 bdev->bd_holders++;
803 bdev->bd_holder = holder;
704} 804}
705 805
806/**
807 * bd_finish_claiming - finish claiming a block device
808 * @bdev: block device of interest (passed to bd_start_claiming())
809 * @whole: whole block device returned by bd_start_claiming()
810 * @holder: holder trying to claim @bdev
811 *
 812 * Finish a block device claiming started by bd_start_claiming().
813 *
814 * CONTEXT:
815 * Grabs and releases bdev_lock.
816 */
817static void bd_finish_claiming(struct block_device *bdev,
818 struct block_device *whole, void *holder)
819{
820 spin_lock(&bdev_lock);
821 BUG_ON(!bd_may_claim(bdev, whole, holder));
822 __bd_claim(bdev, whole, holder);
823 __bd_abort_claiming(whole, holder); /* not actually an abort */
824}
825
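The kernel-doc above prescribes the pairing. For orientation, a hedged
caller-side sketch of the three-step protocol (a hypothetical helper, not
part of this patch; blkdev_open() below follows the same shape):

static int claim_and_open(struct block_device *bdev, fmode_t mode,
			  void *holder)
{
	struct block_device *whole;
	int err;

	/* step 1: may sleep; returns the whole device or ERR_PTR() */
	whole = bd_start_claiming(bdev, holder);
	if (IS_ERR(whole))
		return PTR_ERR(whole);

	/* step 2: the open sequence that may still fail */
	err = blkdev_get(bdev, mode);

	/* step 3: exactly one of finish or abort; neither can fail */
	if (err)
		bd_abort_claiming(whole, holder);
	else
		bd_finish_claiming(bdev, whole, holder);
	return err;
}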
826/**
827 * bd_claim - claim a block device
828 * @bdev: block device to claim
829 * @holder: holder trying to claim @bdev
830 *
831 * Try to claim @bdev which must have been opened successfully.
832 *
833 * CONTEXT:
834 * Might sleep.
835 *
836 * RETURNS:
837 * 0 if successful, -EBUSY if @bdev is already claimed.
838 */
839int bd_claim(struct block_device *bdev, void *holder)
840{
841 struct block_device *whole = bdev->bd_contains;
842 int res;
843
844 might_sleep();
845
846 spin_lock(&bdev_lock);
847 res = bd_prepare_to_claim(bdev, whole, holder);
848 if (res == 0)
849 __bd_claim(bdev, whole, holder);
850 spin_unlock(&bdev_lock);
851
852 return res;
853}
706EXPORT_SYMBOL(bd_claim); 854EXPORT_SYMBOL(bd_claim);
707 855
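For contrast, the legacy one-shot interface pairs bd_claim() with
bd_release(), shown next. Again a hypothetical sketch, not from this patch:

static int claim_open_bdev(struct block_device *bdev, void *holder)
{
	int err;

	/* may sleep; returns -EBUSY if someone else holds the claim */
	err = bd_claim(bdev, holder);
	if (err)
		return err;
	/* ... exclusive use of bdev ... */
	bd_release(bdev);
	return 0;
}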
708void bd_release(struct block_device *bdev) 856void bd_release(struct block_device *bdev)
@@ -1192,19 +1340,20 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1192 /* 1340 /*
1193 * hooks: /n/, see "layering violations". 1341 * hooks: /n/, see "layering violations".
1194 */ 1342 */
1195 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1343 if (!for_part) {
1196 if (ret != 0) { 1344 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1197 bdput(bdev); 1345 if (ret != 0) {
1198 return ret; 1346 bdput(bdev);
1347 return ret;
1348 }
1199 } 1349 }
1200 1350
1201 lock_kernel();
1202 restart: 1351 restart:
1203 1352
1204 ret = -ENXIO; 1353 ret = -ENXIO;
1205 disk = get_gendisk(bdev->bd_dev, &partno); 1354 disk = get_gendisk(bdev->bd_dev, &partno);
1206 if (!disk) 1355 if (!disk)
1207 goto out_unlock_kernel; 1356 goto out;
1208 1357
1209 mutex_lock_nested(&bdev->bd_mutex, for_part); 1358 mutex_lock_nested(&bdev->bd_mutex, for_part);
1210 if (!bdev->bd_openers) { 1359 if (!bdev->bd_openers) {
@@ -1284,7 +1433,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1284 if (for_part) 1433 if (for_part)
1285 bdev->bd_part_count++; 1434 bdev->bd_part_count++;
1286 mutex_unlock(&bdev->bd_mutex); 1435 mutex_unlock(&bdev->bd_mutex);
1287 unlock_kernel();
1288 return 0; 1436 return 0;
1289 1437
1290 out_clear: 1438 out_clear:
@@ -1297,9 +1445,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1297 bdev->bd_contains = NULL; 1445 bdev->bd_contains = NULL;
1298 out_unlock_bdev: 1446 out_unlock_bdev:
1299 mutex_unlock(&bdev->bd_mutex); 1447 mutex_unlock(&bdev->bd_mutex);
1300 out_unlock_kernel: 1448 out:
1301 unlock_kernel();
1302
1303 if (disk) 1449 if (disk)
1304 module_put(disk->fops->owner); 1450 module_put(disk->fops->owner);
1305 put_disk(disk); 1451 put_disk(disk);
@@ -1316,6 +1462,7 @@ EXPORT_SYMBOL(blkdev_get);
1316 1462
1317static int blkdev_open(struct inode * inode, struct file * filp) 1463static int blkdev_open(struct inode * inode, struct file * filp)
1318{ 1464{
1465 struct block_device *whole = NULL;
1319 struct block_device *bdev; 1466 struct block_device *bdev;
1320 int res; 1467 int res;
1321 1468
@@ -1338,22 +1485,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1338 if (bdev == NULL) 1485 if (bdev == NULL)
1339 return -ENOMEM; 1486 return -ENOMEM;
1340 1487
1488 if (filp->f_mode & FMODE_EXCL) {
1489 whole = bd_start_claiming(bdev, filp);
1490 if (IS_ERR(whole)) {
1491 bdput(bdev);
1492 return PTR_ERR(whole);
1493 }
1494 }
1495
1341 filp->f_mapping = bdev->bd_inode->i_mapping; 1496 filp->f_mapping = bdev->bd_inode->i_mapping;
1342 1497
1343 res = blkdev_get(bdev, filp->f_mode); 1498 res = blkdev_get(bdev, filp->f_mode);
1344 if (res)
1345 return res;
1346 1499
1347 if (filp->f_mode & FMODE_EXCL) { 1500 if (whole) {
1348 res = bd_claim(bdev, filp); 1501 if (res == 0)
1349 if (res) 1502 bd_finish_claiming(bdev, whole, filp);
1350 goto out_blkdev_put; 1503 else
1504 bd_abort_claiming(whole, filp);
1351 } 1505 }
1352 1506
1353 return 0;
1354
1355 out_blkdev_put:
1356 blkdev_put(bdev, filp->f_mode);
1357 return res; 1507 return res;
1358} 1508}
1359 1509
@@ -1364,7 +1514,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1364 struct block_device *victim = NULL; 1514 struct block_device *victim = NULL;
1365 1515
1366 mutex_lock_nested(&bdev->bd_mutex, for_part); 1516 mutex_lock_nested(&bdev->bd_mutex, for_part);
1367 lock_kernel();
1368 if (for_part) 1517 if (for_part)
1369 bdev->bd_part_count--; 1518 bdev->bd_part_count--;
1370 1519
@@ -1389,7 +1538,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1389 victim = bdev->bd_contains; 1538 victim = bdev->bd_contains;
1390 bdev->bd_contains = NULL; 1539 bdev->bd_contains = NULL;
1391 } 1540 }
1392 unlock_kernel();
1393 mutex_unlock(&bdev->bd_mutex); 1541 mutex_unlock(&bdev->bd_mutex);
1394 bdput(bdev); 1542 bdput(bdev);
1395 if (victim) 1543 if (victim)
@@ -1564,27 +1712,34 @@ EXPORT_SYMBOL(lookup_bdev);
1564 */ 1712 */
1565struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1713struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1566{ 1714{
1567 struct block_device *bdev; 1715 struct block_device *bdev, *whole;
1568 int error = 0; 1716 int error;
1569 1717
1570 bdev = lookup_bdev(path); 1718 bdev = lookup_bdev(path);
1571 if (IS_ERR(bdev)) 1719 if (IS_ERR(bdev))
1572 return bdev; 1720 return bdev;
1573 1721
1722 whole = bd_start_claiming(bdev, holder);
1723 if (IS_ERR(whole)) {
1724 bdput(bdev);
1725 return whole;
1726 }
1727
1574 error = blkdev_get(bdev, mode); 1728 error = blkdev_get(bdev, mode);
1575 if (error) 1729 if (error)
1576 return ERR_PTR(error); 1730 goto out_abort_claiming;
1731
1577 error = -EACCES; 1732 error = -EACCES;
1578 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1733 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1579 goto blkdev_put; 1734 goto out_blkdev_put;
1580 error = bd_claim(bdev, holder);
1581 if (error)
1582 goto blkdev_put;
1583 1735
1736 bd_finish_claiming(bdev, whole, holder);
1584 return bdev; 1737 return bdev;
1585 1738
1586blkdev_put: 1739out_blkdev_put:
1587 blkdev_put(bdev, mode); 1740 blkdev_put(bdev, mode);
1741out_abort_claiming:
1742 bd_abort_claiming(whole, holder);
1588 return ERR_PTR(error); 1743 return ERR_PTR(error);
1589} 1744}
1590 1745
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ef7b26724ec..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
60 size = __btrfs_getxattr(inode, name, value, size); 60 size = __btrfs_getxattr(inode, name, value, size);
61 if (size > 0) { 61 if (size > 0) {
62 acl = posix_acl_from_xattr(value, size); 62 acl = posix_acl_from_xattr(value, size);
63 if (IS_ERR(acl))
64 return acl;
63 set_cached_acl(inode, type, acl); 65 set_cached_acl(inode, type, acl);
64 } 66 }
65 kfree(value); 67 kfree(value);
@@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
160 int ret; 162 int ret;
161 struct posix_acl *acl = NULL; 163 struct posix_acl *acl = NULL;
162 164
165 if (!is_owner_or_cap(dentry->d_inode))
166 return -EPERM;
167
168 if (!IS_POSIXACL(dentry->d_inode))
169 return -EOPNOTSUPP;
170
163 if (value) { 171 if (value) {
164 acl = posix_acl_from_xattr(value, size); 172 acl = posix_acl_from_xattr(value, size);
165 if (acl == NULL) { 173 if (acl == NULL) {
@@ -282,14 +290,14 @@ int btrfs_acl_chmod(struct inode *inode)
282 return ret; 290 return ret;
283} 291}
284 292
285struct xattr_handler btrfs_xattr_acl_default_handler = { 293const struct xattr_handler btrfs_xattr_acl_default_handler = {
286 .prefix = POSIX_ACL_XATTR_DEFAULT, 294 .prefix = POSIX_ACL_XATTR_DEFAULT,
287 .flags = ACL_TYPE_DEFAULT, 295 .flags = ACL_TYPE_DEFAULT,
288 .get = btrfs_xattr_acl_get, 296 .get = btrfs_xattr_acl_get,
289 .set = btrfs_xattr_acl_set, 297 .set = btrfs_xattr_acl_set,
290}; 298};
291 299
292struct xattr_handler btrfs_xattr_acl_access_handler = { 300const struct xattr_handler btrfs_xattr_acl_access_handler = {
293 .prefix = POSIX_ACL_XATTR_ACCESS, 301 .prefix = POSIX_ACL_XATTR_ACCESS,
294 .flags = ACL_TYPE_ACCESS, 302 .flags = ACL_TYPE_ACCESS,
295 .get = btrfs_xattr_acl_get, 303 .get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
377 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
378 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
379 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
380 goto again; 381 goto again;
381 } 382 }
382 383
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
156 /* 157 /*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
281 struct btrfs_root *root, 281 struct btrfs_root *root,
282 struct extent_buffer *buf, 282 struct extent_buffer *buf,
283 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
284{ 285{
285 u64 refs; 286 u64 refs;
286 u64 owner; 287 u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
366 BUG_ON(ret); 367 BUG_ON(ret);
367 } 368 }
368 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
369 } 371 }
370 return 0; 372 return 0;
371} 373}
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
392 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
393 struct extent_buffer *cow; 395 struct extent_buffer *cow;
394 int level; 396 int level;
397 int last_ref = 0;
395 int unlock_orig = 0; 398 int unlock_orig = 0;
396 u64 parent_start; 399 u64 parent_start;
397 400
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
442 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
443 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
444 447
445 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
446 452
447 if (buf == root->node) { 453 if (buf == root->node) {
448 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
457 extent_buffer_get(cow); 463 extent_buffer_get(cow);
458 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
459 465
460 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
461 parent_start, root->root_key.objectid, level); 467 last_ref);
462 free_extent_buffer(buf); 468 free_extent_buffer(buf);
463 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
464 } else { 470 } else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
473 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
474 trans->transid); 480 trans->transid);
475 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
476 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
477 parent_start, root->root_key.objectid, level); 483 last_ref);
478 } 484 }
479 if (unlock_orig) 485 if (unlock_orig)
480 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
949 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
950} 956}
951 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
952/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
953 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
954 * NULL is returned on error. 976 * NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1019 btrfs_tree_lock(child); 1041 btrfs_tree_lock(child);
1020 btrfs_set_lock_blocking(child); 1042 btrfs_set_lock_blocking(child);
1021 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1043 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1022 BUG_ON(ret); 1044 if (ret) {
1045 btrfs_tree_unlock(child);
1046 free_extent_buffer(child);
1047 goto enospc;
1048 }
1023 1049
1024 spin_lock(&root->node_lock); 1050 spin_lock(&root->node_lock);
1025 root->node = child; 1051 root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1034 btrfs_tree_unlock(mid); 1060 btrfs_tree_unlock(mid);
1035 /* once for the path */ 1061 /* once for the path */
1036 free_extent_buffer(mid); 1062 free_extent_buffer(mid);
1037 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1063
1038 0, root->root_key.objectid, level); 1064 root_sub_used(root, mid->len);
1065 btrfs_free_tree_block(trans, root, mid, 0, 1);
1039 /* once for the root ptr */ 1066 /* once for the root ptr */
1040 free_extent_buffer(mid); 1067 free_extent_buffer(mid);
1041 return ret; 1068 return 0;
1042 } 1069 }
1043 if (btrfs_header_nritems(mid) > 1070 if (btrfs_header_nritems(mid) >
1044 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1088 if (wret < 0 && wret != -ENOSPC) 1115 if (wret < 0 && wret != -ENOSPC)
1089 ret = wret; 1116 ret = wret;
1090 if (btrfs_header_nritems(right) == 0) { 1117 if (btrfs_header_nritems(right) == 0) {
1091 u64 bytenr = right->start;
1092 u32 blocksize = right->len;
1093
1094 clean_tree_block(trans, root, right); 1118 clean_tree_block(trans, root, right);
1095 btrfs_tree_unlock(right); 1119 btrfs_tree_unlock(right);
1096 free_extent_buffer(right);
1097 right = NULL;
1098 wret = del_ptr(trans, root, path, level + 1, pslot + 1120 wret = del_ptr(trans, root, path, level + 1, pslot +
1099 1); 1121 1);
1100 if (wret) 1122 if (wret)
1101 ret = wret; 1123 ret = wret;
1102 wret = btrfs_free_tree_block(trans, root, 1124 root_sub_used(root, right->len);
1103 bytenr, blocksize, 0, 1125 btrfs_free_tree_block(trans, root, right, 0, 1);
1104 root->root_key.objectid, 1126 free_extent_buffer(right);
1105 level); 1127 right = NULL;
1106 if (wret)
1107 ret = wret;
1108 } else { 1128 } else {
1109 struct btrfs_disk_key right_key; 1129 struct btrfs_disk_key right_key;
1110 btrfs_node_key(right, &right_key, 0); 1130 btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1136 BUG_ON(wret == 1); 1156 BUG_ON(wret == 1);
1137 } 1157 }
1138 if (btrfs_header_nritems(mid) == 0) { 1158 if (btrfs_header_nritems(mid) == 0) {
1139 /* we've managed to empty the middle node, drop it */
1140 u64 bytenr = mid->start;
1141 u32 blocksize = mid->len;
1142
1143 clean_tree_block(trans, root, mid); 1159 clean_tree_block(trans, root, mid);
1144 btrfs_tree_unlock(mid); 1160 btrfs_tree_unlock(mid);
1145 free_extent_buffer(mid);
1146 mid = NULL;
1147 wret = del_ptr(trans, root, path, level + 1, pslot); 1161 wret = del_ptr(trans, root, path, level + 1, pslot);
1148 if (wret) 1162 if (wret)
1149 ret = wret; 1163 ret = wret;
1150 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1164 root_sub_used(root, mid->len);
1151 0, root->root_key.objectid, level); 1165 btrfs_free_tree_block(trans, root, mid, 0, 1);
1152 if (wret) 1166 free_extent_buffer(mid);
1153 ret = wret; 1167 mid = NULL;
1154 } else { 1168 } else {
1155 /* update the parent key to reflect our changes */ 1169 /* update the parent key to reflect our changes */
1156 struct btrfs_disk_key mid_key; 1170 struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1590 btrfs_release_path(NULL, p); 1604 btrfs_release_path(NULL, p);
1591 1605
1592 ret = -EAGAIN; 1606 ret = -EAGAIN;
1593 tmp = read_tree_block(root, blocknr, blocksize, gen); 1607 tmp = read_tree_block(root, blocknr, blocksize, 0);
1594 if (tmp) { 1608 if (tmp) {
1595 /* 1609 /*
1596 * If the read above didn't mark this buffer up to date, 1610 * If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
1740 p->nodes[level + 1], 1754 p->nodes[level + 1],
1741 p->slots[level + 1], &b); 1755 p->slots[level + 1], &b);
1742 if (err) { 1756 if (err) {
1743 free_extent_buffer(b);
1744 ret = err; 1757 ret = err;
1745 goto done; 1758 goto done;
1746 } 1759 }
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2076 if (IS_ERR(c)) 2089 if (IS_ERR(c))
2077 return PTR_ERR(c); 2090 return PTR_ERR(c);
2078 2091
2092 root_add_used(root, root->nodesize);
2093
2079 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2094 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2080 btrfs_set_header_nritems(c, 1); 2095 btrfs_set_header_nritems(c, 1);
2081 btrfs_set_header_level(c, level); 2096 btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2134 int nritems; 2149 int nritems;
2135 2150
2136 BUG_ON(!path->nodes[level]); 2151 BUG_ON(!path->nodes[level]);
2152 btrfs_assert_tree_locked(path->nodes[level]);
2137 lower = path->nodes[level]; 2153 lower = path->nodes[level];
2138 nritems = btrfs_header_nritems(lower); 2154 nritems = btrfs_header_nritems(lower);
2139 BUG_ON(slot > nritems); 2155 BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2202 if (IS_ERR(split)) 2218 if (IS_ERR(split))
2203 return PTR_ERR(split); 2219 return PTR_ERR(split);
2204 2220
2221 root_add_used(root, root->nodesize);
2222
2205 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2223 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2206 btrfs_set_header_level(split, btrfs_header_level(c)); 2224 btrfs_set_header_level(split, btrfs_header_level(c));
2207 btrfs_set_header_bytenr(split, split->start); 2225 btrfs_set_header_bytenr(split, split->start);
@@ -2286,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2286 return ret; 2304 return ret;
2287} 2305}
2288 2306
2307/*
 2308 * min_slot controls the lowest index we're willing to push to the
 2309 * right. We'll push items at or above min_slot, but nothing lower
2310 */
2289static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 2311static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2290 struct btrfs_root *root, 2312 struct btrfs_root *root,
2291 struct btrfs_path *path, 2313 struct btrfs_path *path,
2292 int data_size, int empty, 2314 int data_size, int empty,
2293 struct extent_buffer *right, 2315 struct extent_buffer *right,
2294 int free_space, u32 left_nritems) 2316 int free_space, u32 left_nritems,
2317 u32 min_slot)
2295{ 2318{
2296 struct extent_buffer *left = path->nodes[0]; 2319 struct extent_buffer *left = path->nodes[0];
2297 struct extent_buffer *upper = path->nodes[1]; 2320 struct extent_buffer *upper = path->nodes[1];
@@ -2309,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2309 if (empty) 2332 if (empty)
2310 nr = 0; 2333 nr = 0;
2311 else 2334 else
2312 nr = 1; 2335 nr = max_t(u32, 1, min_slot);
2313 2336
2314 if (path->slots[0] >= left_nritems) 2337 if (path->slots[0] >= left_nritems)
2315 push_space += data_size; 2338 push_space += data_size;
@@ -2415,6 +2438,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2415 2438
2416 if (left_nritems) 2439 if (left_nritems)
2417 btrfs_mark_buffer_dirty(left); 2440 btrfs_mark_buffer_dirty(left);
2441 else
2442 clean_tree_block(trans, root, left);
2443
2418 btrfs_mark_buffer_dirty(right); 2444 btrfs_mark_buffer_dirty(right);
2419 2445
2420 btrfs_item_key(right, &disk_key, 0); 2446 btrfs_item_key(right, &disk_key, 0);
@@ -2448,10 +2474,14 @@ out_unlock:
2448 * 2474 *
2449 * returns 1 if the push failed because the other node didn't have enough 2475 * returns 1 if the push failed because the other node didn't have enough
2450 * room, 0 if everything worked out and < 0 if there were major errors. 2476 * room, 0 if everything worked out and < 0 if there were major errors.
2477 *
2478 * this will push starting from min_slot to the end of the leaf. It won't
2479 * push any slot lower than min_slot
2451 */ 2480 */
2452static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 2481static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2453 *root, struct btrfs_path *path, int data_size, 2482 *root, struct btrfs_path *path,
2454 int empty) 2483 int min_data_size, int data_size,
2484 int empty, u32 min_slot)
2455{ 2485{
2456 struct extent_buffer *left = path->nodes[0]; 2486 struct extent_buffer *left = path->nodes[0];
2457 struct extent_buffer *right; 2487 struct extent_buffer *right;
@@ -2493,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2493 if (left_nritems == 0) 2523 if (left_nritems == 0)
2494 goto out_unlock; 2524 goto out_unlock;
2495 2525
2496 return __push_leaf_right(trans, root, path, data_size, empty, 2526 return __push_leaf_right(trans, root, path, min_data_size, empty,
2497 right, free_space, left_nritems); 2527 right, free_space, left_nritems, min_slot);
2498out_unlock: 2528out_unlock:
2499 btrfs_tree_unlock(right); 2529 btrfs_tree_unlock(right);
2500 free_extent_buffer(right); 2530 free_extent_buffer(right);
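A hedged illustration of the new min_slot parameter; this mirrors the call
push_for_double_split() makes further down in this patch (data_size is
whatever amount of space the caller needs freed):

	/* push items at or after the current slot to the right neighbour;
	 * a min_data_size of 1 accepts any amount of freed space
	 */
	ret = push_leaf_right(trans, root, path, 1, data_size, 0,
			      path->slots[0]);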
@@ -2504,12 +2534,17 @@ out_unlock:
2504/* 2534/*
2505 * push some data in the path leaf to the left, trying to free up at 2535 * push some data in the path leaf to the left, trying to free up at
2506 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2536 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2537 *
2538 * max_slot can put a limit on how far into the leaf we'll push items. The
 2539 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
2540 * items
2507 */ 2541 */
2508static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 2542static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2509 struct btrfs_root *root, 2543 struct btrfs_root *root,
2510 struct btrfs_path *path, int data_size, 2544 struct btrfs_path *path, int data_size,
2511 int empty, struct extent_buffer *left, 2545 int empty, struct extent_buffer *left,
2512 int free_space, int right_nritems) 2546 int free_space, u32 right_nritems,
2547 u32 max_slot)
2513{ 2548{
2514 struct btrfs_disk_key disk_key; 2549 struct btrfs_disk_key disk_key;
2515 struct extent_buffer *right = path->nodes[0]; 2550 struct extent_buffer *right = path->nodes[0];
@@ -2528,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2528 slot = path->slots[1]; 2563 slot = path->slots[1];
2529 2564
2530 if (empty) 2565 if (empty)
2531 nr = right_nritems; 2566 nr = min(right_nritems, max_slot);
2532 else 2567 else
2533 nr = right_nritems - 1; 2568 nr = min(right_nritems - 1, max_slot);
2534 2569
2535 for (i = 0; i < nr; i++) { 2570 for (i = 0; i < nr; i++) {
2536 item = btrfs_item_nr(right, i); 2571 item = btrfs_item_nr(right, i);
@@ -2660,6 +2695,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2660 btrfs_mark_buffer_dirty(left); 2695 btrfs_mark_buffer_dirty(left);
2661 if (right_nritems) 2696 if (right_nritems)
2662 btrfs_mark_buffer_dirty(right); 2697 btrfs_mark_buffer_dirty(right);
2698 else
2699 clean_tree_block(trans, root, right);
2663 2700
2664 btrfs_item_key(right, &disk_key, 0); 2701 btrfs_item_key(right, &disk_key, 0);
2665 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2702 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2706,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2669 /* then fixup the leaf pointer in the path */ 2706 /* then fixup the leaf pointer in the path */
2670 if (path->slots[0] < push_items) { 2707 if (path->slots[0] < push_items) {
2671 path->slots[0] += old_left_nritems; 2708 path->slots[0] += old_left_nritems;
2672 if (btrfs_header_nritems(path->nodes[0]) == 0)
2673 clean_tree_block(trans, root, path->nodes[0]);
2674 btrfs_tree_unlock(path->nodes[0]); 2709 btrfs_tree_unlock(path->nodes[0]);
2675 free_extent_buffer(path->nodes[0]); 2710 free_extent_buffer(path->nodes[0]);
2676 path->nodes[0] = left; 2711 path->nodes[0] = left;
@@ -2691,10 +2726,14 @@ out:
2691/* 2726/*
2692 * push some data in the path leaf to the left, trying to free up at 2727 * push some data in the path leaf to the left, trying to free up at
2693 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2728 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2729 *
2730 * max_slot can put a limit on how far into the leaf we'll push items. The
2731 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
2732 * items
2694 */ 2733 */
2695static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2734static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2696 *root, struct btrfs_path *path, int data_size, 2735 *root, struct btrfs_path *path, int min_data_size,
2697 int empty) 2736 int data_size, int empty, u32 max_slot)
2698{ 2737{
2699 struct extent_buffer *right = path->nodes[0]; 2738 struct extent_buffer *right = path->nodes[0];
2700 struct extent_buffer *left; 2739 struct extent_buffer *left;
@@ -2740,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2740 goto out; 2779 goto out;
2741 } 2780 }
2742 2781
2743 return __push_leaf_left(trans, root, path, data_size, 2782 return __push_leaf_left(trans, root, path, min_data_size,
2744 empty, left, free_space, right_nritems); 2783 empty, left, free_space, right_nritems,
2784 max_slot);
2745out: 2785out:
2746 btrfs_tree_unlock(left); 2786 btrfs_tree_unlock(left);
2747 free_extent_buffer(left); 2787 free_extent_buffer(left);
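And the left-push counterpart, where a max_slot of (u32)-1 lifts the slot
limit entirely; this is exactly how btrfs_del_items() invokes it later in
this patch:

	/* consider every item in the leaf: empty == 1 and
	 * max_slot == (u32)-1 put no lower bound on what may move
	 */
	wret = push_leaf_left(trans, root, path, 1, 1, 1, (u32)-1);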
@@ -2834,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2834} 2874}
2835 2875
2836/* 2876/*
2877 * double splits happen when we need to insert a big item in the middle
2878 * of a leaf. A double split can leave us with 3 mostly empty leaves:
2879 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2880 * A B C
2881 *
2882 * We avoid this by trying to push the items on either side of our target
2883 * into the adjacent leaves. If all goes well we can avoid the double split
2884 * completely.
2885 */
2886static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2887 struct btrfs_root *root,
2888 struct btrfs_path *path,
2889 int data_size)
2890{
2891 int ret;
2892 int progress = 0;
2893 int slot;
2894 u32 nritems;
2895
2896 slot = path->slots[0];
2897
2898 /*
2899 * try to push all the items after our slot into the
2900 * right leaf
2901 */
2902 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2903 if (ret < 0)
2904 return ret;
2905
2906 if (ret == 0)
2907 progress++;
2908
2909 nritems = btrfs_header_nritems(path->nodes[0]);
2910 /*
2911 * our goal is to get our slot at the start or end of a leaf. If
 2912 * we've done so, we're done
2913 */
2914 if (path->slots[0] == 0 || path->slots[0] == nritems)
2915 return 0;
2916
2917 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2918 return 0;
2919
 2920 /* try to push all the items before our slot into the left leaf */
2921 slot = path->slots[0];
2922 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2923 if (ret < 0)
2924 return ret;
2925
2926 if (ret == 0)
2927 progress++;
2928
2929 if (progress)
2930 return 0;
2931 return 1;
2932}
2933
2934/*
2837 * split the path's leaf in two, making sure there is at least data_size 2935 * split the path's leaf in two, making sure there is at least data_size
2838 * available for the resulting leaf level of the path. 2936 * available for the resulting leaf level of the path.
2839 * 2937 *
@@ -2855,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2855 int wret; 2953 int wret;
2856 int split; 2954 int split;
2857 int num_doubles = 0; 2955 int num_doubles = 0;
2956 int tried_avoid_double = 0;
2858 2957
2859 l = path->nodes[0]; 2958 l = path->nodes[0];
2860 slot = path->slots[0]; 2959 slot = path->slots[0];
@@ -2863,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2863 return -EOVERFLOW; 2962 return -EOVERFLOW;
2864 2963
2865 /* first try to make some room by pushing left and right */ 2964 /* first try to make some room by pushing left and right */
2866 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2965 if (data_size) {
2867 wret = push_leaf_right(trans, root, path, data_size, 0); 2966 wret = push_leaf_right(trans, root, path, data_size,
2967 data_size, 0, 0);
2868 if (wret < 0) 2968 if (wret < 0)
2869 return wret; 2969 return wret;
2870 if (wret) { 2970 if (wret) {
2871 wret = push_leaf_left(trans, root, path, data_size, 0); 2971 wret = push_leaf_left(trans, root, path, data_size,
2972 data_size, 0, (u32)-1);
2872 if (wret < 0) 2973 if (wret < 0)
2873 return wret; 2974 return wret;
2874 } 2975 }
@@ -2902,6 +3003,8 @@ again:
2902 if (mid != nritems && 3003 if (mid != nritems &&
2903 leaf_space_used(l, mid, nritems - mid) + 3004 leaf_space_used(l, mid, nritems - mid) +
2904 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3005 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3006 if (data_size && !tried_avoid_double)
3007 goto push_for_double;
2905 split = 2; 3008 split = 2;
2906 } 3009 }
2907 } 3010 }
@@ -2918,6 +3021,8 @@ again:
2918 if (mid != nritems && 3021 if (mid != nritems &&
2919 leaf_space_used(l, mid, nritems - mid) + 3022 leaf_space_used(l, mid, nritems - mid) +
2920 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3023 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3024 if (data_size && !tried_avoid_double)
3025 goto push_for_double;
 2921 split = 2; 3026 split = 2;
2922 } 3027 }
2923 } 3028 }
@@ -2932,10 +3037,10 @@ again:
2932 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3037 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2933 root->root_key.objectid, 3038 root->root_key.objectid,
2934 &disk_key, 0, l->start, 0); 3039 &disk_key, 0, l->start, 0);
2935 if (IS_ERR(right)) { 3040 if (IS_ERR(right))
2936 BUG_ON(1);
2937 return PTR_ERR(right); 3041 return PTR_ERR(right);
2938 } 3042
3043 root_add_used(root, root->leafsize);
2939 3044
2940 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 3045 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2941 btrfs_set_header_bytenr(right, right->start); 3046 btrfs_set_header_bytenr(right, right->start);
@@ -2998,6 +3103,13 @@ again:
2998 } 3103 }
2999 3104
3000 return ret; 3105 return ret;
3106
3107push_for_double:
3108 push_for_double_split(trans, root, path, data_size);
3109 tried_avoid_double = 1;
3110 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3111 return 0;
3112 goto again;
3001} 3113}
3002 3114
3003static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3115static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3054,7 +3166,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3054 3166
3055 btrfs_set_path_blocking(path); 3167 btrfs_set_path_blocking(path);
3056 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3168 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3057 BUG_ON(ret); 3169 if (ret)
3170 goto err;
3058 3171
3059 path->keep_locks = 0; 3172 path->keep_locks = 0;
3060 btrfs_unlock_up_safe(path, 1); 3173 btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3909,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3796 */ 3909 */
3797 btrfs_unlock_up_safe(path, 0); 3910 btrfs_unlock_up_safe(path, 0);
3798 3911
3799 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3912 root_sub_used(root, leaf->len);
3800 0, root->root_key.objectid, 0); 3913
3801 return ret; 3914 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3915 return 0;
3802} 3916}
3803/* 3917/*
3804 * delete the item at the leaf level in path. If that empties 3918 * delete the item at the leaf level in path. If that empties
@@ -3865,6 +3979,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3865 if (leaf == root->node) { 3979 if (leaf == root->node) {
3866 btrfs_set_header_level(leaf, 0); 3980 btrfs_set_header_level(leaf, 0);
3867 } else { 3981 } else {
3982 btrfs_set_path_blocking(path);
3983 clean_tree_block(trans, root, leaf);
3868 ret = btrfs_del_leaf(trans, root, path, leaf); 3984 ret = btrfs_del_leaf(trans, root, path, leaf);
3869 BUG_ON(ret); 3985 BUG_ON(ret);
3870 } 3986 }
@@ -3890,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3890 extent_buffer_get(leaf); 4006 extent_buffer_get(leaf);
3891 4007
3892 btrfs_set_path_blocking(path); 4008 btrfs_set_path_blocking(path);
3893 wret = push_leaf_left(trans, root, path, 1, 1); 4009 wret = push_leaf_left(trans, root, path, 1, 1,
4010 1, (u32)-1);
3894 if (wret < 0 && wret != -ENOSPC) 4011 if (wret < 0 && wret != -ENOSPC)
3895 ret = wret; 4012 ret = wret;
3896 4013
3897 if (path->nodes[0] == leaf && 4014 if (path->nodes[0] == leaf &&
3898 btrfs_header_nritems(leaf)) { 4015 btrfs_header_nritems(leaf)) {
3899 wret = push_leaf_right(trans, root, path, 1, 1); 4016 wret = push_leaf_right(trans, root, path, 1,
4017 1, 1, 0);
3900 if (wret < 0 && wret != -ENOSPC) 4018 if (wret < 0 && wret != -ENOSPC)
3901 ret = wret; 4019 ret = wret;
3902 } 4020 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..eaf286abad17 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
34 34
35struct btrfs_trans_handle; 35struct btrfs_trans_handle;
36struct btrfs_transaction; 36struct btrfs_transaction;
37struct btrfs_pending_snapshot;
37extern struct kmem_cache *btrfs_trans_handle_cachep; 38extern struct kmem_cache *btrfs_trans_handle_cachep;
38extern struct kmem_cache *btrfs_transaction_cachep; 39extern struct kmem_cache *btrfs_transaction_cachep;
39extern struct kmem_cache *btrfs_bit_radix_cachep; 40extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
663#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 664#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
664#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 665#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
665#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 666#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
667#define BTRFS_NR_RAID_TYPES 5
666 668
667struct btrfs_block_group_item { 669struct btrfs_block_group_item {
668 __le64 used; 670 __le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
674 u64 flags; 676 u64 flags;
675 677
676 u64 total_bytes; /* total bytes in the space */ 678 u64 total_bytes; /* total bytes in the space */
677 u64 bytes_used; /* total bytes used on disk */ 679 u64 bytes_used; /* total bytes used,
 680 this doesn't take mirrors into account */
678 u64 bytes_pinned; /* total bytes pinned, will be freed when the 681 u64 bytes_pinned; /* total bytes pinned, will be freed when the
679 transaction finishes */ 682 transaction finishes */
680 u64 bytes_reserved; /* total bytes the allocator has reserved for 683 u64 bytes_reserved; /* total bytes the allocator has reserved for
681 current allocations */ 684 current allocations */
682 u64 bytes_readonly; /* total bytes that are read only */ 685 u64 bytes_readonly; /* total bytes that are read only */
683 u64 bytes_super; /* total bytes reserved for the super blocks */ 686
684 u64 bytes_root; /* the number of bytes needed to commit a
685 transaction */
686 u64 bytes_may_use; /* number of bytes that may be used for 687 u64 bytes_may_use; /* number of bytes that may be used for
687 delalloc/allocations */ 688 delalloc/allocations */
688 u64 bytes_delalloc; /* number of bytes currently reserved for 689 u64 disk_used; /* total bytes used on disk */
689 delayed allocation */
690 690
691 int full; /* indicates that we cannot allocate any more 691 int full; /* indicates that we cannot allocate any more
692 chunks for this space */ 692 chunks for this space */
693 int force_alloc; /* set if we need to force a chunk alloc for 693 int force_alloc; /* set if we need to force a chunk alloc for
694 this space */ 694 this space */
695 int force_delalloc; /* make people start doing filemap_flush until
696 we're under a threshold */
697 695
698 struct list_head list; 696 struct list_head list;
699 697
700 /* for controlling how we free up space for allocations */
701 wait_queue_head_t allocate_wait;
702 wait_queue_head_t flush_wait;
703 int allocating_chunk;
704 int flushing;
705
706 /* for block groups in our same type */ 698 /* for block groups in our same type */
707 struct list_head block_groups; 699 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
708 spinlock_t lock; 700 spinlock_t lock;
709 struct rw_semaphore groups_sem; 701 struct rw_semaphore groups_sem;
710 atomic_t caching_threads; 702 atomic_t caching_threads;
711}; 703};
712 704
705struct btrfs_block_rsv {
706 u64 size;
707 u64 reserved;
708 u64 freed[2];
709 struct btrfs_space_info *space_info;
710 struct list_head list;
711 spinlock_t lock;
712 atomic_t usage;
713 unsigned int priority:8;
714 unsigned int durable:1;
715 unsigned int refill_used:1;
716 unsigned int full:1;
717};
718
713/* 719/*
714 * free clusters are used to claim free space in relatively large chunks, 720 * free clusters are used to claim free space in relatively large chunks,
715 * allowing us to do less seeky writes. They are used for all metadata 721 * allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
760 spinlock_t lock; 766 spinlock_t lock;
761 u64 pinned; 767 u64 pinned;
762 u64 reserved; 768 u64 reserved;
769 u64 reserved_pinned;
763 u64 bytes_super; 770 u64 bytes_super;
764 u64 flags; 771 u64 flags;
765 u64 sectorsize; 772 u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
825 /* logical->physical extent mapping */ 832 /* logical->physical extent mapping */
826 struct btrfs_mapping_tree mapping_tree; 833 struct btrfs_mapping_tree mapping_tree;
827 834
835 /* block reservation for extent, checksum and root tree */
836 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv;
839 /* block reservation for metadata operations */
840 struct btrfs_block_rsv trans_block_rsv;
841 /* block reservation for chunk tree */
842 struct btrfs_block_rsv chunk_block_rsv;
843
844 struct btrfs_block_rsv empty_block_rsv;
845
846 /* list of block reservations that cross multiple transactions */
847 struct list_head durable_block_rsv_list;
848
849 struct mutex durable_block_rsv_mutex;
850
828 u64 generation; 851 u64 generation;
829 u64 last_trans_committed; 852 u64 last_trans_committed;
830 853
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
927 struct btrfs_workers endio_meta_write_workers; 950 struct btrfs_workers endio_meta_write_workers;
928 struct btrfs_workers endio_write_workers; 951 struct btrfs_workers endio_write_workers;
929 struct btrfs_workers submit_workers; 952 struct btrfs_workers submit_workers;
930 struct btrfs_workers enospc_workers;
931 /* 953 /*
932 * fixup workers take dirty pages that didn't properly go through 954 * fixup workers take dirty pages that didn't properly go through
933 * the cow mechanism and make them safe to write. It happens 955 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
943 int do_barriers; 965 int do_barriers;
944 int closing; 966 int closing;
945 int log_root_recovering; 967 int log_root_recovering;
968 int enospc_unlink;
946 969
947 u64 total_pinned; 970 u64 total_pinned;
948 971
@@ -1012,6 +1035,9 @@ struct btrfs_root {
1012 struct completion kobj_unregister; 1035 struct completion kobj_unregister;
1013 struct mutex objectid_mutex; 1036 struct mutex objectid_mutex;
1014 1037
1038 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv;
1040
1015 struct mutex log_mutex; 1041 struct mutex log_mutex;
1016 wait_queue_head_t log_writer_wait; 1042 wait_queue_head_t log_writer_wait;
1017 wait_queue_head_t log_commit_wait[2]; 1043 wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
1043 int ref_cows; 1069 int ref_cows;
1044 int track_dirty; 1070 int track_dirty;
1045 int in_radix; 1071 int in_radix;
1046 int clean_orphans;
1047 1072
1048 u64 defrag_trans_start; 1073 u64 defrag_trans_start;
1049 struct btrfs_key defrag_progress; 1074 struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
1057 1082
1058 struct list_head root_list; 1083 struct list_head root_list;
1059 1084
1060 spinlock_t list_lock; 1085 spinlock_t orphan_lock;
1061 struct list_head orphan_list; 1086 struct list_head orphan_list;
1087 struct btrfs_block_rsv *orphan_block_rsv;
1088 int orphan_item_inserted;
1089 int orphan_cleanup_state;
1062 1090
1063 spinlock_t inode_lock; 1091 spinlock_t inode_lock;
1064 /* red-black tree that keeps track of in-memory inodes */ 1092 /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1965int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root, unsigned long count); 1994 struct btrfs_root *root, unsigned long count);
1967int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1995int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1996int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, u64 bytenr,
1998 u64 num_bytes, u64 *refs, u64 *flags);
1968int btrfs_pin_extent(struct btrfs_root *root, 1999int btrfs_pin_extent(struct btrfs_root *root,
1969 u64 bytenr, u64 num, int reserved); 2000 u64 bytenr, u64 num, int reserved);
1970int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1984 u64 parent, u64 root_objectid, 2015 u64 parent, u64 root_objectid,
1985 struct btrfs_disk_key *key, int level, 2016 struct btrfs_disk_key *key, int level,
1986 u64 hint, u64 empty_size); 2017 u64 hint, u64 empty_size);
1987int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2018void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, 2019 struct btrfs_root *root,
1989 u64 bytenr, u32 blocksize, 2020 struct extent_buffer *buf,
1990 u64 parent, u64 root_objectid, int level); 2021 u64 parent, int last_ref);
1991struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2022struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root, 2023 struct btrfs_root *root,
1993 u64 bytenr, u32 blocksize, 2024 u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2041 u64 size); 2072 u64 size);
2042int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2043 struct btrfs_root *root, u64 group_start); 2074 struct btrfs_root *root, u64 group_start);
2044int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2045 struct btrfs_block_group_cache *group);
2046
2047u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 2048void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); 2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
2049void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2050 2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2051int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2052int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2053int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2081 struct btrfs_root *root,
2054 struct inode *inode, int num_items); 2082 int num_items, int *retries);
2055int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2056 struct inode *inode, int num_items); 2084 struct btrfs_root *root);
2057int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2058 u64 bytes); 2086 struct inode *inode);
2059void btrfs_free_reserved_data_space(struct btrfs_root *root, 2087void btrfs_orphan_release_metadata(struct inode *inode);
2060 struct inode *inode, u64 bytes); 2088int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2061void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 2089 struct btrfs_pending_snapshot *pending);
2062 u64 bytes); 2090int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2063void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2091void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2064 u64 bytes); 2092int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2093void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2094void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2095struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2096void btrfs_free_block_rsv(struct btrfs_root *root,
2097 struct btrfs_block_rsv *rsv);
2098void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2099 struct btrfs_block_rsv *rsv);
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv,
2107 u64 min_reserved, int min_factor);
2108int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2109 struct btrfs_block_rsv *dst_rsv,
2110 u64 num_bytes);
2111void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes);
2114int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache);
2065/* ctree.c */ 2118/* ctree.c */
2066int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2067 int level, int *slot); 2120 int level, int *slot);
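The prototypes above outline a reservation lifecycle: allocate, fill,
consume or release, free. A hedged sketch under those signatures (the
function name is illustrative; the real users live in extent-tree.c and
transaction.c elsewhere in this patch, not shown in this excerpt):

static int reserve_then_release(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, u64 num_bytes)
{
	struct btrfs_block_rsv *rsv;
	int retries = 0;
	int ret;

	rsv = btrfs_alloc_block_rsv(root);
	if (!rsv)
		return -ENOMEM;

	ret = btrfs_block_rsv_add(trans, root, rsv, num_bytes, &retries);
	if (!ret)
		/* reserved; a real caller would hand rsv to its consumer */
		btrfs_block_rsv_release(root, rsv, num_bytes);

	btrfs_free_block_rsv(root, rsv);
	return ret;
}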
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2152int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2205int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2206int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2154int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2207int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2155int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2208int btrfs_drop_snapshot(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv, int update_ref);
2156int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2210int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2157 struct btrfs_root *root, 2211 struct btrfs_root *root,
2158 struct extent_buffer *node, 2212 struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *root, 2299 struct btrfs_root *root,
2246 const char *name, int name_len, 2300 const char *name, int name_len,
2247 u64 inode_objectid, u64 ref_objectid, u64 *index); 2301 u64 inode_objectid, u64 ref_objectid, u64 *index);
2302struct btrfs_inode_ref *
2303btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *root,
2305 struct btrfs_path *path,
2306 const char *name, int name_len,
2307 u64 inode_objectid, u64 ref_objectid, int mod);
2248int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2308int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root, 2309 struct btrfs_root *root,
2250 struct btrfs_path *path, u64 objectid); 2310 struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2257 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2258int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2259 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2260int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2261 struct btrfs_root *root, 2323 struct btrfs_root *root,
2262 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2311 u32 min_type); 2373 u32 min_type);
2312 2374
2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state); 2378 struct extent_state **cached_state);
2316int btrfs_writepages(struct address_space *mapping, 2379int btrfs_writepages(struct address_space *mapping,
@@ -2326,13 +2389,13 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
2326 pgoff_t offset, pgoff_t last_index); 2389 pgoff_t offset, pgoff_t last_index);
2327int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2390int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2328int btrfs_readpage(struct file *file, struct page *page); 2391int btrfs_readpage(struct file *file, struct page *page);
2329void btrfs_delete_inode(struct inode *inode); 2392void btrfs_evict_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
-void btrfs_drop_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending,
+                               u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+                                struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                              u64 start, u64 num_bytes, u64 min_size,
+                              loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 
 /* file.c */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                             int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root);
 int btrfs_recover_relocation(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, struct extent_buffer *buf,
+                           struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+                              struct btrfs_pending_snapshot *pending,
+                              u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending);
 #endif
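The btrfs_sync_file() prototype change above tracks the mainline VFS change (circa 2.6.35) that dropped the dentry argument from ->fsync(); the inode is now reached through the struct file alone. A minimal sketch of a handler under the new signature — the demo_* helpers are illustrative, not btrfs code:

    /* sketch: ->fsync() after the dentry parameter was removed */
    static int demo_fsync(struct file *file, int datasync)
    {
        struct inode *inode = file->f_mapping->host;

        /* for a data-only sync, metadata that doesn't affect data
         * retrieval may be skipped (hypothetical helpers below) */
        if (datasync && !demo_inode_has_dirty_metadata(inode))
            return 0;
        return demo_sync_inode_metadata(inode);
    }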
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
 }
 
 /*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, u64 bytenr,
-                             u64 num_bytes, u64 *refs, u64 *flags)
-{
-    struct btrfs_delayed_ref_node *ref;
-    struct btrfs_delayed_ref_head *head;
-    struct btrfs_delayed_ref_root *delayed_refs;
-    struct btrfs_path *path;
-    struct btrfs_extent_item *ei;
-    struct extent_buffer *leaf;
-    struct btrfs_key key;
-    u32 item_size;
-    u64 num_refs;
-    u64 extent_flags;
-    int ret;
-
-    path = btrfs_alloc_path();
-    if (!path)
-        return -ENOMEM;
-
-    key.objectid = bytenr;
-    key.type = BTRFS_EXTENT_ITEM_KEY;
-    key.offset = num_bytes;
-    delayed_refs = &trans->transaction->delayed_refs;
-again:
-    ret = btrfs_search_slot(trans, root->fs_info->extent_root,
-                            &key, path, 0, 0);
-    if (ret < 0)
-        goto out;
-
-    if (ret == 0) {
-        leaf = path->nodes[0];
-        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-        if (item_size >= sizeof(*ei)) {
-            ei = btrfs_item_ptr(leaf, path->slots[0],
-                                struct btrfs_extent_item);
-            num_refs = btrfs_extent_refs(leaf, ei);
-            extent_flags = btrfs_extent_flags(leaf, ei);
-        } else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-            struct btrfs_extent_item_v0 *ei0;
-            BUG_ON(item_size != sizeof(*ei0));
-            ei0 = btrfs_item_ptr(leaf, path->slots[0],
-                                 struct btrfs_extent_item_v0);
-            num_refs = btrfs_extent_refs_v0(leaf, ei0);
-            /* FIXME: this isn't correct for data */
-            extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
-            BUG();
-#endif
-        }
-        BUG_ON(num_refs == 0);
-    } else {
-        num_refs = 0;
-        extent_flags = 0;
-        ret = 0;
-    }
-
-    spin_lock(&delayed_refs->lock);
-    ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
-    if (ref) {
-        head = btrfs_delayed_node_to_head(ref);
-        if (!mutex_trylock(&head->mutex)) {
-            atomic_inc(&ref->refs);
-            spin_unlock(&delayed_refs->lock);
-
-            btrfs_release_path(root->fs_info->extent_root, path);
-
-            mutex_lock(&head->mutex);
-            mutex_unlock(&head->mutex);
-            btrfs_put_delayed_ref(ref);
-            goto again;
-        }
-        if (head->extent_op && head->extent_op->update_flags)
-            extent_flags |= head->extent_op->flags_to_set;
-        else
-            BUG_ON(num_refs == 0);
-
-        num_refs += ref->ref_mod;
-        mutex_unlock(&head->mutex);
-    }
-    WARN_ON(num_refs == 0);
-    if (refs)
-        *refs = num_refs;
-    if (flags)
-        *flags = extent_flags;
-out:
-    spin_unlock(&delayed_refs->lock);
-    btrfs_free_path(path);
-    return ret;
-}
-
-/*
  * helper function to update an extent delayed ref in the
  * rbtree. existing and update must both have the same
  * bytenr and parent
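The mutex_lock()/mutex_unlock() pair in the removed lookup helper (and in its replacement added to extent-tree.c below) is a wait, not a critical section: when mutex_trylock() fails, the caller pins the ref head, drops the spinlock, blocks until the current holder finishes, and retries the whole lookup. A minimal sketch of the pattern, assuming only a struct mutex:

    #include <linux/mutex.h>

    /* block until the current holder of 'm' is done, without keeping it */
    static void demo_wait_for_holder(struct mutex *m)
    {
        mutex_lock(m);    /* sleeps until the holder releases the mutex */
        mutex_unlock(m);  /* release at once: we only needed to wait */
    }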
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, u64 bytenr,
-                             u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
                              u64 bytenr, u64 num_bytes, u64 orig_parent,
                              u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..64f10082f048 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
     int rw;
     int mirror_num;
     unsigned long bio_flags;
+    /*
+     * bio_offset is optional, can be used if the pages in the bio
+     * can't tell us where in the file the bio should go
+     */
+    u64 bio_offset;
     struct btrfs_work work;
 };
 
@@ -475,7 +480,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
     end_io_wq->work.func = end_workqueue_fn;
     end_io_wq->work.flags = 0;
 
-    if (bio->bi_rw & (1 << BIO_RW)) {
+    if (bio->bi_rw & REQ_WRITE) {
         if (end_io_wq->metadata)
             btrfs_queue_worker(&fs_info->endio_meta_write_workers,
                                &end_io_wq->work);
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
     async = container_of(work, struct async_submit_bio, work);
     fs_info = BTRFS_I(async->inode)->root->fs_info;
     async->submit_bio_start(async->inode, async->rw, async->bio,
-                            async->mirror_num, async->bio_flags);
+                            async->mirror_num, async->bio_flags,
+                            async->bio_offset);
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
     wake_up(&fs_info->async_submit_wait);
 
     async->submit_bio_done(async->inode, async->rw, async->bio,
-                           async->mirror_num, async->bio_flags);
+                           async->mirror_num, async->bio_flags,
+                           async->bio_offset);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                         int rw, struct bio *bio, int mirror_num,
                         unsigned long bio_flags,
+                        u64 bio_offset,
                         extent_submit_bio_hook_t *submit_bio_start,
                         extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -592,10 +600,11 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
     async->work.flags = 0;
     async->bio_flags = bio_flags;
+    async->bio_offset = bio_offset;
 
     atomic_inc(&fs_info->nr_async_submits);
 
-    if (rw & (1 << BIO_RW_SYNCIO))
+    if (rw & REQ_SYNC)
         btrfs_set_work_high_prio(&async->work);
 
     btrfs_queue_worker(&fs_info->workers, &async->work);
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
 
 static int __btree_submit_bio_start(struct inode *inode, int rw,
                                     struct bio *bio, int mirror_num,
-                                    unsigned long bio_flags)
+                                    unsigned long bio_flags,
+                                    u64 bio_offset)
 {
     /*
      * when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
 }
 
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-                                   int mirror_num, unsigned long bio_flags)
+                                   int mirror_num, unsigned long bio_flags,
+                                   u64 bio_offset)
 {
     /*
      * when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                                 int mirror_num, unsigned long bio_flags)
+                                 int mirror_num, unsigned long bio_flags,
+                                 u64 bio_offset)
 {
     int ret;
 
@@ -656,7 +668,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                               bio, 1);
     BUG_ON(ret);
 
-    if (!(rw & (1 << BIO_RW))) {
+    if (!(rw & REQ_WRITE)) {
         /*
          * called for a read, do the setup so that checksum validation
          * can happen in the async kernel threads
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
      */
     return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                inode, rw, bio, mirror_num, 0,
+                               bio_offset,
                                __btree_submit_bio_start,
                                __btree_submit_bio_done);
 }
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     root->ref_cows = 0;
     root->track_dirty = 0;
     root->in_radix = 0;
-    root->clean_orphans = 0;
+    root->orphan_item_inserted = 0;
+    root->orphan_cleanup_state = 0;
 
     root->fs_info = fs_info;
     root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     root->name = NULL;
     root->in_sysfs = 0;
     root->inode_tree = RB_ROOT;
+    root->block_rsv = NULL;
+    root->orphan_block_rsv = NULL;
 
     INIT_LIST_HEAD(&root->dirty_list);
     INIT_LIST_HEAD(&root->orphan_list);
     INIT_LIST_HEAD(&root->root_list);
     spin_lock_init(&root->node_lock);
-    spin_lock_init(&root->list_lock);
+    spin_lock_init(&root->orphan_lock);
     spin_lock_init(&root->inode_lock);
+    spin_lock_init(&root->accounting_lock);
     mutex_init(&root->objectid_mutex);
     mutex_init(&root->log_mutex);
     init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
     return 0;
 }
 
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info)
-{
-    struct extent_buffer *eb;
-    struct btrfs_root *log_root_tree = fs_info->log_root_tree;
-    u64 start = 0;
-    u64 end = 0;
-    int ret;
-
-    if (!log_root_tree)
-        return 0;
-
-    while (1) {
-        ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
-        if (ret)
-            break;
-
-        clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-                          EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
-    }
-    eb = fs_info->log_root_tree->node;
-
-    WARN_ON(btrfs_header_level(eb) != 0);
-    WARN_ON(btrfs_header_nritems(eb) != 0);
-
-    ret = btrfs_free_reserved_extent(fs_info->tree_root,
-                                     eb->start, eb->len);
-    BUG_ON(ret);
-
-    free_extent_buffer(eb);
-    kfree(fs_info->log_root_tree);
-    fs_info->log_root_tree = NULL;
-    return 0;
-}
-
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                          struct btrfs_fs_info *fs_info)
 {
@@ -1191,19 +1172,23 @@ again:
     if (root)
         return root;
 
-    ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-    if (ret == 0)
-        ret = -ENOENT;
-    if (ret < 0)
-        return ERR_PTR(ret);
-
     root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
     if (IS_ERR(root))
         return root;
 
-    WARN_ON(btrfs_root_refs(&root->root_item) == 0);
     set_anon_super(&root->anon_super, NULL);
 
+    if (btrfs_root_refs(&root->root_item) == 0) {
+        ret = -ENOENT;
+        goto fail;
+    }
+
+    ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+    if (ret < 0)
+        goto fail;
+    if (ret == 0)
+        root->orphan_item_inserted = 1;
+
     ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
     if (ret)
         goto fail;
@@ -1212,10 +1197,9 @@ again:
     ret = radix_tree_insert(&fs_info->fs_roots_radix,
                             (unsigned long)root->root_key.objectid,
                             root);
-    if (ret == 0) {
+    if (ret == 0)
         root->in_radix = 1;
-        root->clean_orphans = 1;
-    }
+
     spin_unlock(&fs_info->fs_roots_radix_lock);
     radix_tree_preload_end();
     if (ret) {
@@ -1443,7 +1427,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
      * ram and up to date before trying to verify things. For
      * blocksize <= pagesize, it is basically a noop
      */
-    if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
+    if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
         !bio_ready_for_csum(bio)) {
         btrfs_queue_worker(&fs_info->endio_meta_workers,
                            &end_io_wq->work);
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
     struct btrfs_root *root = arg;
 
     do {
-        smp_mb();
-        if (root->fs_info->closing)
-            break;
-
         vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 
         if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
         if (freezing(current)) {
             refrigerator();
         } else {
-            smp_mb();
-            if (root->fs_info->closing)
-                break;
             set_current_state(TASK_INTERRUPTIBLE);
-            schedule();
+            if (!kthread_should_stop())
+                schedule();
             __set_current_state(TASK_RUNNING);
         }
     } while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
     struct btrfs_root *root = arg;
     struct btrfs_trans_handle *trans;
     struct btrfs_transaction *cur;
+    u64 transid;
     unsigned long now;
     unsigned long delay;
     int ret;
 
     do {
-        smp_mb();
-        if (root->fs_info->closing)
-            break;
-
         delay = HZ * 30;
         vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
         mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-        mutex_lock(&root->fs_info->trans_mutex);
+        spin_lock(&root->fs_info->new_trans_lock);
         cur = root->fs_info->running_transaction;
         if (!cur) {
-            mutex_unlock(&root->fs_info->trans_mutex);
+            spin_unlock(&root->fs_info->new_trans_lock);
             goto sleep;
         }
 
         now = get_seconds();
-        if (now < cur->start_time || now - cur->start_time < 30) {
-            mutex_unlock(&root->fs_info->trans_mutex);
+        if (!cur->blocked &&
+            (now < cur->start_time || now - cur->start_time < 30)) {
+            spin_unlock(&root->fs_info->new_trans_lock);
             delay = HZ * 5;
             goto sleep;
         }
-        mutex_unlock(&root->fs_info->trans_mutex);
-        trans = btrfs_start_transaction(root, 1);
-        ret = btrfs_commit_transaction(trans, root);
+        transid = cur->transid;
+        spin_unlock(&root->fs_info->new_trans_lock);
 
+        trans = btrfs_join_transaction(root, 1);
+        if (transid == trans->transid) {
+            ret = btrfs_commit_transaction(trans, root);
+            BUG_ON(ret);
+        } else {
+            btrfs_end_transaction(trans, root);
+        }
 sleep:
         wake_up_process(root->fs_info->cleaner_kthread);
         mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
         if (freezing(current)) {
             refrigerator();
         } else {
-            if (root->fs_info->closing)
-                break;
             set_current_state(TASK_INTERRUPTIBLE);
-            schedule_timeout(delay);
+            if (!kthread_should_stop() &&
+                !btrfs_transaction_blocked(root->fs_info))
+                schedule_timeout(delay);
             __set_current_state(TASK_RUNNING);
         }
     } while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
     INIT_LIST_HEAD(&fs_info->space_info);
     btrfs_mapping_init(&fs_info->mapping_tree);
+    btrfs_init_block_rsv(&fs_info->global_block_rsv);
+    btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+    btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+    btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+    btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+    INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+    mutex_init(&fs_info->durable_block_rsv_mutex);
     atomic_set(&fs_info->nr_async_submits, 0);
     atomic_set(&fs_info->async_delalloc_pages, 0);
     atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                        min_t(u64, fs_devices->num_devices,
                        fs_info->thread_pool_size),
                        &fs_info->generic_worker);
-    btrfs_init_workers(&fs_info->enospc_workers, "enospc",
-                       fs_info->thread_pool_size,
-                       &fs_info->generic_worker);
 
     /* a higher idle thresh on the submit workers makes it much more
      * likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     btrfs_start_workers(&fs_info->endio_meta_workers, 1);
     btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
     btrfs_start_workers(&fs_info->endio_write_workers, 1);
-    btrfs_start_workers(&fs_info->enospc_workers, 1);
 
     fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
     fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
     csum_root->track_dirty = 1;
 
+    fs_info->generation = generation;
+    fs_info->last_trans_committed = generation;
+    fs_info->data_alloc_profile = (u64)-1;
+    fs_info->metadata_alloc_profile = (u64)-1;
+    fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
     ret = btrfs_read_block_groups(extent_root);
     if (ret) {
         printk(KERN_ERR "Failed to read block groups: %d\n", ret);
         goto fail_block_groups;
     }
 
-    fs_info->generation = generation;
-    fs_info->last_trans_committed = generation;
-    fs_info->data_alloc_profile = (u64)-1;
-    fs_info->metadata_alloc_profile = (u64)-1;
-    fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
     fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                            "btrfs-cleaner");
     if (IS_ERR(fs_info->cleaner_kthread))
@@ -1955,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                 btrfs_level_size(tree_root,
                                  btrfs_super_log_root_level(disk_super));
 
-        log_tree_root = kzalloc(sizeof(struct btrfs_root),
-                                GFP_NOFS);
+        log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+        if (!log_tree_root) {
+            err = -ENOMEM;
+            goto fail_trans_kthread;
+        }
 
         __setup_root(nodesize, leafsize, sectorsize, stripesize,
                      log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1977,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     BUG_ON(ret);
 
     if (!(sb->s_flags & MS_RDONLY)) {
+        ret = btrfs_cleanup_fs_roots(fs_info);
+        BUG_ON(ret);
+
         ret = btrfs_recover_relocation(tree_root);
         if (ret < 0) {
             printk(KERN_WARNING
@@ -1993,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
     if (!fs_info->fs_root)
         goto fail_trans_kthread;
+    if (IS_ERR(fs_info->fs_root)) {
+        err = PTR_ERR(fs_info->fs_root);
+        goto fail_trans_kthread;
+    }
 
     if (!(sb->s_flags & MS_RDONLY)) {
         down_read(&fs_info->cleanup_work_sem);
@@ -2040,7 +2036,6 @@ fail_sb_buffer:
     btrfs_stop_workers(&fs_info->endio_meta_write_workers);
     btrfs_stop_workers(&fs_info->endio_write_workers);
     btrfs_stop_workers(&fs_info->submit_workers);
-    btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
     invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
     iput(fs_info->btree_inode);
@@ -2405,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
     down_write(&root->fs_info->cleanup_work_sem);
     up_write(&root->fs_info->cleanup_work_sem);
 
-    trans = btrfs_start_transaction(root, 1);
+    trans = btrfs_join_transaction(root, 1);
     ret = btrfs_commit_transaction(trans, root);
     BUG_ON(ret);
     /* run commit again to drop the original snapshot */
-    trans = btrfs_start_transaction(root, 1);
+    trans = btrfs_join_transaction(root, 1);
     btrfs_commit_transaction(trans, root);
     ret = btrfs_write_and_wait_transaction(NULL, root);
     BUG_ON(ret);
@@ -2426,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
     fs_info->closing = 1;
     smp_mb();
 
-    kthread_stop(root->fs_info->transaction_kthread);
-    kthread_stop(root->fs_info->cleaner_kthread);
-
     if (!(fs_info->sb->s_flags & MS_RDONLY)) {
         ret = btrfs_commit_super(root);
         if (ret)
             printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
     }
 
+    kthread_stop(root->fs_info->transaction_kthread);
+    kthread_stop(root->fs_info->cleaner_kthread);
+
     fs_info->closing = 2;
     smp_mb();
 
@@ -2473,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
     btrfs_stop_workers(&fs_info->endio_meta_write_workers);
     btrfs_stop_workers(&fs_info->endio_write_workers);
     btrfs_stop_workers(&fs_info->submit_workers);
-    btrfs_stop_workers(&fs_info->enospc_workers);
 
     btrfs_close_devices(fs_info->fs_devices);
     btrfs_mapping_tree_free(&fs_info->mapping_tree);
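Both kthread loops above now re-check kthread_should_stop() after setting TASK_INTERRUPTIBLE and before sleeping, instead of polling fs_info->closing. The ordering matters: kthread_stop() sets the stop flag and then wakes the thread, so checking the flag only after publishing the sleeping state closes the window in which a stop request could be missed and the thread would sleep indefinitely. A condensed sketch of the idiom:

    #include <linux/kthread.h>
    #include <linux/sched.h>

    static int demo_kthread(void *arg)
    {
        do {
            /* ... periodic work ... */
            set_current_state(TASK_INTERRUPTIBLE);
            if (!kthread_should_stop())  /* re-check before sleeping */
                schedule();
            __set_current_state(TASK_RUNNING);
        } while (!kthread_should_stop());
        return 0;
    }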
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                         int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                         int rw, struct bio *bio, int mirror_num,
-                        unsigned long bio_flags,
+                        unsigned long bio_flags, u64 bio_offset,
                         extent_submit_bio_hook_t *submit_bio_start,
                         extent_submit_bio_hook_t *submit_bio_done);
 
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
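With bio_offset threaded through btrfs_wq_submit_bio(), every extent_submit_bio_hook_t implementation gains the extra u64 parameter (compare the __btree_submit_bio_* changes in disk-io.c above). A hook under the widened signature would look roughly like this; the body is illustrative only:

    static int demo_submit_bio_start(struct inode *inode, int rw,
                                     struct bio *bio, int mirror_num,
                                     unsigned long bio_flags, u64 bio_offset)
    {
        /* bio_offset supplies the file position when the bio's pages
         * cannot, per the comment added to struct async_submit_bio */
        return 0;
    }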
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
 
 static int update_block_group(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
-                              u64 bytenr, u64 num_bytes, int alloc,
-                              int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-                                   u64 num_bytes, int reserve);
+                              u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                 u64 num_bytes, int reserve, int sinfo);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                           struct btrfs_root *extent_root, u64 alloc_bytes,
                           u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root,
-                          struct btrfs_path *path,
-                          u64 bytenr, u64 num_bytes,
-                          int is_data, int reserved,
-                          struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
                          struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
-    if (atomic_dec_and_test(&cache->count))
+    if (atomic_dec_and_test(&cache->count)) {
+        WARN_ON(cache->pinned > 0);
+        WARN_ON(cache->reserved > 0);
+        WARN_ON(cache->reserved_pinned > 0);
         kfree(cache);
+    }
 }
 
 /*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
 
     exclude_super_stripes(extent_root, block_group);
     spin_lock(&block_group->space_info->lock);
-    block_group->space_info->bytes_super += block_group->bytes_super;
+    block_group->space_info->bytes_readonly += block_group->bytes_super;
     spin_unlock(&block_group->space_info->lock);
 
     last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
     struct list_head *head = &info->space_info;
     struct btrfs_space_info *found;
 
+    flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+             BTRFS_BLOCK_GROUP_METADATA;
+
     rcu_read_lock();
     list_for_each_entry_rcu(found, head, list) {
         if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 }
 
 /*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, u64 *refs, u64 *flags)
+{
+    struct btrfs_delayed_ref_head *head;
+    struct btrfs_delayed_ref_root *delayed_refs;
+    struct btrfs_path *path;
+    struct btrfs_extent_item *ei;
+    struct extent_buffer *leaf;
+    struct btrfs_key key;
+    u32 item_size;
+    u64 num_refs;
+    u64 extent_flags;
+    int ret;
+
+    path = btrfs_alloc_path();
+    if (!path)
+        return -ENOMEM;
+
+    key.objectid = bytenr;
+    key.type = BTRFS_EXTENT_ITEM_KEY;
+    key.offset = num_bytes;
+    if (!trans) {
+        path->skip_locking = 1;
+        path->search_commit_root = 1;
+    }
+again:
+    ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+                            &key, path, 0, 0);
+    if (ret < 0)
+        goto out_free;
+
+    if (ret == 0) {
+        leaf = path->nodes[0];
+        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        if (item_size >= sizeof(*ei)) {
+            ei = btrfs_item_ptr(leaf, path->slots[0],
+                                struct btrfs_extent_item);
+            num_refs = btrfs_extent_refs(leaf, ei);
+            extent_flags = btrfs_extent_flags(leaf, ei);
+        } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+            struct btrfs_extent_item_v0 *ei0;
+            BUG_ON(item_size != sizeof(*ei0));
+            ei0 = btrfs_item_ptr(leaf, path->slots[0],
+                                 struct btrfs_extent_item_v0);
+            num_refs = btrfs_extent_refs_v0(leaf, ei0);
+            /* FIXME: this isn't correct for data */
+            extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+            BUG();
+#endif
+        }
+        BUG_ON(num_refs == 0);
+    } else {
+        num_refs = 0;
+        extent_flags = 0;
+        ret = 0;
+    }
+
+    if (!trans)
+        goto out;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+    spin_lock(&delayed_refs->lock);
+    head = btrfs_find_delayed_ref_head(trans, bytenr);
+    if (head) {
+        if (!mutex_trylock(&head->mutex)) {
+            atomic_inc(&head->node.refs);
+            spin_unlock(&delayed_refs->lock);
+
+            btrfs_release_path(root->fs_info->extent_root, path);
+
+            mutex_lock(&head->mutex);
+            mutex_unlock(&head->mutex);
+            btrfs_put_delayed_ref(&head->node);
+            goto again;
+        }
+        if (head->extent_op && head->extent_op->update_flags)
+            extent_flags |= head->extent_op->flags_to_set;
+        else
+            BUG_ON(num_refs == 0);
+
+        num_refs += head->node.ref_mod;
+        mutex_unlock(&head->mutex);
+    }
+    spin_unlock(&delayed_refs->lock);
+out:
+    WARN_ON(num_refs == 0);
+    if (refs)
+        *refs = num_refs;
+    if (flags)
+        *flags = extent_flags;
+out_free:
+    btrfs_free_path(path);
+    return ret;
+}
+
+/*
  * Back reference rules. Back refs have three main goals:
  *
  * 1) differentiate between all holders of references to an extent so that
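A consequence of moving btrfs_lookup_extent_info() here and making the transaction handle optional: with a NULL handle the helper searches the commit root unlocked and skips the delayed-ref merge entirely, so read-only callers no longer need a transaction. A sketch of such a call (the demo wrapper is illustrative):

    static int demo_lookup(struct btrfs_root *root, u64 bytenr, u64 num_bytes)
    {
        u64 refs, flags;

        /* trans == NULL: unlocked lookup against the commit root */
        return btrfs_lookup_extent_info(NULL, root, bytenr, num_bytes,
                                        &refs, &flags);
    }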
@@ -1589,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
                                 u64 start, u64 len)
 {
     blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-                         DISCARD_FL_BARRIER);
+                         BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
     return ret;
 }
 
-
 /* helper function to actually process a single delayed ref entry */
 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
         BUG_ON(extent_op);
         head = btrfs_delayed_node_to_head(node);
         if (insert_reserved) {
-            int mark_free = 0;
-            struct extent_buffer *must_clean = NULL;
-
-            ret = pin_down_bytes(trans, root, NULL,
-                                 node->bytenr, node->num_bytes,
-                                 head->is_data, 1, &must_clean);
-            if (ret > 0)
-                mark_free = 1;
-
-            if (must_clean) {
-                clean_tree_block(NULL, root, must_clean);
-                btrfs_tree_unlock(must_clean);
-                free_extent_buffer(must_clean);
-            }
+            btrfs_pin_extent(root, node->bytenr,
+                             node->num_bytes, 1);
             if (head->is_data) {
                 ret = btrfs_del_csums(trans, root,
                                       node->bytenr,
                                       node->num_bytes);
                 BUG_ON(ret);
             }
-            if (mark_free) {
-                ret = btrfs_free_reserved_extent(root,
-                                                 node->bytenr,
-                                                 node->num_bytes);
-                BUG_ON(ret);
-            }
         }
         mutex_unlock(&head->mutex);
         return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
     ret = 0;
 out:
     btrfs_free_path(path);
+    if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+        WARN_ON(ret > 0);
     return ret;
 }
 
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                              struct btrfs_space_info **space_info)
 {
     struct btrfs_space_info *found;
+    int i;
+    int factor;
+
+    if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                 BTRFS_BLOCK_GROUP_RAID10))
+        factor = 2;
+    else
+        factor = 1;
 
     found = __find_space_info(info, flags);
     if (found) {
         spin_lock(&found->lock);
         found->total_bytes += total_bytes;
         found->bytes_used += bytes_used;
+        found->disk_used += bytes_used * factor;
         found->full = 0;
         spin_unlock(&found->lock);
         *space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
     if (!found)
         return -ENOMEM;
 
-    INIT_LIST_HEAD(&found->block_groups);
+    for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+        INIT_LIST_HEAD(&found->block_groups[i]);
     init_rwsem(&found->groups_sem);
-    init_waitqueue_head(&found->flush_wait);
-    init_waitqueue_head(&found->allocate_wait);
     spin_lock_init(&found->lock);
-    found->flags = flags;
+    found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+                            BTRFS_BLOCK_GROUP_SYSTEM |
+                            BTRFS_BLOCK_GROUP_METADATA);
     found->total_bytes = total_bytes;
     found->bytes_used = bytes_used;
+    found->disk_used = bytes_used * factor;
     found->bytes_pinned = 0;
     found->bytes_reserved = 0;
     found->bytes_readonly = 0;
-    found->bytes_delalloc = 0;
+    found->bytes_may_use = 0;
     found->full = 0;
     found->force_alloc = 0;
     *space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
     }
 }
 
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
-    spin_lock(&cache->space_info->lock);
-    spin_lock(&cache->lock);
-    if (!cache->ro) {
-        cache->space_info->bytes_readonly += cache->key.offset -
-                                btrfs_block_group_used(&cache->item);
-        cache->ro = 1;
-    }
-    spin_unlock(&cache->lock);
-    spin_unlock(&cache->space_info->lock);
-}
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
     u64 num_devices = root->fs_info->fs_devices->rw_devices;
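The new disk_used counter scales logical usage by a redundancy factor, since DUP/RAID1/RAID10 block groups consume two raw bytes per logical byte. The factor selection mirrors the hunk above; as a standalone sketch:

    static int demo_raid_factor(u64 flags)
    {
        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
                     BTRFS_BLOCK_GROUP_RAID10))
            return 2;  /* two copies on disk per logical byte */
        return 1;
    }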
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
     return flags;
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
-{
-    struct btrfs_fs_info *info = root->fs_info;
-    u64 alloc_profile;
-
-    if (data) {
-        alloc_profile = info->avail_data_alloc_bits &
-                        info->data_alloc_profile;
-        data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-    } else if (root == root->fs_info->chunk_root) {
-        alloc_profile = info->avail_system_alloc_bits &
-                        info->system_alloc_profile;
-        data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-    } else {
-        alloc_profile = info->avail_metadata_alloc_bits &
-                        info->metadata_alloc_profile;
-        data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-    }
-
-    return btrfs_reduce_alloc_profile(root, data);
-}
-
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
-    u64 alloc_target;
-
-    alloc_target = btrfs_get_alloc_profile(root, 1);
-    BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-                                                   alloc_target);
-}
-
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
-    u64 num_bytes;
-    int level;
-
-    level = BTRFS_MAX_LEVEL - 2;
-    /*
-     * NOTE: these calculations are absolutely the worst possible case.
-     * This assumes that _every_ item we insert will require a new leaf, and
-     * that the tree has grown to its maximum level size.
-     */
-
-    /*
-     * for every item we insert we could insert both an extent item and a
-     * extent ref item. Then for ever item we insert, we will need to cow
-     * both the original leaf, plus the leaf to the left and right of it.
-     *
-     * Unless we are talking about the extent root, then we just want the
-     * number of items * 2, since we just need the extent item plus its ref.
-     */
-    if (root == root->fs_info->extent_root)
-        num_bytes = num_items * 2;
-    else
-        num_bytes = (num_items + (2 * num_items)) * 3;
-
-    /*
-     * num_bytes is total number of leaves we could need times the leaf
-     * size, and then for every leaf we could end up cow'ing 2 nodes per
-     * level, down to the leaf level.
-     */
-    num_bytes = (num_bytes * root->leafsize) +
-        (num_bytes * (level * 2)) * root->nodesize;
-
-    return num_bytes;
-}
-
-/*
- * Unreserve metadata space for delalloc. If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-                                          struct inode *inode, int num_items)
-{
-    struct btrfs_fs_info *info = root->fs_info;
-    struct btrfs_space_info *meta_sinfo;
-    u64 num_bytes;
-    u64 alloc_target;
-    bool bug = false;
-
-    /* get the space info for where the metadata will live */
-    alloc_target = btrfs_get_alloc_profile(root, 0);
-    meta_sinfo = __find_space_info(info, alloc_target);
-
-    num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-                                       num_items);
-
-    spin_lock(&meta_sinfo->lock);
-    spin_lock(&BTRFS_I(inode)->accounting_lock);
-    if (BTRFS_I(inode)->reserved_extents <=
-        BTRFS_I(inode)->outstanding_extents) {
-        spin_unlock(&BTRFS_I(inode)->accounting_lock);
-        spin_unlock(&meta_sinfo->lock);
-        return 0;
-    }
-    spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
-    BTRFS_I(inode)->reserved_extents -= num_items;
-    BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
-    if (meta_sinfo->bytes_delalloc < num_bytes) {
-        bug = true;
-        meta_sinfo->bytes_delalloc = 0;
-    } else {
-        meta_sinfo->bytes_delalloc -= num_bytes;
-    }
-    spin_unlock(&meta_sinfo->lock);
-
-    BUG_ON(bug);
-
-    return 0;
-}
-
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-    u64 thresh;
-
-    thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-        meta_sinfo->bytes_may_use;
-
-    thresh = meta_sinfo->total_bytes - thresh;
-    thresh *= 80;
-    do_div(thresh, 100);
-    if (thresh <= meta_sinfo->bytes_delalloc)
-        meta_sinfo->force_delalloc = 1;
-    else
-        meta_sinfo->force_delalloc = 0;
+    if (flags & BTRFS_BLOCK_GROUP_DATA)
+        flags |= root->fs_info->avail_data_alloc_bits &
+                 root->fs_info->data_alloc_profile;
+    else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+        flags |= root->fs_info->avail_system_alloc_bits &
+                 root->fs_info->system_alloc_profile;
+    else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+        flags |= root->fs_info->avail_metadata_alloc_bits &
+                 root->fs_info->metadata_alloc_profile;
+    return btrfs_reduce_alloc_profile(root, flags);
 }
 
-struct async_flush {
-    struct btrfs_root *root;
-    struct btrfs_space_info *info;
-    struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
-    struct async_flush *async;
-    struct btrfs_root *root;
-    struct btrfs_space_info *info;
-
-    async = container_of(work, struct async_flush, work);
-    root = async->root;
-    info = async->info;
-
-    btrfs_start_delalloc_inodes(root, 0);
-    wake_up(&info->flush_wait);
-    btrfs_wait_ordered_extents(root, 0, 0);
-
-    spin_lock(&info->lock);
-    info->flushing = 0;
-    spin_unlock(&info->lock);
-    wake_up(&info->flush_wait);
-
-    kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
-    DEFINE_WAIT(wait);
-    u64 used;
-
-    while (1) {
-        prepare_to_wait(&info->flush_wait, &wait,
-                        TASK_UNINTERRUPTIBLE);
-        spin_lock(&info->lock);
-        if (!info->flushing) {
-            spin_unlock(&info->lock);
-            break;
-        }
-
-        used = info->bytes_used + info->bytes_reserved +
-            info->bytes_pinned + info->bytes_readonly +
-            info->bytes_super + info->bytes_root +
-            info->bytes_may_use + info->bytes_delalloc;
-        if (used < info->total_bytes) {
-            spin_unlock(&info->lock);
-            break;
-        }
-        spin_unlock(&info->lock);
-        schedule();
-    }
-    finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
-                           struct btrfs_space_info *info)
-{
-    struct async_flush *async;
-    bool wait = false;
-
-    spin_lock(&info->lock);
+    u64 flags;
 
-    if (!info->flushing)
-        info->flushing = 1;
+    if (data)
+        flags = BTRFS_BLOCK_GROUP_DATA;
+    else if (root == root->fs_info->chunk_root)
+        flags = BTRFS_BLOCK_GROUP_SYSTEM;
     else
-        wait = true;
-
-    spin_unlock(&info->lock);
-
-    if (wait) {
-        wait_on_flush(info);
-        return;
-    }
-
-    async = kzalloc(sizeof(*async), GFP_NOFS);
-    if (!async)
-        goto flush;
-
-    async->root = root;
-    async->info = info;
-    async->work.func = flush_delalloc_async;
-
-    btrfs_queue_worker(&root->fs_info->enospc_workers,
-                       &async->work);
-    wait_on_flush(info);
-    return;
-
-flush:
-    btrfs_start_delalloc_inodes(root, 0);
-    btrfs_wait_ordered_extents(root, 0, 0);
-
-    spin_lock(&info->lock);
-    info->flushing = 0;
-    spin_unlock(&info->lock);
-    wake_up(&info->flush_wait);
-}
-
-static int maybe_allocate_chunk(struct btrfs_root *root,
-                                struct btrfs_space_info *info)
-{
-    struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
-    struct btrfs_trans_handle *trans;
-    bool wait = false;
-    int ret = 0;
-    u64 min_metadata;
-    u64 free_space;
-
-    free_space = btrfs_super_total_bytes(disk_super);
-    /*
-     * we allow the metadata to grow to a max of either 10gb or 5% of the
-     * space in the volume.
-     */
-    min_metadata = min((u64)10 * 1024 * 1024 * 1024,
-                       div64_u64(free_space * 5, 100));
-    if (info->total_bytes >= min_metadata) {
-        spin_unlock(&info->lock);
-        return 0;
-    }
-
-    if (info->full) {
-        spin_unlock(&info->lock);
-        return 0;
-    }
-
-    if (!info->allocating_chunk) {
-        info->force_alloc = 1;
-        info->allocating_chunk = 1;
-    } else {
-        wait = true;
-    }
-
-    spin_unlock(&info->lock);
-
-    if (wait) {
-        wait_event(info->allocate_wait,
-                   !info->allocating_chunk);
-        return 1;
-    }
-
-    trans = btrfs_start_transaction(root, 1);
-    if (!trans) {
-        ret = -ENOMEM;
-        goto out;
-    }
-
-    ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                         4096 + 2 * 1024 * 1024,
-                         info->flags, 0);
-    btrfs_end_transaction(trans, root);
-    if (ret)
-        goto out;
-out:
-    spin_lock(&info->lock);
-    info->allocating_chunk = 0;
-    spin_unlock(&info->lock);
-    wake_up(&info->allocate_wait);
-
-    if (ret)
-        return 0;
-    return 1;
-}
-
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-                                        struct inode *inode, int num_items)
-{
-    struct btrfs_fs_info *info = root->fs_info;
-    struct btrfs_space_info *meta_sinfo;
-    u64 num_bytes;
-    u64 used;
-    u64 alloc_target;
-    int flushed = 0;
-    int force_delalloc;
-
-    /* get the space info for where the metadata will live */
-    alloc_target = btrfs_get_alloc_profile(root, 0);
-    meta_sinfo = __find_space_info(info, alloc_target);
-
-    num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-                                       num_items);
-again:
-    spin_lock(&meta_sinfo->lock);
-
-    force_delalloc = meta_sinfo->force_delalloc;
-
-    if (unlikely(!meta_sinfo->bytes_root))
-        meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-    if (!flushed)
-        meta_sinfo->bytes_delalloc += num_bytes;
-
-    used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-        meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-    if (used > meta_sinfo->total_bytes) {
-        flushed++;
-
-        if (flushed == 1) {
-            if (maybe_allocate_chunk(root, meta_sinfo))
-                goto again;
-            flushed++;
-        } else {
-            spin_unlock(&meta_sinfo->lock);
-        }
-
-        if (flushed == 2) {
-            filemap_flush(inode->i_mapping);
-            goto again;
-        } else if (flushed == 3) {
-            flush_delalloc(root, meta_sinfo);
-            goto again;
-        }
-        spin_lock(&meta_sinfo->lock);
-        meta_sinfo->bytes_delalloc -= num_bytes;
-        spin_unlock(&meta_sinfo->lock);
-        printk(KERN_ERR "enospc, has %d, reserved %d\n",
-               BTRFS_I(inode)->outstanding_extents,
-               BTRFS_I(inode)->reserved_extents);
-        dump_space_info(meta_sinfo, 0, 0);
-        return -ENOSPC;
-    }
+        flags = BTRFS_BLOCK_GROUP_METADATA;
 
-    BTRFS_I(inode)->reserved_extents += num_items;
-    check_force_delalloc(meta_sinfo);
-    spin_unlock(&meta_sinfo->lock);
-
-    if (!flushed && force_delalloc)
-        filemap_flush(inode->i_mapping);
-
-    return 0;
+    return get_alloc_profile(root, flags);
 }
 
-/*
- * unreserve num_items number of items worth of metadata space. This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
-    struct btrfs_fs_info *info = root->fs_info;
-    struct btrfs_space_info *meta_sinfo;
-    u64 num_bytes;
-    u64 alloc_target;
-    bool bug = false;
-
-    /* get the space info for where the metadata will live */
-    alloc_target = btrfs_get_alloc_profile(root, 0);
-    meta_sinfo = __find_space_info(info, alloc_target);
-
-    num_bytes = calculate_bytes_needed(root, num_items);
-
-    spin_lock(&meta_sinfo->lock);
-    if (meta_sinfo->bytes_may_use < num_bytes) {
-        bug = true;
-        meta_sinfo->bytes_may_use = 0;
-    } else {
-        meta_sinfo->bytes_may_use -= num_bytes;
-    }
-    spin_unlock(&meta_sinfo->lock);
-
-    BUG_ON(bug);
-
-    return 0;
-}
-
-/*
- * Reserve some metadata space for use. We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items. If we
- * have space, fantastic, if not, you get -ENOSPC. Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space. THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
 {
-    struct btrfs_fs_info *info = root->fs_info;
-    struct btrfs_space_info *meta_sinfo;
-    u64 num_bytes;
-    u64 used;
-    u64 alloc_target;
-    int retries = 0;
-
-    /* get the space info for where the metadata will live */
-    alloc_target = btrfs_get_alloc_profile(root, 0);
-    meta_sinfo = __find_space_info(info, alloc_target);
-
-    num_bytes = calculate_bytes_needed(root, num_items);
-again:
-    spin_lock(&meta_sinfo->lock);
-
-    if (unlikely(!meta_sinfo->bytes_root))
-        meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-    if (!retries)
-        meta_sinfo->bytes_may_use += num_bytes;
-
-    used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-        meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-    if (used > meta_sinfo->total_bytes) {
-        retries++;
-        if (retries == 1) {
-            if (maybe_allocate_chunk(root, meta_sinfo))
-                goto again;
-            retries++;
-        } else {
-            spin_unlock(&meta_sinfo->lock);
-        }
-
-        if (retries == 2) {
-            flush_delalloc(root, meta_sinfo);
-            goto again;
-        }
-        spin_lock(&meta_sinfo->lock);
-        meta_sinfo->bytes_may_use -= num_bytes;
-        spin_unlock(&meta_sinfo->lock);
-
-        dump_space_info(meta_sinfo, 0, 0);
-        return -ENOSPC;
-    }
-
-    check_force_delalloc(meta_sinfo);
-    spin_unlock(&meta_sinfo->lock);
-
-    return 0;
+    BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+                                                   BTRFS_BLOCK_GROUP_DATA);
 }
 
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
  */
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-                                u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
     struct btrfs_space_info *data_sinfo;
+    struct btrfs_root *root = BTRFS_I(inode)->root;
     u64 used;
-    int ret = 0, committed = 0, flushed = 0;
+    int ret = 0, committed = 0;
 
     /* make sure bytes are sectorsize aligned */
     bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3248again: 2895again:
3249 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3250 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3251 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3252 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3253 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + 2900 data_sinfo->bytes_may_use;
3254 data_sinfo->bytes_super;
3255 2901
3256 if (used + bytes > data_sinfo->total_bytes) { 2902 if (used + bytes > data_sinfo->total_bytes) {
3257 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3258 2904
3259 if (!flushed) {
3260 spin_unlock(&data_sinfo->lock);
3261 flush_delalloc(root, data_sinfo);
3262 flushed = 1;
3263 goto again;
3264 }
3265
3266 /* 2905 /*
3267 * if we don't have enough free bytes in this space then we need 2906 * if we don't have enough free bytes in this space then we need
3268 * to alloc a new chunk. 2907 * to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
3274 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3275alloc: 2914alloc:
3276 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3277 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3278 if (!trans) 2917 if (IS_ERR(trans))
3279 return -ENOMEM; 2918 return PTR_ERR(trans);
3280 2919
3281 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3282 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3283 alloc_target, 0); 2922 alloc_target, 0);
3284 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3285 if (ret) 2924 if (ret < 0)
3286 return ret; 2925 return ret;
3287 2926
3288 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
3297 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3298 committed = 1; 2937 committed = 1;
3299 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3300 if (!trans) 2939 if (IS_ERR(trans))
3301 return -ENOMEM; 2940 return PTR_ERR(trans);
3302 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3303 if (ret) 2942 if (ret)
3304 return ret; 2943 return ret;
3305 goto again; 2944 goto again;
3306 } 2945 }
3307 2946
3308 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3309 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3310 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3311 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3312 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3313 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3314 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3315 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3316 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3317 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3318 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3319 return -ENOSPC; 2959 return -ENOSPC;
3320 } 2960 }
3321 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
3326} 2966}
3327 2967
3328/* 2968/*
3329 * if there was an error for whatever reason after calling 2969 * called when we are clearing a delalloc extent from the
3330 * btrfs_check_data_free_space, call this so we can cleanup the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3331 */ 2972 */
3332void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3333 struct inode *inode, u64 bytes)
3334{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3335 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3336 2977
3337 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3344 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3345} 2986}
3346 2987
3347/* called when we are adding a delalloc extent to the inode's io_tree */
3348void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3349 u64 bytes)
3350{
3351 struct btrfs_space_info *data_sinfo;
3352
3353 /* get the space info for where this inode will be storing its data */
3354 data_sinfo = BTRFS_I(inode)->space_info;
3355
3356 /* make sure we have enough space to handle the data first */
3357 spin_lock(&data_sinfo->lock);
3358 data_sinfo->bytes_delalloc += bytes;
3359
3360 /*
3361 * we are adding a delalloc extent without calling
3362 * btrfs_check_data_free_space first. This happens on a weird
3363 * writepage condition, but shouldn't hurt our accounting
3364 */
3365 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3366 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3367 BTRFS_I(inode)->reserved_bytes = 0;
3368 } else {
3369 data_sinfo->bytes_may_use -= bytes;
3370 BTRFS_I(inode)->reserved_bytes -= bytes;
3371 }
3372
3373 spin_unlock(&data_sinfo->lock);
3374}
3375
3376/* called when we are clearing a delalloc extent from the inode's io_tree */
3377void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3378 u64 bytes)
3379{
3380 struct btrfs_space_info *info;
3381
3382 info = BTRFS_I(inode)->space_info;
3383
3384 spin_lock(&info->lock);
3385 info->bytes_delalloc -= bytes;
3386 spin_unlock(&info->lock);
3387}
3388
3389static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3390{ 2989{
3391 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3399 rcu_read_unlock(); 2998 rcu_read_unlock();
3400} 2999}
3401 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
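
In other words, a new chunk is considered only once the writable space is both within 256 MiB of full and at least roughly 80% committed (div_factor(num_bytes, 8) is assumed to compute num_bytes * 8 / 10, as elsewhere in btrfs). A small user-space rendition of the test, where writable stands for total_bytes - bytes_readonly and committed for bytes_used + bytes_reserved:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t div_factor(uint64_t num, int factor)
	{
		return num * factor / 10;	/* assumed kernel semantics */
	}

	static int should_alloc_chunk(uint64_t writable, uint64_t committed,
				      uint64_t alloc_bytes)
	{
		if (committed + alloc_bytes + 256ULL * 1024 * 1024 < writable)
			return 0;	/* more than 256 MiB of headroom left */
		if (committed + alloc_bytes < div_factor(writable, 8))
			return 0;	/* still below ~80% of writable space */
		return 1;
	}

	int main(void)
	{
		uint64_t gib = 1024ULL * 1024 * 1024;

		/* 10 GiB writable, 7 GiB committed: plenty of headroom */
		printf("%d\n", should_alloc_chunk(10 * gib, 7 * gib, gib / 4));       /* 0 */
		/* 10 GiB writable, 9.8 GiB committed: both tests trip */
		printf("%d\n", should_alloc_chunk(10 * gib, 98 * gib / 10, gib / 4)); /* 1 */
		return 0;
	}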
3016
3402static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3403 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3404 u64 flags, int force) 3019 u64 flags, int force)
3405{ 3020{
3406 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3407 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3408 u64 thresh;
3409 int ret = 0; 3023 int ret = 0;
3410 3024
3411 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3428 goto out; 3042 goto out;
3429 } 3043 }
3430 3044
3431 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3432 thresh = div_factor(thresh, 8);
3433 if (!force &&
3434 (space_info->bytes_used + space_info->bytes_pinned +
3435 space_info->bytes_reserved + alloc_bytes) < thresh) {
3436 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3437 goto out; 3047 goto out;
3438 } 3048 }
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3454 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3455 if (ret) 3065 if (ret)
3456 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3457 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3458 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3459out: 3071out:
@@ -3461,13 +3073,713 @@ out:
3461 return ret; 3073 return ret;
3462} 3074}
3463 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
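
When btrfs_start_one_delalloc_inode() finds nothing to flush, the loop above sleeps with exponential backoff: the pause doubles from one jiffy and is capped at HZ/10 (100 ms worth of jiffies). A standalone trace of that schedule, with HZ assumed to be 250 (CONFIG_HZ is build-dependent):

	#include <stdio.h>

	int main(void)
	{
		int hz = 250;	/* assumption; pick your kernel's CONFIG_HZ */
		int pause = 1;

		for (int i = 0; i < 8; i++) {
			printf("%d ", pause);	/* sleeps, in jiffies: 1 2 4 8 16 25 25 25 */
			pause <<= 1;
			if (pause > hz / 10)
				pause = hz / 10;
		}
		printf("\n");
		return 0;
	}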
3157
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
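
The priority check above is a proportionality test: a low-priority rsv is granted bytes only while (unused + reserved) * priority >= (num_bytes + reserved) * 10, i.e. after the grant the rsv would still hold at most priority/10 of the space it competes for. Worked through: with the default priority of 6 (set in btrfs_init_block_rsv below), reserved = 0 and unused = 100 MiB, a request succeeds only while num_bytes <= 60 MiB, since 100 * 6 >= num_bytes * 10 gives num_bytes <= 60. An rsv with priority >= 10 bypasses the test entirely.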
3233
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
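
The spill behaviour above (shrinking size releases any excess reserved bytes to a destination rsv, or back to the space_info) can be checked with a small user-space model of the same logic:

	#include <stdint.h>
	#include <stdio.h>

	struct rsv { uint64_t size, reserved; int full; };

	/* user-space model of block_rsv_release_bytes(): returns the spill */
	static uint64_t release_bytes(struct rsv *r, uint64_t num_bytes)
	{
		if (num_bytes == (uint64_t)-1)
			num_bytes = r->size;
		r->size -= num_bytes;
		if (r->reserved >= r->size) {
			uint64_t excess = r->reserved - r->size;

			r->reserved = r->size;
			r->full = 1;
			return excess;	/* handed to the dest rsv, else to space_info */
		}
		return 0;
	}

	int main(void)
	{
		struct rsv r = { .size = 100, .reserved = 80, .full = 0 };

		/* shrink by 40: reserved 80 vs new size 60 -> 20 bytes spill */
		printf("spill=%llu size=%llu reserved=%llu full=%d\n",
		       (unsigned long long)release_bytes(&r, 40),
		       (unsigned long long)r.size,
		       (unsigned long long)r.reserved, r.full);
		return 0;
	}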
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct able to capture freed space.
3358 * the captured space will be re-added to the block_rsv struct
3359 * after the transaction commits
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is the sum of the space used by the extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per-tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
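
Worked through with plausible numbers (data_used = 100 GiB, meta_used = 4 GiB, a 4 KiB block size and crc32c's 4-byte checksum): the checksum term is (100 GiB / 4 KiB) * 4 * 2 = 200 MiB, and the 2% term is 104 GiB / 50, about 2.08 GiB, for roughly 2.28 GiB in total. Since three times that exceeds meta_used, the result is capped to 4 GiB / 3, about 1.33 GiB, before being aligned up to leafsize << 10 (4 MiB granularity with 4 KiB leaves).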
3515
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
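
With 4 KiB leaves and nodes and BTRFS_MAX_LEVEL = 8, this budgets (4096 + 4096 * 7) * 3 = 98304 bytes, i.e. 96 KiB per item: one full root-to-leaf path (a leaf plus up to seven nodes), tripled, presumably to leave headroom for node splits along the way.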
3598
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting orphan item, one for updating inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit the
3646 * transaction and use the space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
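
That is, checksum metadata is budgeted at one eighth of the data being reserved: a 1 MiB delalloc reservation adds 128 KiB on top of the per-extent item cost. (The >>= in the return is a harmless quirk; it only shifts the function's local copy of the parameter.)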
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
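
As an example of the sizing: an inode with three outstanding extents and two already-reserved extents gets nr_extents = (3 + 1) - 2 = 2, so to_reserve = calc_trans_metadata_size(root, 2) + num_bytes / 8. Using the 96 KiB per-item figure above, a 1 MiB reservation asks the delalloc rsv for 192 KiB + 128 KiB = 320 KiB.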
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
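
A hypothetical buffered-write path would pair the combined helpers like this (a sketch only; prepare_and_dirty_page() is a placeholder, and locking and page handling are omitted):

	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret)
		return ret;	/* data or metadata side said -ENOSPC */

	ret = prepare_and_dirty_page(inode, pos);	/* placeholder step */
	if (ret) {
		/* undo both the data and the metadata halves in one call */
		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
		return ret;
	}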
3775
3464static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3465 struct btrfs_root *root, 3777 struct btrfs_root *root,
3466 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3467 int mark_free)
3468{ 3779{
3469 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3470 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3471 u64 total = num_bytes; 3783 u64 total = num_bytes;
3472 u64 old_val; 3784 u64 old_val;
3473 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3486 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3487 if (!cache) 3799 if (!cache)
3488 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3489 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3490 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3491 3809
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3498 old_val += num_bytes; 3816 old_val += num_bytes;
3499 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3500 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3501 cache->space_info->bytes_used += num_bytes;
3502 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3503 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3504 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3505 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3506 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3507 } else { 3824 } else {
3508 old_val -= num_bytes; 3825 old_val -= num_bytes;
3509 cache->space_info->bytes_used -= num_bytes;
3510 if (cache->ro)
3511 cache->space_info->bytes_readonly += num_bytes;
3512 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3513 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3514 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3515 if (mark_free) {
3516 int ret;
3517
3518 ret = btrfs_discard_extent(root, bytenr,
3519 num_bytes);
3520 WARN_ON(ret);
3521 3833
3522 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3523 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3524 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3525 }
3526 } 3837 }
3527 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3528 total -= num_bytes; 3839 total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3546 return bytenr; 3857 return bytenr;
3547} 3858}
3548 3859
3549/* 3860static int pin_down_extent(struct btrfs_root *root,
3550 * this function must be called within a transaction 3861 struct btrfs_block_group_cache *cache,
3551 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3552int btrfs_pin_extent(struct btrfs_root *root,
3553 u64 bytenr, u64 num_bytes, int reserved)
3554{ 3863{
3555 struct btrfs_fs_info *fs_info = root->fs_info;
3556 struct btrfs_block_group_cache *cache;
3557
3558 cache = btrfs_lookup_block_group(fs_info, bytenr);
3559 BUG_ON(!cache);
3560
3561 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3562 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3563 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3569 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3570 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3571 3874
3572 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3573 3879
3574 set_extent_dirty(fs_info->pinned_extents, 3880/*
3575 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3892
3893 btrfs_put_block_group(cache);
3576 return 0; 3894 return 0;
3577} 3895}
3578 3896
3579static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3580 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3581{ 3903{
3582 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3583 spin_lock(&cache->lock); 3905 if (sinfo) {
3584 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3585 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3586 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3587 } else { 3924 } else {
3588 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3589 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3590 } 3935 }
3591 spin_unlock(&cache->lock); 3936 return ret;
3592 spin_unlock(&cache->space_info->lock);
3593 return 0;
3594} 3937}
3595 3938
3596int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3621 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3622 3965
3623 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3624 return 0; 3969 return 0;
3625} 3970}
3626 3971
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3647 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3648 } 3993 }
3649 3994
3995 start += len;
3996
3650 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3651 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3652 cache->pinned -= len; 3999 cache->pinned -= len;
3653 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3654 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3655 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3656
3657 start += len;
3658 } 4010 }
3659 4011
3660 if (cache) 4012 if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3667{ 4019{
3668 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3669 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3670 u64 start; 4024 u64 start;
3671 u64 end; 4025 u64 end;
4026 int idx;
3672 int ret; 4027 int ret;
3673 4028
3674 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3689 cond_resched(); 4044 cond_resched();
3690 } 4045 }
3691 4046
3692 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3693} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3694 4049 &fs_info->durable_block_rsv_list, list) {
3695static int pin_down_bytes(struct btrfs_trans_handle *trans,
3696 struct btrfs_root *root,
3697 struct btrfs_path *path,
3698 u64 bytenr, u64 num_bytes,
3699 int is_data, int reserved,
3700 struct extent_buffer **must_clean)
3701{
3702 int err = 0;
3703 struct extent_buffer *buf;
3704 4050
3705 if (is_data) 4051 idx = trans->transid & 0x1;
3706 goto pinit; 4052 if (block_rsv->freed[idx] > 0) {
3707 4053 block_rsv_add_bytes(block_rsv,
3708 /* 4054 block_rsv->freed[idx], 0);
3709 * discard is sloooow, and so triggering discards on 4055 block_rsv->freed[idx] = 0;
3710 * individual btree blocks isn't a good plan. Just 4056 }
3711 * pin everything in discard mode. 4057 if (atomic_read(&block_rsv->usage) == 0) {
3712 */ 4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3713 if (btrfs_test_opt(root, DISCARD))
3714 goto pinit;
3715
3716 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3717 if (!buf)
3718 goto pinit;
3719 4059
3720 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3721 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3722 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3723 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3724 */ 4064 }
3725 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3726 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3727 u64 header_owner = btrfs_header_owner(buf);
3728 u64 header_transid = btrfs_header_generation(buf);
3729 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3730 header_transid == trans->transid &&
3731 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3732 *must_clean = buf;
3733 return 1;
3734 } 4067 }
3735 btrfs_tree_unlock(buf);
3736 } 4068 }
3737 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3738pinit:
3739 if (path)
3740 btrfs_set_path_blocking(path);
3741 /* unlocks the pinned mutex */
3742 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3743 4070
3744 BUG_ON(err < 0);
3745 return 0; 4071 return 0;
3746} 4072}
3747 4073
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3902 BUG_ON(ret); 4228 BUG_ON(ret);
3903 } 4229 }
3904 } else { 4230 } else {
3905 int mark_free = 0;
3906 struct extent_buffer *must_clean = NULL;
3907
3908 if (found_extent) { 4231 if (found_extent) {
3909 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3910 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3917 } 4240 }
3918 } 4241 }
3919 4242
3920 ret = pin_down_bytes(trans, root, path, bytenr,
3921 num_bytes, is_data, 0, &must_clean);
3922 if (ret > 0)
3923 mark_free = 1;
3924 BUG_ON(ret < 0);
3925 /*
3926 * it is going to be very rare for someone to be waiting
3927 * on the block we're freeing. del_items might need to
3928 * schedule, so rather than get fancy, just force it
3929 * to blocking here
3930 */
3931 if (must_clean)
3932 btrfs_set_lock_blocking(must_clean);
3933
3934 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3935 num_to_del); 4244 num_to_del);
3936 BUG_ON(ret); 4245 BUG_ON(ret);
3937 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3938 4247
3939 if (must_clean) {
3940 clean_tree_block(NULL, root, must_clean);
3941 btrfs_tree_unlock(must_clean);
3942 free_extent_buffer(must_clean);
3943 }
3944
3945 if (is_data) { 4248 if (is_data) {
3946 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3947 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3951 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3952 } 4255 }
3953 4256
3954 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3955 mark_free);
3956 BUG_ON(ret); 4258 BUG_ON(ret);
3957 } 4259 }
3958 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3960} 4262}
3961 4263
3962/* 4264/*
3963 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3964 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3965 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3966 * removes it from the tree. 4268 * removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3972 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3973 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3974 struct rb_node *node; 4276 struct rb_node *node;
3975 int ret; 4277 int ret = 0;
3976 4278
3977 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3978 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4024 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4025 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4026 4328
4027 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4028 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4029 head->must_insert_reserved); 4331 ret = 1;
4030 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4031 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4032 return 0; 4335 return ret;
4033out: 4336out:
4034 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4035 return 0; 4338 return 0;
4036} 4339}
4037 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 if (block_rsv->space_info != cache->space_info)
4364 goto out;
4365
4366 if (btrfs_header_generation(buf) == trans->transid) {
4367 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4368 ret = check_ref_cleanup(trans, root, buf->start);
4369 if (!ret)
4370 goto pin;
4371 }
4372
4373 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4374 pin_down_extent(root, cache, buf->start, buf->len, 1);
4375 goto pin;
4376 }
4377
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379
4380 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) {
4383 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out;
4386 }
4387
4388 ret = 1;
4389 spin_lock(&block_rsv->lock);
4390 if (block_rsv->reserved < block_rsv->size) {
4391 block_rsv->reserved += buf->len;
4392 ret = 0;
4393 }
4394 spin_unlock(&block_rsv->lock);
4395
4396 if (ret) {
4397 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len;
4399 spin_unlock(&cache->space_info->lock);
4400 }
4401 goto out;
4402 }
4403pin:
4404 if (block_rsv->durable && !cache->ro) {
4405 ret = 0;
4406 spin_lock(&cache->lock);
4407 if (!cache->ro) {
4408 cache->reserved_pinned += buf->len;
4409 ret = 1;
4410 }
4411 spin_unlock(&cache->lock);
4412
4413 if (ret) {
4414 spin_lock(&block_rsv->lock);
4415 block_rsv->freed[trans->transid & 0x1] += buf->len;
4416 spin_unlock(&block_rsv->lock);
4417 }
4418 }
4419out:
4420 btrfs_put_block_group(cache);
4421}
4422
4038int btrfs_free_extent(struct btrfs_trans_handle *trans, 4423int btrfs_free_extent(struct btrfs_trans_handle *trans,
4039 struct btrfs_root *root, 4424 struct btrfs_root *root,
4040 u64 bytenr, u64 num_bytes, u64 parent, 4425 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4056 parent, root_objectid, (int)owner, 4441 parent, root_objectid, (int)owner,
4057 BTRFS_DROP_DELAYED_REF, NULL); 4442 BTRFS_DROP_DELAYED_REF, NULL);
4058 BUG_ON(ret); 4443 BUG_ON(ret);
4059 ret = check_ref_cleanup(trans, root, bytenr);
4060 BUG_ON(ret);
4061 } else { 4444 } else {
4062 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4445 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4063 parent, root_objectid, owner, 4446 parent, root_objectid, owner,
@@ -4067,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4067 return ret; 4450 return ret;
4068} 4451}
4069 4452
4070int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4071 struct btrfs_root *root,
4072 u64 bytenr, u32 blocksize,
4073 u64 parent, u64 root_objectid, int level)
4074{
4075 u64 used;
4076 spin_lock(&root->node_lock);
4077 used = btrfs_root_used(&root->root_item) - blocksize;
4078 btrfs_set_root_used(&root->root_item, used);
4079 spin_unlock(&root->node_lock);
4080
4081 return btrfs_free_extent(trans, root, bytenr, blocksize,
4082 parent, root_objectid, level, 0);
4083}
4084
4085static u64 stripe_align(struct btrfs_root *root, u64 val) 4453static u64 stripe_align(struct btrfs_root *root, u64 val)
4086{ 4454{
4087 u64 mask = ((u64)root->stripesize - 1); 4455 u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4134 return 0; 4502 return 0;
4135} 4503}
4136 4504
4505static int get_block_group_index(struct btrfs_block_group_cache *cache)
4506{
4507 int index;
4508 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4509 index = 0;
4510 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4511 index = 1;
4512 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4513 index = 2;
4514 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4515 index = 3;
4516 else
4517 index = 4;
4518 return index;
4519}
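
The index orders the new per-space_info block group lists roughly from most to least redundant: 0 = RAID10, 1 = RAID1, 2 = DUP, 3 = RAID0, 4 = single (no RAID flag set). find_free_extent walks the list for one index at a time and, near the end of the search, falls through to the next list via "if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search;", so an allocation only spills into a less redundant group once the more redundant ones are exhausted.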
4520
4137enum btrfs_loop_type { 4521enum btrfs_loop_type {
4138 LOOP_FIND_IDEAL = 0, 4522 LOOP_FIND_IDEAL = 0,
4139 LOOP_CACHING_NOWAIT = 1, 4523 LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4155 u64 num_bytes, u64 empty_size, 4539 u64 num_bytes, u64 empty_size,
4156 u64 search_start, u64 search_end, 4540 u64 search_start, u64 search_end,
4157 u64 hint_byte, struct btrfs_key *ins, 4541 u64 hint_byte, struct btrfs_key *ins,
4158 u64 exclude_start, u64 exclude_nr,
4159 int data) 4542 int data)
4160{ 4543{
4161 int ret = 0; 4544 int ret = 0;
@@ -4168,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4168 struct btrfs_space_info *space_info; 4551 struct btrfs_space_info *space_info;
4169 int last_ptr_loop = 0; 4552 int last_ptr_loop = 0;
4170 int loop = 0; 4553 int loop = 0;
4554 int index = 0;
4171 bool found_uncached_bg = false; 4555 bool found_uncached_bg = false;
4172 bool failed_cluster_refill = false; 4556 bool failed_cluster_refill = false;
4173 bool failed_alloc = false; 4557 bool failed_alloc = false;
@@ -4237,6 +4621,7 @@ ideal_cache:
4237 btrfs_put_block_group(block_group); 4621 btrfs_put_block_group(block_group);
4238 up_read(&space_info->groups_sem); 4622 up_read(&space_info->groups_sem);
4239 } else { 4623 } else {
4624 index = get_block_group_index(block_group);
4240 goto have_block_group; 4625 goto have_block_group;
4241 } 4626 }
4242 } else if (block_group) { 4627 } else if (block_group) {
@@ -4245,7 +4630,8 @@ ideal_cache:
4245 } 4630 }
4246search: 4631search:
4247 down_read(&space_info->groups_sem); 4632 down_read(&space_info->groups_sem);
4248 list_for_each_entry(block_group, &space_info->block_groups, list) { 4633 list_for_each_entry(block_group, &space_info->block_groups[index],
4634 list) {
4249 u64 offset; 4635 u64 offset;
4250 int cached; 4636 int cached;
4251 4637
@@ -4436,23 +4822,22 @@ checks:
4436 goto loop; 4822 goto loop;
4437 } 4823 }
4438 4824
4439 if (exclude_nr > 0 && 4825 ins->objectid = search_start;
4440 (search_start + num_bytes > exclude_start && 4826 ins->offset = num_bytes;
4441 search_start < exclude_start + exclude_nr)) { 4827
4442 search_start = exclude_start + exclude_nr; 4828 if (offset < search_start)
4829 btrfs_add_free_space(block_group, offset,
4830 search_start - offset);
4831 BUG_ON(offset > search_start);
4443 4832
4833 ret = update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) {
4444 btrfs_add_free_space(block_group, offset, num_bytes); 4836 btrfs_add_free_space(block_group, offset, num_bytes);
4445 /*
4446 * if search_start is still in this block group
4447 * then we just re-search this block group
4448 */
4449 if (search_start >= block_group->key.objectid &&
4450 search_start < (block_group->key.objectid +
4451 block_group->key.offset))
4452 goto have_block_group;
4453 goto loop; 4837 goto loop;
4454 } 4838 }
4455 4839
4840 /* we are all good, lets return */
4456 ins->objectid = search_start; 4841 ins->objectid = search_start;
4457 ins->offset = num_bytes; 4842 ins->offset = num_bytes;
4458 4843
@@ -4460,18 +4845,18 @@ checks:
4460 btrfs_add_free_space(block_group, offset, 4845 btrfs_add_free_space(block_group, offset,
4461 search_start - offset); 4846 search_start - offset);
4462 BUG_ON(offset > search_start); 4847 BUG_ON(offset > search_start);
4463
4464 update_reserved_extents(block_group, num_bytes, 1);
4465
4466 /* we are all good, lets return */
4467 break; 4848 break;
4468loop: 4849loop:
4469 failed_cluster_refill = false; 4850 failed_cluster_refill = false;
4470 failed_alloc = false; 4851 failed_alloc = false;
4852 BUG_ON(index != get_block_group_index(block_group));
4471 btrfs_put_block_group(block_group); 4853 btrfs_put_block_group(block_group);
4472 } 4854 }
4473 up_read(&space_info->groups_sem); 4855 up_read(&space_info->groups_sem);
4474 4856
4857 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4858 goto search;
4859
4475 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4860 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4476 * for them to make caching progress. Also 4861 * for them to make caching progress. Also
4477 * determine the best possible bg to cache 4862 * determine the best possible bg to cache
@@ -4485,6 +4870,7 @@ loop:
4485 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4486 (found_uncached_bg || empty_size || empty_cluster || 4871 (found_uncached_bg || empty_size || empty_cluster ||
4487 allowed_chunk_alloc)) { 4872 allowed_chunk_alloc)) {
4873 index = 0;
4488 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4489 found_uncached_bg = false; 4875 found_uncached_bg = false;
4490 loop++; 4876 loop++;
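The hunks above convert the allocator's single block-group list into per-RAID-type buckets: the search walks space_info->block_groups[index] and, when nothing turns up, bumps index and jumps back to the search label until BTRFS_NR_RAID_TYPES buckets are exhausted. A minimal userspace sketch of that retry shape; find_free() and buckets[] are illustrative stand-ins, not kernel symbols:

    /*
     * Userspace model of the bucketed retry above; sizes are made up.
     */
    #include <stdio.h>

    #define NR_RAID_TYPES 5   /* stands in for BTRFS_NR_RAID_TYPES */

    /* one free-byte counter stands in for a whole list of block groups */
    static unsigned long long buckets[NR_RAID_TYPES] = { 0, 0, 4096, 0, 0 };

    static int find_free(unsigned long long need, int index)
    {
        for (; index < NR_RAID_TYPES; index++) {   /* ++index < BTRFS_NR_RAID_TYPES */
            if (buckets[index] >= need) {
                buckets[index] -= need;            /* "ins" filled in, search done */
                return index;
            }
        }
        return -1;   /* every bucket exhausted: caller falls back to chunk alloc */
    }

    int main(void)
    {
        printf("allocated from bucket %d\n", find_free(1024, 0));
        return 0;
    }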
@@ -4567,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4567 int dump_block_groups) 4953 int dump_block_groups)
4568{ 4954{
4569 struct btrfs_block_group_cache *cache; 4955 struct btrfs_block_group_cache *cache;
4956 int index = 0;
4570 4957
4571 spin_lock(&info->lock); 4958 spin_lock(&info->lock);
4572 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4959 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4573 (unsigned long long)(info->total_bytes - info->bytes_used - 4960 (unsigned long long)(info->total_bytes - info->bytes_used -
4574 info->bytes_pinned - info->bytes_reserved - 4961 info->bytes_pinned - info->bytes_reserved -
4575 info->bytes_super), 4962 info->bytes_readonly),
4576 (info->full) ? "" : "not "); 4963 (info->full) ? "" : "not ");
4577 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4964 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4578 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4965 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4579 "\n",
4580 (unsigned long long)info->total_bytes, 4966 (unsigned long long)info->total_bytes,
4967 (unsigned long long)info->bytes_used,
4581 (unsigned long long)info->bytes_pinned, 4968 (unsigned long long)info->bytes_pinned,
4582 (unsigned long long)info->bytes_delalloc, 4969 (unsigned long long)info->bytes_reserved,
4583 (unsigned long long)info->bytes_may_use, 4970 (unsigned long long)info->bytes_may_use,
4584 (unsigned long long)info->bytes_used, 4971 (unsigned long long)info->bytes_readonly);
4585 (unsigned long long)info->bytes_root,
4586 (unsigned long long)info->bytes_super,
4587 (unsigned long long)info->bytes_reserved);
4588 spin_unlock(&info->lock); 4972 spin_unlock(&info->lock);
4589 4973
4590 if (!dump_block_groups) 4974 if (!dump_block_groups)
4591 return; 4975 return;
4592 4976
4593 down_read(&info->groups_sem); 4977 down_read(&info->groups_sem);
4594 list_for_each_entry(cache, &info->block_groups, list) { 4978again:
4979 list_for_each_entry(cache, &info->block_groups[index], list) {
4595 spin_lock(&cache->lock); 4980 spin_lock(&cache->lock);
4596 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4981 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4597 "%llu pinned %llu reserved\n", 4982 "%llu pinned %llu reserved\n",
@@ -4603,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4603 btrfs_dump_free_space(cache, bytes); 4988 btrfs_dump_free_space(cache, bytes);
4604 spin_unlock(&cache->lock); 4989 spin_unlock(&cache->lock);
4605 } 4990 }
4991 if (++index < BTRFS_NR_RAID_TYPES)
4992 goto again;
4606 up_read(&info->groups_sem); 4993 up_read(&info->groups_sem);
4607} 4994}
4608 4995
@@ -4628,9 +5015,8 @@ again:
4628 5015
4629 WARN_ON(num_bytes < root->sectorsize); 5016 WARN_ON(num_bytes < root->sectorsize);
4630 ret = find_free_extent(trans, root, num_bytes, empty_size, 5017 ret = find_free_extent(trans, root, num_bytes, empty_size,
4631 search_start, search_end, hint_byte, ins, 5018 search_start, search_end, hint_byte,
4632 trans->alloc_exclude_start, 5019 ins, data);
4633 trans->alloc_exclude_nr, data);
4634 5020
4635 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5021 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4636 num_bytes = num_bytes >> 1; 5022 num_bytes = num_bytes >> 1;
@@ -4668,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4668 ret = btrfs_discard_extent(root, start, len); 5054 ret = btrfs_discard_extent(root, start, len);
4669 5055
4670 btrfs_add_free_space(cache, start, len); 5056 btrfs_add_free_space(cache, start, len);
4671 update_reserved_extents(cache, len, 0); 5057 update_reserved_bytes(cache, len, 0, 1);
4672 btrfs_put_block_group(cache); 5058 btrfs_put_block_group(cache);
4673 5059
4674 return ret; 5060 return ret;
@@ -4731,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4731 btrfs_mark_buffer_dirty(path->nodes[0]); 5117 btrfs_mark_buffer_dirty(path->nodes[0]);
4732 btrfs_free_path(path); 5118 btrfs_free_path(path);
4733 5119
4734 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5120 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4735 1, 0);
4736 if (ret) { 5121 if (ret) {
4737 printk(KERN_ERR "btrfs update block group failed for %llu " 5122 printk(KERN_ERR "btrfs update block group failed for %llu "
4738 "%llu\n", (unsigned long long)ins->objectid, 5123 "%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4792 btrfs_mark_buffer_dirty(leaf); 5177 btrfs_mark_buffer_dirty(leaf);
4793 btrfs_free_path(path); 5178 btrfs_free_path(path);
4794 5179
4795 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5180 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4796 1, 0);
4797 if (ret) { 5181 if (ret) {
4798 printk(KERN_ERR "btrfs update block group failed for %llu " 5182 printk(KERN_ERR "btrfs update block group failed for %llu "
4799 "%llu\n", (unsigned long long)ins->objectid, 5183 "%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4869 put_caching_control(caching_ctl); 5253 put_caching_control(caching_ctl);
4870 } 5254 }
4871 5255
4872 update_reserved_extents(block_group, ins->offset, 1); 5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret);
4873 btrfs_put_block_group(block_group); 5258 btrfs_put_block_group(block_group);
4874 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4875 0, owner, offset, ins, 1); 5260 0, owner, offset, ins, 1);
4876 return ret; 5261 return ret;
4877} 5262}
4878 5263
4879/*
4880 * finds a free extent and does all the dirty work required for allocation
4881 * returns the key for the extent through ins, and a tree buffer for
4882 * the first block of the extent through buf.
4883 *
4884 * returns 0 if everything worked, non-zero otherwise.
4885 */
4886static int alloc_tree_block(struct btrfs_trans_handle *trans,
4887 struct btrfs_root *root,
4888 u64 num_bytes, u64 parent, u64 root_objectid,
4889 struct btrfs_disk_key *key, int level,
4890 u64 empty_size, u64 hint_byte, u64 search_end,
4891 struct btrfs_key *ins)
4892{
4893 int ret;
4894 u64 flags = 0;
4895
4896 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4897 empty_size, hint_byte, search_end,
4898 ins, 0);
4899 if (ret)
4900 return ret;
4901
4902 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4903 if (parent == 0)
4904 parent = ins->objectid;
4905 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4906 } else
4907 BUG_ON(parent > 0);
4908
4909 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4910 struct btrfs_delayed_extent_op *extent_op;
4911 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4912 BUG_ON(!extent_op);
4913 if (key)
4914 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4915 else
4916 memset(&extent_op->key, 0, sizeof(extent_op->key));
4917 extent_op->flags_to_set = flags;
4918 extent_op->update_key = 1;
4919 extent_op->update_flags = 1;
4920 extent_op->is_data = 0;
4921
4922 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4923 ins->offset, parent, root_objectid,
4924 level, BTRFS_ADD_DELAYED_EXTENT,
4925 extent_op);
4926 BUG_ON(ret);
4927 }
4928
4929 if (root_objectid == root->root_key.objectid) {
4930 u64 used;
4931 spin_lock(&root->node_lock);
4932 used = btrfs_root_used(&root->root_item) + num_bytes;
4933 btrfs_set_root_used(&root->root_item, used);
4934 spin_unlock(&root->node_lock);
4935 }
4936 return ret;
4937}
4938
4939struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5264struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4940 struct btrfs_root *root, 5265 struct btrfs_root *root,
4941 u64 bytenr, u32 blocksize, 5266 u64 bytenr, u32 blocksize,
@@ -4974,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4974 return buf; 5299 return buf;
4975} 5300}
4976 5301
5302static struct btrfs_block_rsv *
5303use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize)
5305{
5306 struct btrfs_block_rsv *block_rsv;
5307 int ret;
5308
5309 block_rsv = get_block_rsv(trans, root);
5310
5311 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize);
5313 if (ret)
5314 return ERR_PTR(ret);
5315 return block_rsv;
5316 }
5317
5318 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret)
5320 return block_rsv;
5321
5322 WARN_ON(1);
 5323 printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 block_rsv->size, block_rsv->reserved,
5325 block_rsv->freed[0], block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC);
5328}
5329
5330static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5331{
5332 block_rsv_add_bytes(block_rsv, blocksize, 0);
5333 block_rsv_release_bytes(block_rsv, NULL, 0);
5334}
5335
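use_block_rsv(), added above, implements a two-sided policy: an unsized reserve pulls metadata bytes on demand, while a sized one hands out bytes it already holds and fails with -ENOSPC (after a warning) when it runs dry; unuse_block_rsv() returns the bytes on the error path. A hedged userspace model of that policy, with struct rsv and the helper names invented for illustration:

    #include <stdio.h>

    struct rsv { unsigned long long size, reserved; };

    static int rsv_use(struct rsv *r, unsigned long long n)
    {
        if (r->size == 0) {           /* unsized rsv: reserve on demand */
            r->reserved += n;         /* models reserve_metadata_bytes() */
            return 0;
        }
        if (r->reserved < n)
            return -1;                /* kernel path warns and returns -ENOSPC */
        r->reserved -= n;             /* models block_rsv_use_bytes() */
        return 0;
    }

    static void rsv_unuse(struct rsv *r, unsigned long long n)
    {
        r->reserved += n;             /* give the blocksize back on failure */
    }

    int main(void)
    {
        struct rsv r = { .size = 8192, .reserved = 8192 };
        if (rsv_use(&r, 4096) == 0)
            printf("reserved left: %llu\n", r.reserved);
        rsv_unuse(&r, 4096);
        return 0;
    }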
4977/* 5336/*
4978 * helper function to allocate a block for a given tree 5337 * finds a free extent and does all the dirty work required for allocation
5338 * returns the key for the extent through ins, and a tree buffer for
5339 * the first block of the extent through buf.
5340 *
4979 * returns the tree buffer or NULL. 5341 * returns the tree buffer or NULL.
4980 */ 5342 */
4981struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5343struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4985 u64 hint, u64 empty_size) 5347 u64 hint, u64 empty_size)
4986{ 5348{
4987 struct btrfs_key ins; 5349 struct btrfs_key ins;
4988 int ret; 5350 struct btrfs_block_rsv *block_rsv;
4989 struct extent_buffer *buf; 5351 struct extent_buffer *buf;
5352 u64 flags = 0;
5353 int ret;
5354
4990 5355
4991 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5356 block_rsv = use_block_rsv(trans, root, blocksize);
4992 key, level, empty_size, hint, (u64)-1, &ins); 5357 if (IS_ERR(block_rsv))
5358 return ERR_CAST(block_rsv);
5359
5360 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5361 empty_size, hint, (u64)-1, &ins, 0);
4993 if (ret) { 5362 if (ret) {
4994 BUG_ON(ret > 0); 5363 unuse_block_rsv(block_rsv, blocksize);
4995 return ERR_PTR(ret); 5364 return ERR_PTR(ret);
4996 } 5365 }
4997 5366
4998 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5367 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4999 blocksize, level); 5368 blocksize, level);
5369 BUG_ON(IS_ERR(buf));
5370
5371 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5372 if (parent == 0)
5373 parent = ins.objectid;
5374 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5375 } else
5376 BUG_ON(parent > 0);
5377
5378 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5379 struct btrfs_delayed_extent_op *extent_op;
5380 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5381 BUG_ON(!extent_op);
5382 if (key)
5383 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5384 else
5385 memset(&extent_op->key, 0, sizeof(extent_op->key));
5386 extent_op->flags_to_set = flags;
5387 extent_op->update_key = 1;
5388 extent_op->update_flags = 1;
5389 extent_op->is_data = 0;
5390
5391 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5392 ins.offset, parent, root_objectid,
5393 level, BTRFS_ADD_DELAYED_EXTENT,
5394 extent_op);
5395 BUG_ON(ret);
5396 }
5000 return buf; 5397 return buf;
5001} 5398}
5002 5399
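With alloc_tree_block() removed, btrfs_alloc_free_block() above now owns the whole sequence: draw from the block reserve, attempt the extent allocation, and hand the reservation back before erroring out if the allocation fails. The shape of that unwind, modeled in userspace with stand-in functions rather than the kernel API:

    #include <stdio.h>

    static int reserve(int n)      { return n <= 4096 ? 0 : -1; }  /* stand-in */
    static void unreserve(int n)   { (void)n; }                    /* stand-in */
    static int alloc_extent(int n) { return n % 2 ? -1 : 0; }      /* stand-in */

    static int alloc_block(int blocksize)
    {
        int ret = reserve(blocksize);
        if (ret)
            return ret;                   /* nothing to unwind yet */

        ret = alloc_extent(blocksize);
        if (ret) {
            unreserve(blocksize);         /* mirrors unuse_block_rsv() */
            return ret;
        }
        return 0;                         /* buffer init + delayed ref follow */
    }

    int main(void)
    {
        printf("alloc_block(4096) = %d\n", alloc_block(4096));
        return 0;
    }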
@@ -5321,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5321 struct btrfs_path *path, 5718 struct btrfs_path *path,
5322 struct walk_control *wc) 5719 struct walk_control *wc)
5323{ 5720{
5324 int ret = 0; 5721 int ret;
5325 int level = wc->level; 5722 int level = wc->level;
5326 struct extent_buffer *eb = path->nodes[level]; 5723 struct extent_buffer *eb = path->nodes[level];
5327 u64 parent = 0; 5724 u64 parent = 0;
@@ -5399,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5399 btrfs_header_owner(path->nodes[level + 1])); 5796 btrfs_header_owner(path->nodes[level + 1]));
5400 } 5797 }
5401 5798
5402 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5799 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5403 root->root_key.objectid, level, 0);
5404 BUG_ON(ret);
5405out: 5800out:
5406 wc->refs[level] = 0; 5801 wc->refs[level] = 0;
5407 wc->flags[level] = 0; 5802 wc->flags[level] = 0;
5408 return ret; 5803 return 0;
5409} 5804}
5410 5805
5411static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5806static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5483 * also make sure backrefs for the shared block and all lower level 5878 * also make sure backrefs for the shared block and all lower level
5484 * blocks are properly updated. 5879 * blocks are properly updated.
5485 */ 5880 */
5486int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5881int btrfs_drop_snapshot(struct btrfs_root *root,
5882 struct btrfs_block_rsv *block_rsv, int update_ref)
5487{ 5883{
5488 struct btrfs_path *path; 5884 struct btrfs_path *path;
5489 struct btrfs_trans_handle *trans; 5885 struct btrfs_trans_handle *trans;
@@ -5501,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5501 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5897 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5502 BUG_ON(!wc); 5898 BUG_ON(!wc);
5503 5899
5504 trans = btrfs_start_transaction(tree_root, 1); 5900 trans = btrfs_start_transaction(tree_root, 0);
5901 if (block_rsv)
5902 trans->block_rsv = block_rsv;
5505 5903
5506 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5904 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5507 level = btrfs_header_level(root->node); 5905 level = btrfs_header_level(root->node);
@@ -5589,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5589 } 5987 }
5590 5988
5591 BUG_ON(wc->level == 0); 5989 BUG_ON(wc->level == 0);
5592 if (trans->transaction->in_commit || 5990 if (btrfs_should_end_transaction(trans, tree_root)) {
5593 trans->transaction->delayed_refs.flushing) {
5594 ret = btrfs_update_root(trans, tree_root, 5991 ret = btrfs_update_root(trans, tree_root,
5595 &root->root_key, 5992 &root->root_key,
5596 root_item); 5993 root_item);
5597 BUG_ON(ret); 5994 BUG_ON(ret);
5598 5995
5599 btrfs_end_transaction(trans, tree_root); 5996 btrfs_end_transaction_throttle(trans, tree_root);
5600 trans = btrfs_start_transaction(tree_root, 1); 5997 trans = btrfs_start_transaction(tree_root, 0);
5601 } else { 5998 if (block_rsv)
5602 unsigned long update; 5999 trans->block_rsv = block_rsv;
5603 update = trans->delayed_ref_updates;
5604 trans->delayed_ref_updates = 0;
5605 if (update)
5606 btrfs_run_delayed_refs(trans, tree_root,
5607 update);
5608 } 6000 }
5609 } 6001 }
5610 btrfs_release_path(root, path); 6002 btrfs_release_path(root, path);
@@ -5632,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5632 kfree(root); 6024 kfree(root);
5633 } 6025 }
5634out: 6026out:
5635 btrfs_end_transaction(trans, tree_root); 6027 btrfs_end_transaction_throttle(trans, tree_root);
5636 kfree(wc); 6028 kfree(wc);
5637 btrfs_free_path(path); 6029 btrfs_free_path(path);
5638 return err; 6030 return err;
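The btrfs_drop_snapshot() hunks above swap the hand-rolled in_commit/flushing checks for btrfs_should_end_transaction(): the long tree walk periodically records its progress in the root item, ends the (throttled) transaction, and starts a fresh one with the caller's block_rsv reattached. A userspace model of that checkpoint-and-restart loop, with an arbitrary batch size standing in for the kernel's heuristic:

    #include <stdio.h>

    #define BATCH 100   /* stands in for btrfs_should_end_transaction() */

    int main(void)
    {
        int trans_items = 0;

        for (int item = 0; item < 1000; item++) {
            /* ... drop one tree block under the current transaction ... */
            if (++trans_items >= BATCH) {
                printf("checkpoint at item %d\n", item);  /* btrfs_update_root() */
                trans_items = 0;    /* end_transaction_throttle + start fresh */
            }
        }
        return 0;
    }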
@@ -7228,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7228 return flags; 7620 return flags;
7229} 7621}
7230 7622
7231static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7623static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7232 struct btrfs_block_group_cache *shrink_block_group,
7233 int force)
7234{ 7624{
7235 struct btrfs_trans_handle *trans; 7625 struct btrfs_space_info *sinfo = cache->space_info;
7236 u64 new_alloc_flags; 7626 u64 num_bytes;
7237 u64 calc; 7627 int ret = -ENOSPC;
7238 7628
7239 spin_lock(&shrink_block_group->lock); 7629 if (cache->ro)
7240 if (btrfs_block_group_used(&shrink_block_group->item) + 7630 return 0;
7241 shrink_block_group->reserved > 0) {
7242 spin_unlock(&shrink_block_group->lock);
7243 7631
7244 trans = btrfs_start_transaction(root, 1); 7632 spin_lock(&sinfo->lock);
7245 spin_lock(&shrink_block_group->lock); 7633 spin_lock(&cache->lock);
7634 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7635 cache->bytes_super - btrfs_block_group_used(&cache->item);
7636
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0;
7643 cache->ro = 1;
7644 ret = 0;
7645 }
7646 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock);
7648 return ret;
7649}
7246 7650
7247 new_alloc_flags = update_block_group_flags(root, 7651int btrfs_set_block_group_ro(struct btrfs_root *root,
7248 shrink_block_group->flags); 7652 struct btrfs_block_group_cache *cache)
7249 if (new_alloc_flags != shrink_block_group->flags) {
7250 calc =
7251 btrfs_block_group_used(&shrink_block_group->item);
7252 } else {
7253 calc = shrink_block_group->key.offset;
7254 }
7255 spin_unlock(&shrink_block_group->lock);
7256 7653
7257 do_chunk_alloc(trans, root->fs_info->extent_root, 7654{
7258 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7655 struct btrfs_trans_handle *trans;
7656 u64 alloc_flags;
7657 int ret;
7259 7658
7260 btrfs_end_transaction(trans, root); 7659 BUG_ON(cache->ro);
7261 } else 7660
7262 spin_unlock(&shrink_block_group->lock); 7661 trans = btrfs_join_transaction(root, 1);
7263 return 0; 7662 BUG_ON(IS_ERR(trans));
7264}
7265 7663
7664 alloc_flags = update_block_group_flags(root, cache->flags);
7665 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7266 7667
7267int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7668 ret = set_block_group_ro(cache);
7268 struct btrfs_block_group_cache *group) 7669 if (!ret)
7670 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7673 if (ret < 0)
7674 goto out;
7675 ret = set_block_group_ro(cache);
7676out:
7677 btrfs_end_transaction(trans, root);
7678 return ret;
7679}
7269 7680
7681int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache)
7270{ 7683{
7271 __alloc_chunk_for_shrink(root, group, 1); 7684 struct btrfs_space_info *sinfo = cache->space_info;
7272 set_block_group_readonly(group); 7685 u64 num_bytes;
7686
7687 BUG_ON(!cache->ro);
7688
7689 spin_lock(&sinfo->lock);
7690 spin_lock(&cache->lock);
7691 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7692 cache->bytes_super - btrfs_block_group_used(&cache->item);
7693 sinfo->bytes_readonly -= num_bytes;
7694 cache->ro = 0;
7695 spin_unlock(&cache->lock);
7696 spin_unlock(&sinfo->lock);
7273 return 0; 7697 return 0;
7274} 7698}
7275 7699
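set_block_group_ro() above is ultimately one accounting inequality: a group may go read-only only if everything the space_info has already committed, plus this group's unused space, still fits inside total_bytes. The same check in plain arithmetic, as a userspace model with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long total = 100, used = 40, reserved = 10, pinned = 5,
                           may_use = 5, readonly = 0;
        unsigned long long group_unused = 20;  /* key.offset - reserved - pinned - ... */

        if (used + reserved + pinned + may_use + readonly + group_unused < total) {
            readonly += group_unused;          /* sinfo->bytes_readonly += num_bytes */
            printf("group set RO, readonly=%llu\n", readonly);
        } else {
            printf("-ENOSPC: cannot set RO\n");
        }
        return 0;
    }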
@@ -7436,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7436 */ 7860 */
7437 synchronize_rcu(); 7861 synchronize_rcu();
7438 7862
7863 release_global_block_rsv(info);
7864
7439 while (!list_empty(&info->space_info)) { 7865
7440 space_info = list_entry(info->space_info.next, 7866 space_info = list_entry(info->space_info.next,
7441 struct btrfs_space_info, 7867 struct btrfs_space_info,
7442 list); 7868 list);
7443 7869 if (space_info->bytes_pinned > 0 ||
7870 space_info->bytes_reserved > 0) {
7871 WARN_ON(1);
7872 dump_space_info(space_info, 0, 0);
7873 }
7444 list_del(&space_info->list); 7874 list_del(&space_info->list);
7445 kfree(space_info); 7875 kfree(space_info);
7446 } 7876 }
7447 return 0; 7877 return 0;
7448} 7878}
7449 7879
7880static void __link_block_group(struct btrfs_space_info *space_info,
7881 struct btrfs_block_group_cache *cache)
7882{
7883 int index = get_block_group_index(cache);
7884
7885 down_write(&space_info->groups_sem);
7886 list_add_tail(&cache->list, &space_info->block_groups[index]);
7887 up_write(&space_info->groups_sem);
7888}
7889
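__link_block_group() above centralizes the insert-under-write-lock idiom: readers walk the per-index lists under the read side of groups_sem, so insertion takes the write side. A userspace analogue with pthreads; the names and the fixed-size arrays are illustrative only:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t groups_sem = PTHREAD_RWLOCK_INITIALIZER;
    static int groups[5][16];
    static int count[5];

    static void link_group(int index, int id)
    {
        pthread_rwlock_wrlock(&groups_sem);  /* down_write(&space_info->groups_sem) */
        groups[index][count[index]++] = id;  /* list_add_tail(&cache->list, ...) */
        pthread_rwlock_unlock(&groups_sem);  /* up_write(&space_info->groups_sem) */
    }

    int main(void)
    {
        link_group(3, 42);
        printf("bucket 3 has %d group(s)\n", count[3]);
        return 0;
    }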
7450int btrfs_read_block_groups(struct btrfs_root *root) 7890int btrfs_read_block_groups(struct btrfs_root *root)
7451{ 7891{
7452 struct btrfs_path *path; 7892 struct btrfs_path *path;
@@ -7468,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7468 7908
7469 while (1) { 7909 while (1) {
7470 ret = find_first_block_group(root, path, &key); 7910 ret = find_first_block_group(root, path, &key);
7471 if (ret > 0) { 7911 if (ret > 0)
7472 ret = 0; 7912 break;
7473 goto error;
7474 }
7475 if (ret != 0) 7913 if (ret != 0)
7476 goto error; 7914 goto error;
7477 7915
@@ -7480,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7480 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7918 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7481 if (!cache) { 7919 if (!cache) {
7482 ret = -ENOMEM; 7920 ret = -ENOMEM;
7483 break; 7921 goto error;
7484 } 7922 }
7485 7923
7486 atomic_set(&cache->count, 1); 7924 atomic_set(&cache->count, 1);
@@ -7537,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7537 BUG_ON(ret); 7975 BUG_ON(ret);
7538 cache->space_info = space_info; 7976 cache->space_info = space_info;
7539 spin_lock(&cache->space_info->lock); 7977 spin_lock(&cache->space_info->lock);
7540 cache->space_info->bytes_super += cache->bytes_super; 7978 cache->space_info->bytes_readonly += cache->bytes_super;
7541 spin_unlock(&cache->space_info->lock); 7979 spin_unlock(&cache->space_info->lock);
7542 7980
7543 down_write(&space_info->groups_sem); 7981 __link_block_group(space_info, cache);
7544 list_add_tail(&cache->list, &space_info->block_groups);
7545 up_write(&space_info->groups_sem);
7546 7982
7547 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7983 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7548 BUG_ON(ret); 7984 BUG_ON(ret);
7549 7985
7550 set_avail_alloc_bits(root->fs_info, cache->flags); 7986 set_avail_alloc_bits(root->fs_info, cache->flags);
7551 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7987 if (btrfs_chunk_readonly(root, cache->key.objectid))
7552 set_block_group_readonly(cache); 7988 set_block_group_ro(cache);
7553 } 7989 }
7990
7991 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7992 if (!(get_alloc_profile(root, space_info->flags) &
7993 (BTRFS_BLOCK_GROUP_RAID10 |
7994 BTRFS_BLOCK_GROUP_RAID1 |
7995 BTRFS_BLOCK_GROUP_DUP)))
7996 continue;
7997 /*
 7998 * avoid allocating from un-mirrored block groups if there are
7999 * mirrored block groups.
8000 */
8001 list_for_each_entry(cache, &space_info->block_groups[3], list)
8002 set_block_group_ro(cache);
8003 list_for_each_entry(cache, &space_info->block_groups[4], list)
8004 set_block_group_ro(cache);
8005 }
8006
8007 init_global_block_rsv(info);
7554 ret = 0; 8008 ret = 0;
7555error: 8009error:
7556 btrfs_free_path(path); 8010 btrfs_free_path(path);
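The post-scan pass added above flips whole buckets read-only when a mirrored profile (RAID1/RAID10/DUP) is present, so new allocations land on redundant storage; the code pins buckets 3 and 4 as the un-mirrored ones. A tiny sketch of that shape, without asserting which profiles those indices map to:

    #include <stdio.h>

    #define NR 5

    int main(void)
    {
        int ro[NR] = { 0 };
        int have_mirrored = 1;   /* RAID1/RAID10/DUP profile present */

        if (have_mirrored) {
            ro[3] = 1;           /* matches block_groups[3] above */
            ro[4] = 1;           /* matches block_groups[4] above */
        }
        printf("bucket 3 ro=%d, bucket 4 ro=%d\n", ro[3], ro[4]);
        return 0;
    }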
@@ -7611,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7611 BUG_ON(ret); 8065 BUG_ON(ret);
7612 8066
7613 spin_lock(&cache->space_info->lock); 8067 spin_lock(&cache->space_info->lock);
7614 cache->space_info->bytes_super += cache->bytes_super; 8068 cache->space_info->bytes_readonly += cache->bytes_super;
7615 spin_unlock(&cache->space_info->lock); 8069 spin_unlock(&cache->space_info->lock);
7616 8070
7617 down_write(&cache->space_info->groups_sem); 8071 __link_block_group(cache->space_info, cache);
7618 list_add_tail(&cache->list, &cache->space_info->block_groups);
7619 up_write(&cache->space_info->groups_sem);
7620 8072
7621 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8073 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7622 BUG_ON(ret); 8074 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
135 return state; 135 return state;
136} 136}
137 137
138static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
139{ 139{
140 if (!state) 140 if (!state)
141 return; 141 return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
335} 335}
336 336
337static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
338 struct extent_state *state, 338 struct extent_state *state, int *bits)
339 unsigned long bits)
340{ 339{
341 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
342 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
343 state->start, state->end, 342 state, bits);
344 state->state, bits);
345 } 343 }
346 344
347 return 0; 345 return 0;
348} 346}
349 347
350static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
351 struct extent_state *state, 349 struct extent_state *state, int *bits)
352 unsigned long bits)
353{ 350{
354 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
355 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
367 */ 364 */
368static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
369 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
370 int bits) 367 int *bits)
371{ 368{
372 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
373 int ret; 371 int ret;
374 372
375 if (end < start) { 373 if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
384 if (ret) 382 if (ret)
385 return ret; 383 return ret;
386 384
387 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
388 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
389 state->state |= bits; 387 state->state |= bits_to_set;
390 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
391 if (node) { 389 if (node) {
392 struct extent_state *found; 390 struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
456 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
457 */ 455 */
458static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
459 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
460 int delete) 458 int *bits, int wake)
461{ 459{
462 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
463 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
464 462
465 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
467 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
468 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
472 if (wake) 470 if (wake)
473 wake_up(&state->wq); 471 wake_up(&state->wq);
474 if (delete || state->state == 0) { 472 if (state->state == 0) {
475 if (state->tree) { 473 if (state->tree) {
476 clear_state_cb(tree, state, state->state);
477 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
478 state->tree = NULL; 475 state->tree = NULL;
479 free_extent_state(state); 476 free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
514 int set = 0; 511 int set = 0;
515 int clear = 0; 512 int clear = 0;
516 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
517 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518 clear = 1; 519 clear = 1;
519again: 520again:
@@ -580,8 +581,7 @@ hit_next:
580 if (err) 581 if (err)
581 goto out; 582 goto out;
582 if (state->end <= end) { 583 if (state->end <= end) {
583 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
584 delete);
585 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
586 goto out; 586 goto out;
587 start = last_end + 1; 587 start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
602 if (wake) 602 if (wake)
603 wake_up(&state->wq); 603 wake_up(&state->wq);
604 604
605 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
606 606
607 prealloc = NULL; 607 prealloc = NULL;
608 goto out; 608 goto out;
@@ -613,7 +613,7 @@ hit_next:
613 else 613 else
614 next_node = NULL; 614 next_node = NULL;
615 615
616 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
617 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
618 goto out; 618 goto out;
619 start = last_end + 1; 619 start = last_end + 1;
@@ -706,19 +706,19 @@ out:
706 706
707static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
708 struct extent_state *state, 708 struct extent_state *state,
709 int bits) 709 int *bits)
710{ 710{
711 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
712 713
713 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
714 if (ret) 715 if (ret)
715 return ret; 716 return ret;
716 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
717 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
719 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
720 } 720 }
721 state->state |= bits; 721 state->state |= bits_to_set;
722 722
723 return 0; 723 return 0;
724} 724}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
745 * [start, end] is inclusive. This takes the tree lock. 745
746 */ 746 */
747 747
748static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
750 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
751 gfp_t mask)
752{ 751{
753 struct extent_state *state; 752 struct extent_state *state;
754 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
757 u64 last_start; 756 u64 last_start;
758 u64 last_end; 757 u64 last_end;
759 758
759 bits |= EXTENT_FIRST_DELALLOC;
760again: 760again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
778 */ 778 */
779 node = tree_search(tree, start); 779 node = tree_search(tree, start);
780 if (!node) { 780 if (!node) {
781 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 782 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
784 goto out; 784 goto out;
@@ -802,7 +802,7 @@ hit_next:
802 goto out; 802 goto out;
803 } 803 }
804 804
805 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
806 if (err) 806 if (err)
807 goto out; 807 goto out;
808 808
@@ -852,7 +852,7 @@ hit_next:
852 if (err) 852 if (err)
853 goto out; 853 goto out;
854 if (state->end <= end) { 854 if (state->end <= end) {
855 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
856 if (err) 856 if (err)
857 goto out; 857 goto out;
858 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
877 else 877 else
878 this_end = last_start - 1; 878 this_end = last_start - 1;
879 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
880 bits); 880 &bits);
881 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
882 if (err) { 882 if (err) {
883 prealloc = NULL; 883 prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
903 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
905 905
906 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
907 if (err) { 907 if (err) {
908 prealloc = NULL; 908 prealloc = NULL;
909 goto out; 909 goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
966{ 966{
967 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
968 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
969 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
970 NULL, mask);
971} 970}
972 971
973int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1435 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1436 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1437 1436
1438 if (op & EXTENT_CLEAR_ACCOUNTING)
1439 clear_bits |= EXTENT_DO_ACCOUNTING;
1440
1441 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1442 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1916 1912
1917 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1918 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1919 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1920 else 1916 else
1921 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1922 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2020 sector_t sector; 2016 sector_t sector;
2021 struct extent_map *em; 2017 struct extent_map *em;
2022 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2023 int ret; 2020 int ret;
2024 int nr = 0; 2021 int nr = 0;
2025 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2031 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2032 2029
2033 end = page_end; 2030 end = page_end;
2034 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2035 2040
2036 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2037 char *userpage; 2042 char *userpage;
@@ -2589,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2589 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2594 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2590 }; 2595 };
2591 struct writeback_control wbc_writepages = { 2596 struct writeback_control wbc_writepages = {
2592 .bdi = wbc->bdi,
2593 .sync_mode = wbc->sync_mode, 2597 .sync_mode = wbc->sync_mode,
2594 .older_than_this = NULL, 2598 .older_than_this = NULL,
2595 .nr_to_write = 64, 2599 .nr_to_write = 64,
@@ -2623,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2623 .sync_io = mode == WB_SYNC_ALL, 2627 .sync_io = mode == WB_SYNC_ALL,
2624 }; 2628 };
2625 struct writeback_control wbc_writepages = { 2629 struct writeback_control wbc_writepages = {
2626 .bdi = inode->i_mapping->backing_dev_info,
2627 .sync_mode = mode, 2630 .sync_mode = mode,
2628 .older_than_this = NULL, 2631 .older_than_this = NULL,
2629 .nr_to_write = nr_pages * 2, 2632 .nr_to_write = nr_pages * 2,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
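The EXTENT_CTLBITS mask introduced above is the key to the int-to-pointer change in the .c hunks: callers now pass control flags (accounting, first-delalloc) mixed in with real state flags, and the store paths mask the control flags out before touching state->state. The masking in miniature, as a self-contained userspace program:

    #include <stdio.h>

    #define EXTENT_DIRTY           (1 << 0)
    #define EXTENT_DO_ACCOUNTING   (1 << 11)
    #define EXTENT_FIRST_DELALLOC  (1 << 12)
    #define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

    int main(void)
    {
        int bits = EXTENT_DIRTY | EXTENT_FIRST_DELALLOC;
        int bits_to_set = bits & ~EXTENT_CTLBITS;    /* as in set_state_bits() */

        printf("stored bits: 0x%x\n", bits_to_set);  /* only EXTENT_DIRTY lands */
        return 0;
    }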
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
176 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
177 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
178 180
181void free_extent_state(struct extent_state *state);
179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
180 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
181int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
185 gfp_t mask); 188 gfp_t mask);
186int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
187 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
188int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 195 gfp_t mask);
190int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
149} 149}
150 150
151 151
152int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
153 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
154{ 155{
155 u32 sum; 156 u32 sum;
156 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
157 int bio_index = 0; 158 int bio_index = 0;
158 u64 offset; 159 u64 offset = 0;
159 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
160 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
161 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
174 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
175 176
176 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
177 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
178 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
179 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
180 if (ret == 0) 184 if (ret == 0)
181 goto found; 185 goto found;
@@ -238,6 +242,7 @@ found:
238 else 242 else
239 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
240 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
241 bio_index++; 246 bio_index++;
242 bvec++; 247 bvec++;
243 } 248 }
@@ -245,6 +250,18 @@ found:
245 return 0; 250 return 0;
246} 251}
247 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
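The refactor above is the classic flag-taking-helper pattern: __btrfs_lookup_bio_sums() grows a dio flag plus a logical offset, and two thin entry points pin the flag so existing callers are untouched. The same pattern in a userspace model with made-up checksum logic:

    #include <stdio.h>

    static int lookup_sums(unsigned long long off, int dio)
    {
        /* dio callers supply the logical offset; buffered ones derive it per page */
        return dio ? (int)(off & 0xff) : 0;
    }

    static int lookup_bio_sums(void)                     { return lookup_sums(0, 0); }
    static int lookup_bio_sums_dio(unsigned long long o) { return lookup_sums(o, 1); }

    int main(void)
    {
        printf("%d %d\n", lookup_bio_sums(), lookup_bio_sums_dio(0x1234));
        return 0;
    }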
248int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
249 struct list_head *list) 266 struct list_head *list)
250{ 267{
@@ -657,6 +674,9 @@ again:
657 goto found; 674 goto found;
658 } 675 }
659 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
660 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
661 u32 item_size; 681 u32 item_size;
662 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
47 int write_bytes, 47 int write_bytes,
48 struct page **prepared_pages, 48 struct page **prepared_pages,
49 const char __user *buf) 49 struct iov_iter *i)
50{ 50{
51 long page_fault = 0; 51 size_t copied;
52 int i; 52 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 54
55 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
59 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
60 62
61 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
62 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
63 page_fault = __copy_from_user(page_address(page) + offset, 65
64 buf, count);
65 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
66 flush_dcache_page(page); 67 flush_dcache_page(page);
67 kunmap(page); 68 iov_iter_advance(i, copied);
68 buf += count; 69 write_bytes -= copied;
69 write_bytes -= count;
70 70
71 if (page_fault) 71 if (unlikely(copied == 0)) {
72 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
73 } 83 }
74 return page_fault ? -EFAULT : 0; 84 return 0;
75} 85}
76 86
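The btrfs_copy_from_user() rewrite above moves from a raw user-buffer walk to the iov_iter idiom: fault the source in, copy into the current page, advance the iterator by what was actually copied, and retry the same page on a short copy rather than failing. A userspace model of that loop structure (memcpy stands in for iov_iter_copy_from_user, so it never short-copies here):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SZ 8

    int main(void)
    {
        const char *src = "hello iov_iter world";
        char pages[3][PAGE_SZ + 1] = { "" };
        size_t left = strlen(src), done = 0;
        size_t pg = 0, offset = 0;

        while (left > 0) {
            size_t count = left < PAGE_SZ - offset ? left : PAGE_SZ - offset;
            memcpy(pages[pg] + offset, src + done, count); /* iov_iter_copy_from_user */
            done += count;                                 /* iov_iter_advance() */
            left -= count;
            if (offset + count < PAGE_SZ)                  /* short copy: same page */
                offset += count;
            else {
                pg++;                                      /* page filled: move on */
                offset = 0;
            }
        }
        printf("%s%s%s\n", pages[0], pages[1], pages[2]);
        return 0;
    }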
77/* 87/*
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
126 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
127 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
128 NULL); 138 NULL);
129 if (err) 139 BUG_ON(err);
130 return err;
131 140
132 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
133 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
142 * at this time. 151 * at this time.
143 */ 152 */
144 } 153 }
145 return err; 154 return 0;
146} 155}
147 156
148/* 157/*
@@ -823,45 +832,46 @@ again:
823 return 0; 832 return 0;
824} 833}
825 834
826static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
827 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
828{ 838{
829 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
830 loff_t start_pos; 846 loff_t start_pos;
831 ssize_t num_written = 0; 847 ssize_t num_written = 0;
832 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
833 int ret = 0; 851 int ret = 0;
834 struct inode *inode = fdentry(file)->d_inode;
835 struct btrfs_root *root = BTRFS_I(inode)->root;
836 struct page **pages = NULL;
837 int nrptrs; 852 int nrptrs;
838 struct page *pinned[2];
839 unsigned long first_index; 853 unsigned long first_index;
840 unsigned long last_index; 854 unsigned long last_index;
841 int will_write; 855 int will_write;
856 int buffered = 0;
842 857
843 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
844 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
845 860
846 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
847 PAGE_CACHE_SIZE / (sizeof(struct page *)));
848 pinned[0] = NULL; 861 pinned[0] = NULL;
849 pinned[1] = NULL; 862 pinned[1] = NULL;
850 863
851 pos = *ppos;
852 start_pos = pos; 864 start_pos = pos;
853 865
854 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
855 867
856 /* do the reserve before the mutex lock in case we have to do some
857 * flushing. We wouldn't deadlock, but this is more polite.
858 */
859 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
860 if (err)
861 goto out_nolock;
862
863 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
864 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
865 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
866 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
867 if (err) 877 if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
875 goto out; 885 goto out;
876 886
877 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
878 905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
879 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
880 929
881 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
882 start_pos = pos; 931 start_pos = pos;
883 932
884 BTRFS_I(inode)->sequence++;
885 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
886 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
887 935
888 /* 936 /*
889 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
900 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
901 } 949 }
902 } 950 }
903 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
904 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
905 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
906 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
911 } 959 }
912 } 960 }
913 961
914 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
915 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
916 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
917 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
918 offset); 966 offset);
919 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
920 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
922 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
923 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
924 972
925 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
926 if (ret) 974 if (ret)
927 goto out; 975 goto out;
928 976
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
930 pos, first_index, last_index, 978 pos, first_index, last_index,
931 write_bytes); 979 write_bytes);
932 if (ret) { 980 if (ret) {
933 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
934 write_bytes);
935 goto out; 982 goto out;
936 } 983 }
937 984
938 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
939 write_bytes, pages, buf); 986 write_bytes, pages, &i);
940 if (ret) { 987 if (ret == 0) {
941 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
942 write_bytes); 989 num_pages, pos, write_bytes);
943 btrfs_drop_pages(pages, num_pages);
944 goto out;
945 } 990 }
946 991
947 ret = dirty_and_release_pages(NULL, root, file, pages,
948 num_pages, pos, write_bytes);
949 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
950 if (ret) { 993 if (ret) {
951 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
952 write_bytes);
953 goto out; 995 goto out;
954 } 996 }
955 997
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
965 btrfs_throttle(root); 1007 btrfs_throttle(root);
966 } 1008 }
967 1009
968 buf += write_bytes;
969 count -= write_bytes;
970 pos += write_bytes; 1010 pos += write_bytes;
971 num_written += write_bytes; 1011 num_written += write_bytes;
972 1012
@@ -976,9 +1016,7 @@ out:
976 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
977 if (ret) 1017 if (ret)
978 err = ret; 1018 err = ret;
979 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
980 1019
981out_nolock:
982 kfree(pages); 1020 kfree(pages);
983 if (pinned[0]) 1021 if (pinned[0])
984 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
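The O_DIRECT hunks above establish a try-direct-then-fall-back flow: attempt the direct write, and if it covered only part of the range, finish with buffered writes and remember to invalidate the page cache for that span afterwards. The control flow modeled in userspace, with stand-in write functions:

    #include <stdio.h>

    static long direct_write(long count)   { return count / 2; }  /* partial stand-in */
    static long buffered_write(long count) { return count; }      /* stand-in */

    int main(void)
    {
        long count = 4096, written, buffered = 0;

        written = direct_write(count);
        if (written < count) {       /* num_written != count */
            buffered = 1;            /* invalidate cached pages afterwards */
            written += buffered_write(count - written);
        }
        printf("wrote %ld bytes, buffered fallback: %ld\n", written, buffered);
        return 0;
    }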
@@ -1008,7 +1046,7 @@ out_nolock:
1008 num_written = err; 1046 num_written = err;
1009 1047
1010 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1011 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1012 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1013 file->f_dentry); 1051 file->f_dentry);
1014 if (ret == 0) { 1052 if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
1023 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1024 } 1062 }
1025 } 1063 }
1026 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1027 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1028 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1029 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1063 * important optimization for directories because holding the mutex prevents 1101 * important optimization for directories because holding the mutex prevents
1064 * new operations on the dir while we write to disk. 1102 * new operations on the dir while we write to disk.
1065 */ 1103 */
1066int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1104int btrfs_sync_file(struct file *file, int datasync)
1067{ 1105{
1106 struct dentry *dentry = file->f_path.dentry;
1068 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1069 struct btrfs_root *root = BTRFS_I(inode)->root; 1108 struct btrfs_root *root = BTRFS_I(inode)->root;
1070 int ret = 0; 1109 int ret = 0;
@@ -1101,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1101 /* 1140 /*
1102	 * ok we haven't committed the transaction yet, let's do a commit	1141
1103 */ 1142 */
1104 if (file && file->private_data) 1143 if (file->private_data)
1105 btrfs_ioctl_trans_end(file); 1144 btrfs_ioctl_trans_end(file);
1106 1145
1107 trans = btrfs_start_transaction(root, 1); 1146 trans = btrfs_start_transaction(root, 0);
1108 if (!trans) { 1147 if (IS_ERR(trans)) {
1109 ret = -ENOMEM; 1148 ret = PTR_ERR(trans);
1110 goto out; 1149 goto out;
1111 } 1150 }
1112 1151
@@ -1151,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
1151 1190
1152static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1191static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1153{ 1192{
1154 vma->vm_ops = &btrfs_file_vm_ops; 1193 struct address_space *mapping = filp->f_mapping;
1194
1195 if (!mapping->a_ops->readpage)
1196 return -ENOEXEC;
1197
1155 file_accessed(filp); 1198 file_accessed(filp);
1199 vma->vm_ops = &btrfs_file_vm_ops;
1200 vma->vm_flags |= VM_CAN_NONLINEAR;
1201
1156 return 0; 1202 return 0;
1157} 1203}
1158 1204
1159const struct file_operations btrfs_file_operations = { 1205const struct file_operations btrfs_file_operations = {
1160 .llseek = generic_file_llseek, 1206 .llseek = generic_file_llseek,
1161 .read = do_sync_read, 1207 .read = do_sync_read,
1208 .write = do_sync_write,
1162 .aio_read = generic_file_aio_read, 1209 .aio_read = generic_file_aio_read,
1163 .splice_read = generic_file_splice_read, 1210 .splice_read = generic_file_splice_read,
1164 .write = btrfs_file_write, 1211 .aio_write = btrfs_file_aio_write,
1165 .mmap = btrfs_file_mmap, 1212 .mmap = btrfs_file_mmap,
1166 .open = generic_file_open, 1213 .open = generic_file_open,
1167 .release = btrfs_release_file, 1214 .release = btrfs_release_file,
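Note on the file.c hunks above: the write path replaces the split data/metadata reservation calls with the combined delalloc helpers. A minimal sketch of the reserve/release pairing these hunks establish, assuming only the helper names visible in the diff (copy_and_dirty() is a hypothetical stand-in for the copy and page-dirtying steps):

/*
 * Sketch, not kernel code: every successful
 * btrfs_delalloc_reserve_space() must be matched either by a
 * completed write or by btrfs_delalloc_release_space() on the
 * error paths, exactly as in the loop above.
 */
static int write_one_chunk(struct inode *inode, loff_t pos,
                           size_t write_bytes)
{
        int ret;

        ret = btrfs_delalloc_reserve_space(inode, write_bytes);
        if (ret)
                return ret;     /* nothing reserved yet, nothing to undo */

        ret = copy_and_dirty(inode, pos, write_bytes);  /* hypothetical */
        if (ret)
                btrfs_delalloc_release_space(inode, write_bytes);
        return ret;
}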
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
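The new btrfs_lookup_inode_ref() above has three outcomes: ERR_PTR() when the tree search itself fails, NULL when no backref with that name exists, and a pointer into the leaf otherwise. A hedged caller sketch (the surrounding variables are assumed):

struct btrfs_inode_ref *ref;

/* mod == 0: read-only lookup, no COW of the tree blocks */
ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
                             inode_objectid, dir_objectid, 0);
if (IS_ERR(ref))
        return PTR_ERR(ref);            /* the search itself failed */
if (!ref)
        return -ENOENT;                 /* no backref with that name */
index = btrfs_inode_ref_index(path->nodes[0], ref);
btrfs_release_path(root, path);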
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e3..c03864406af3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
252 inline_len, compressed_size, 252 inline_len, compressed_size,
253 compressed_pages); 253 compressed_pages);
254 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
255 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
256 return 0; 257 return 0;
257} 258}
@@ -414,6 +415,7 @@ again:
414 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
415 BUG_ON(!trans); 416 BUG_ON(!trans);
416 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
417 419
418	 /* let's try to make an inline extent */	420	 /* let's try to make an inline extent */
419 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
439 start, end, NULL, 441 start, end, NULL,
440 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
441 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
442 EXTENT_CLEAR_ACCOUNTING |
443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
444 445
445 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
697 return 0; 698 return 0;
698} 699}
699 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
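get_extent_allocation_hint() factors the extent-map scan out of cow_file_range(); note it takes the em_tree read lock itself, so callers reduce to two lines, as the later hunk shows:

/* the helper handles the em_tree locking internally */
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);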
700/* 733/*
701 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
702	 * the callbacks end up in this code. The basic idea is to	735
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
734 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
735 BUG_ON(!trans); 768 BUG_ON(!trans);
736 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
737 771
738 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
739 773
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
753 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
754 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
755 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
756 EXTENT_CLEAR_ACCOUNTING |
757 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
758 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
759 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
769 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
770 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
771 804
772 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
773 read_lock(&BTRFS_I(inode)->extent_tree.lock);
774 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
775 start, num_bytes);
776 if (em) {
777 /*
778 * if block start isn't an actual block number then find the
779 * first block in this inode and use that as a hint. If that
780 * block is also bogus then just don't worry about it.
781 */
782 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
783 free_extent_map(em);
784 em = search_extent_mapping(em_tree, 0, 0);
785 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
786 alloc_hint = em->block_start;
787 if (em)
788 free_extent_map(em);
789 } else {
790 alloc_hint = em->block_start;
791 free_extent_map(em);
792 }
793 }
794 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
795 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
796 807
797 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
1174 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1175 BUG_ON(ret); 1186 BUG_ON(ret);
1176 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1177 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1178 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1179 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1226} 1244}
1227 1245
1228static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1229 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1230{ 1248{
1249 /* not delalloc, ignore it */
1231 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1232 return 0; 1251 return 0;
1233 1252
1234 spin_lock(&BTRFS_I(inode)->accounting_lock); 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1235 BTRFS_I(inode)->outstanding_extents++;
1236 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1237
1238 return 0; 1254 return 0;
1239} 1255}
1240 1256
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1252 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1253 return 0; 1269 return 0;
1254 1270
1255 spin_lock(&BTRFS_I(inode)->accounting_lock); 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1256 BTRFS_I(inode)->outstanding_extents--;
1257 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1258
1259 return 0; 1272 return 0;
1260} 1273}
1261 1274
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1265 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1266 */ 1279 */
1267static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1268 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1269{ 1282{
1270 1283
1271 /* 1284 /*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1273	 * but in this case, we are only testing for the DELALLOC	1286
1274 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1275 */ 1288 */
1276 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1277 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1278 1292
1279 spin_lock(&BTRFS_I(inode)->accounting_lock); 1293 if (*bits & EXTENT_FIRST_DELALLOC)
1280 BTRFS_I(inode)->outstanding_extents++; 1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1281 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1295 else
1282 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1283 1297
1284 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1285 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1286 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1287 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1288 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1289 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1297 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1298 */ 1312 */
1299static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1300 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1301{ 1315{
1302 /* 1316 /*
1303 * set_bit and clear bit hooks normally require _irqsave/restore 1317 * set_bit and clear bit hooks normally require _irqsave/restore
1304	 * but in this case, we are only testing for the DELALLOC	1318
1305 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1306 */ 1320 */
1307 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1308 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1309 1324
1310 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1311 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1312 WARN_ON(!BTRFS_I(inode)->outstanding_extents); 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1313 BTRFS_I(inode)->outstanding_extents--; 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1314 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1329
1315 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1316 } 1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1317 1335
1318 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1319 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1320 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1321 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1322 "%llu %llu\n",
1323 (unsigned long long)
1324 state->end - state->start + 1,
1325 (unsigned long long)
1326 root->fs_info->delalloc_bytes);
1327 btrfs_delalloc_free_space(root, inode, (u64)-1);
1328 root->fs_info->delalloc_bytes = 0;
1329 BTRFS_I(inode)->delalloc_bytes = 0;
1330 } else {
1331 btrfs_delalloc_free_space(root, inode,
1332 state->end -
1333 state->start + 1);
1334 root->fs_info->delalloc_bytes -= state->end -
1335 state->start + 1;
1336 BTRFS_I(inode)->delalloc_bytes -= state->end -
1337 state->start + 1;
1338 }
1339 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1340 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1341 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
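Across the split/merge/set/clear hooks, outstanding_extents becomes an atomic_t and the new EXTENT_FIRST_DELALLOC bit stops the first extent of a reservation from being counted twice. A sketch of the counting rule the four hooks implement, using only names from the hunks:

/* illustration only: the balance kept by the hooks above */
static void account_extent(struct inode *inode, int *bits, int set)
{
        if (*bits & EXTENT_FIRST_DELALLOC) {
                /* the reservation itself already counted this extent */
                *bits &= ~EXTENT_FIRST_DELALLOC;
        } else if (set) {
                atomic_inc(&BTRFS_I(inode)->outstanding_extents);
        } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
                /* with DO_ACCOUNTING, clear_bit_hook releases the
                 * metadata reservation instead of decrementing */
                atomic_dec(&BTRFS_I(inode)->outstanding_extents);
        }
}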
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1384 */ 1385 */
1385static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1386 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1387 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1388{ 1390{
1389 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1390 int ret = 0; 1392 int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1403 * are inserted into the btree 1405 * are inserted into the btree
1404 */ 1406 */
1405static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1406 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1407{ 1410{
1408 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1409 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1414 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1415 */ 1418 */
1416static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1417 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1418{ 1422{
1419 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1420 int ret = 0; 1424 int ret = 0;
@@ -1425,7 +1429,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1425 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1429 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1426 BUG_ON(ret); 1430 BUG_ON(ret);
1427 1431
1428 if (!(rw & (1 << BIO_RW))) { 1432 if (!(rw & REQ_WRITE)) {
1429 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1433 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1430 return btrfs_submit_compressed_read(inode, bio, 1434 return btrfs_submit_compressed_read(inode, bio,
1431 mirror_num, bio_flags); 1435 mirror_num, bio_flags);
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1439 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1440 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1441 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1442 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1443 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1444 } 1449 }
1445 1450
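All three submit hooks gain a u64 bio_offset parameter, which btrfs_wq_submit_bio() now threads through to the async checksum workers. The resulting hook shape, as implied by the hunks (the typedef name is an assumption):

/* assumed shape of the submit hook after this change */
typedef int (*extent_submit_bio_hook_t)(struct inode *inode, int rw,
                                        struct bio *bio, int mirror_num,
                                        unsigned long bio_flags,
                                        u64 bio_offset);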
@@ -1520,6 +1525,7 @@ again:
1520 goto again; 1525 goto again;
1521 } 1526 }
1522 1527
1528 BUG();
1523 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1524 ClearPageChecked(page); 1530 ClearPageChecked(page);
1525out: 1531out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1650static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1651{ 1657{
1652 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1653 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1654 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1655 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1656 struct extent_state *cached_state = NULL; 1662 struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1668 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1669 if (!ret) { 1675 if (!ret) {
1670 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1671 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1672 BUG_ON(ret); 1680 BUG_ON(ret);
1673 btrfs_end_transaction(trans, root);
1674 } 1681 }
1675 goto out; 1682 goto out;
1676 } 1683 }
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1680 0, &cached_state, GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1681 1688
1682 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1683 1692
1684 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1685 compressed = 1; 1694 compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1712 &ordered_extent->list); 1721 &ordered_extent->list);
1713 1722
1714 /* this also removes the ordered extent from the tree */
1715 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1716 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1717 BUG_ON(ret); 1725 BUG_ON(ret);
1718 btrfs_end_transaction(trans, root);
1719out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1720 /* once for us */ 1730 /* once for us */
1721 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1722 /* once for the tree */ 1732 /* once for the tree */
@@ -1831,14 +1841,14 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1831 bio->bi_size = 0; 1841 bio->bi_size = 0;
1832 1842
1833 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 1843 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1834 if (failed_bio->bi_rw & (1 << BIO_RW)) 1844 if (failed_bio->bi_rw & REQ_WRITE)
1835 rw = WRITE; 1845 rw = WRITE;
1836 else 1846 else
1837 rw = READ; 1847 rw = READ;
1838 1848
1839 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1840 failrec->last_mirror, 1850 failrec->last_mirror,
1841 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1842 return 0; 1852 return 0;
1843} 1853}
1844 1854
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
1993} 2003}
1994 2004
1995/* 2005/*
	2006	 * calculate the extra metadata reservation needed when snapshotting a
	2007	 * subvolume that contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than it frees. So we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
2040
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
2087
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
	2094	 * This is called at transaction commit time. If there are no orphan
	2095	 * files in the subvolume, it removes the orphan item and frees the
	2096	 * block_rsv structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
1996 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
1997 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
	2126	 * NOTE: the caller of this function should reserve 5 units of metadata
	2127	 * before calling it.
1998 */ 2128 */
1999int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2000{ 2130{
2001 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2002 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2136
2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2003 2141
2004 spin_lock(&root->list_lock); 2142 spin_lock(&root->orphan_lock);
2143 if (!root->orphan_block_rsv) {
2144 root->orphan_block_rsv = block_rsv;
2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2148 }
2005 2149
2006 /* already on the orphan list, we're good */ 2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2007 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2008 spin_unlock(&root->list_lock); 2152#if 0
2009 return 0; 2153 /*
2154 * For proper ENOSPC handling, we should do orphan
2155 * cleanup when mounting. But this introduces backward
2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2010 } 2166 }
2011 2167
2012 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2013 2173
2014 spin_unlock(&root->list_lock); 2174 if (block_rsv)
2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2015 2176
2016 /* 2177 /* grab metadata reservation from transaction handle */
2017 * insert an orphan item to track this unlinked/truncated file 2178 if (reserve) {
2018 */ 2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2019 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2180 BUG_ON(ret);
2181 }
2020 2182
2021 return ret; 2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
	2189	 /* insert an orphan item to track that the subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2022} 2196}
2023 2197
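btrfs_orphan_add() above allocates the orphan block_rsv outside root->orphan_lock and installs it under the lock, discarding it if another task raced in first. The pattern in isolation (sketch; error handling elided):

struct btrfs_block_rsv *block_rsv = NULL;

if (!root->orphan_block_rsv)
        block_rsv = btrfs_alloc_block_rsv(root);        /* may sleep */

spin_lock(&root->orphan_lock);
if (!root->orphan_block_rsv) {
        root->orphan_block_rsv = block_rsv;             /* we installed it */
        block_rsv = NULL;
}
spin_unlock(&root->orphan_lock);

if (block_rsv)                                          /* lost the race */
        btrfs_free_block_rsv(root, block_rsv);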
2024/* 2198/*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2028int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2029{ 2203{
2030 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2031 int ret = 0; 2207 int ret = 0;
2032 2208
2033 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2034 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2035 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2036 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2037 return 0;
2038 } 2213 }
2039 2214
2040 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2041 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2042 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2043 return 0;
2044 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2045 2220
2046 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2047 2225
2048 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2049 2228
2050 return ret; 2229 return 0;
2051} 2230}
2052 2231
2053/* 2232/*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2064 struct inode *inode; 2243 struct inode *inode;
2065 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2066 2245
2067 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2068 return; 2247 return;
2069 2248
2070 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2117 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2118 found_key.offset = 0; 2297 found_key.offset = 0;
2119 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2120 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2121 break;
2122 2300
2123 /* 2301 /*
2124 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2125 * the proper thing when we hit it 2303 * the proper thing when we hit it
2126 */ 2304 */
2127 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2128 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2129 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2130 2308
2131 /* 2309 /*
	2132	 * if this is a bad inode, it means we actually succeeded in	2310
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2135 * do a destroy_inode 2313 * do a destroy_inode
2136 */ 2314 */
2137 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2138 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2139 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2140 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2141 iput(inode); 2319 iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2153 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2154 iput(inode); 2332 iput(inode);
2155 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2156 2346
2157 if (nr_unlink) 2347 if (nr_unlink)
2158 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2159 if (nr_truncate) 2349 if (nr_truncate)
2160 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2161
2162 btrfs_free_path(path);
2163} 2351}
2164 2352
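The cmpxchg() above replaces the old xchg(&root->clean_orphans, 0) test and guarantees the cleanup runs at most once per mount. The state machine in brief (sketch):

/* only the first caller sees 0 and proceeds */
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
        return;
/* ... scan the orphan items, iput() each inode ... */
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;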
2165/* 2353/*
@@ -2478,29 +2666,201 @@ out:
2478 return ret; 2666 return ret;
2479} 2667}
2480 2668
2481static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs = 1;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs, they do not always free space.
2696 * so in enospc case, we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2482{ 2701{
2483 struct btrfs_root *root;
2484 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2485 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2486 int ret; 2711 int ret;
2487 unsigned long nr = 0;
2488 2712
2489 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2490 2716
2491 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2492 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2493 * 1 for orphan 2719
	2494	 */	2720	 /* check if someone else holds a reference */
2495 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2496 if (ret) 2722 return ERR_PTR(-ENOSPC);
2497 return ret; 2723
2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2729
2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2498 2735
2499 trans = btrfs_start_transaction(root, 1); 2736 trans = btrfs_start_transaction(root, 0);
2500 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2501 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2502 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2741 }
2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2503 } 2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2504 2864
2505 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2506 2866
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2508 2868
2509 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2510 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2511 2872
2512 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2513 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2514 2877
2515 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2516 2879 __unlink_end_trans(trans, root);
2517 btrfs_end_transaction_throttle(trans, root);
2518 btrfs_unreserve_metadata_space(root, 6);
2519 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2520 return ret; 2881 return ret;
2521} 2882}
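__unlink_start_trans() first tries an ordinary ten-item reservation; only when that returns -ENOSPC does it fall back to proving, via commit-root lookups and check_path_shared(), that the unlink cannot COW shared blocks, then borrows the global reservation, serialized by fs_info->enospc_unlink. The caller pattern (sketch):

trans = __unlink_start_trans(dir, dentry);
if (IS_ERR(trans))
        return PTR_ERR(trans);  /* -ENOSPC if a shared path was found */

/* ... btrfs_unlink_inode(), btrfs_orphan_add() ... */

__unlink_end_trans(trans, root);        /* also clears enospc_unlink */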
@@ -2577,7 +2938,6 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2577 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2938 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2578 ret = btrfs_update_inode(trans, root, dir); 2939 ret = btrfs_update_inode(trans, root, dir);
2579 BUG_ON(ret); 2940 BUG_ON(ret);
2580 dir->i_sb->s_dirt = 1;
2581 2941
2582 btrfs_free_path(path); 2942 btrfs_free_path(path);
2583 return 0; 2943 return 0;
@@ -2587,7 +2947,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2587{ 2947{
2588 struct inode *inode = dentry->d_inode; 2948 struct inode *inode = dentry->d_inode;
2589 int err = 0; 2949 int err = 0;
2590 int ret;
2591 struct btrfs_root *root = BTRFS_I(dir)->root; 2950 struct btrfs_root *root = BTRFS_I(dir)->root;
2592 struct btrfs_trans_handle *trans; 2951 struct btrfs_trans_handle *trans;
2593 unsigned long nr = 0; 2952 unsigned long nr = 0;
@@ -2596,15 +2955,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2596 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2955 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2597 return -ENOTEMPTY; 2956 return -ENOTEMPTY;
2598 2957
2599 ret = btrfs_reserve_metadata_space(root, 5); 2958 trans = __unlink_start_trans(dir, dentry);
2600 if (ret) 2959 if (IS_ERR(trans))
2601 return ret;
2602
2603 trans = btrfs_start_transaction(root, 1);
2604 if (IS_ERR(trans)) {
2605 btrfs_unreserve_metadata_space(root, 5);
2606 return PTR_ERR(trans); 2960 return PTR_ERR(trans);
2607 }
2608 2961
2609 btrfs_set_trans_block_group(trans, dir); 2962 btrfs_set_trans_block_group(trans, dir);
2610 2963
@@ -2627,12 +2980,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2627 btrfs_i_size_write(inode, 0); 2980 btrfs_i_size_write(inode, 0);
2628out: 2981out:
2629 nr = trans->blocks_used; 2982 nr = trans->blocks_used;
2630 ret = btrfs_end_transaction_throttle(trans, root); 2983 __unlink_end_trans(trans, root);
2631 btrfs_unreserve_metadata_space(root, 5);
2632 btrfs_btree_balance_dirty(root, nr); 2984 btrfs_btree_balance_dirty(root, nr);
2633 2985
2634 if (ret && !err)
2635 err = ret;
2636 return err; 2986 return err;
2637} 2987}
2638 2988
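These hunks repeat one conversion: btrfs_reserve_metadata_space(root, N) followed by btrfs_start_transaction(root, 1) collapses into btrfs_start_transaction(root, N), which reserves space for N items itself and reports failure as ERR_PTR() rather than NULL. The resulting idiom:

trans = btrfs_start_transaction(root, 5);       /* reserve 5 items */
if (IS_ERR(trans))
        return PTR_ERR(trans);                  /* never NULL anymore */
/* ... modify the tree ... */
btrfs_end_transaction(trans, root);             /* drops the reservation */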
@@ -3029,6 +3379,7 @@ out:
3029 if (pending_del_nr) { 3379 if (pending_del_nr) {
3030 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3380 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3031 pending_del_nr); 3381 pending_del_nr);
3382 BUG_ON(ret);
3032 } 3383 }
3033 btrfs_free_path(path); 3384 btrfs_free_path(path);
3034 return err; 3385 return err;
@@ -3056,11 +3407,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3056 3407
3057 if ((offset & (blocksize - 1)) == 0) 3408 if ((offset & (blocksize - 1)) == 0)
3058 goto out; 3409 goto out;
3059 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3410 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3060 if (ret)
3061 goto out;
3062
3063 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3064 if (ret) 3411 if (ret)
3065 goto out; 3412 goto out;
3066 3413
@@ -3068,8 +3415,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3068again: 3415again:
3069 page = grab_cache_page(mapping, index); 3416 page = grab_cache_page(mapping, index);
3070 if (!page) { 3417 if (!page) {
3071 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3418 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3072 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3073 goto out; 3419 goto out;
3074 } 3420 }
3075 3421
@@ -3132,8 +3478,7 @@ again:
3132 3478
3133out_unlock: 3479out_unlock:
3134 if (ret) 3480 if (ret)
3135 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3481 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3136 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3137 unlock_page(page); 3482 unlock_page(page);
3138 page_cache_release(page); 3483 page_cache_release(page);
3139out: 3484out:
@@ -3145,7 +3490,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3145 struct btrfs_trans_handle *trans; 3490 struct btrfs_trans_handle *trans;
3146 struct btrfs_root *root = BTRFS_I(inode)->root; 3491 struct btrfs_root *root = BTRFS_I(inode)->root;
3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3492 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3148 struct extent_map *em; 3493 struct extent_map *em = NULL;
3149 struct extent_state *cached_state = NULL; 3494 struct extent_state *cached_state = NULL;
3150 u64 mask = root->sectorsize - 1; 3495 u64 mask = root->sectorsize - 1;
3151 u64 hole_start = (inode->i_size + mask) & ~mask; 3496 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3528,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3183 u64 hint_byte = 0; 3528 u64 hint_byte = 0;
3184 hole_size = last_byte - cur_offset; 3529 hole_size = last_byte - cur_offset;
3185 3530
3186 err = btrfs_reserve_metadata_space(root, 2); 3531 trans = btrfs_start_transaction(root, 2);
3187 if (err) 3532 if (IS_ERR(trans)) {
3533 err = PTR_ERR(trans);
3188 break; 3534 break;
3189 3535 }
3190 trans = btrfs_start_transaction(root, 1);
3191 btrfs_set_trans_block_group(trans, inode); 3536 btrfs_set_trans_block_group(trans, inode);
3192 3537
3193 err = btrfs_drop_extents(trans, inode, cur_offset, 3538 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3550,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3205 last_byte - 1, 0); 3550 last_byte - 1, 0);
3206 3551
3207 btrfs_end_transaction(trans, root); 3552 btrfs_end_transaction(trans, root);
3208 btrfs_unreserve_metadata_space(root, 2);
3209 } 3553 }
3210 free_extent_map(em); 3554 free_extent_map(em);
3555 em = NULL;
3211 cur_offset = last_byte; 3556 cur_offset = last_byte;
3212 if (cur_offset >= block_end) 3557 if (cur_offset >= block_end)
3213 break; 3558 break;
3214 } 3559 }
3215 3560
3561 free_extent_map(em);
3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3562 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS); 3563 GFP_NOFS);
3218 return err; 3564 return err;
@@ -3239,11 +3585,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3239 } 3585 }
3240 } 3586 }
3241 3587
3242 ret = btrfs_reserve_metadata_space(root, 1); 3588 trans = btrfs_start_transaction(root, 5);
3243 if (ret) 3589 if (IS_ERR(trans))
3244 return ret; 3590 return PTR_ERR(trans);
3245 3591
3246 trans = btrfs_start_transaction(root, 1);
3247 btrfs_set_trans_block_group(trans, inode); 3592 btrfs_set_trans_block_group(trans, inode);
3248 3593
3249 ret = btrfs_orphan_add(trans, inode); 3594 ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3596,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3251 3596
3252 nr = trans->blocks_used; 3597 nr = trans->blocks_used;
3253 btrfs_end_transaction(trans, root); 3598 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 1);
3255 btrfs_btree_balance_dirty(root, nr); 3599 btrfs_btree_balance_dirty(root, nr);
3256 3600
3257 if (attr->ia_size > inode->i_size) { 3601 if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3608,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3264 i_size_write(inode, attr->ia_size); 3608 i_size_write(inode, attr->ia_size);
3265 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3609 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3266 3610
3267 trans = btrfs_start_transaction(root, 1); 3611 trans = btrfs_start_transaction(root, 0);
3612 BUG_ON(IS_ERR(trans));
3268 btrfs_set_trans_block_group(trans, inode); 3613 btrfs_set_trans_block_group(trans, inode);
3614 trans->block_rsv = root->orphan_block_rsv;
3615 BUG_ON(!trans->block_rsv);
3269 3616
3270 ret = btrfs_update_inode(trans, root, inode); 3617 ret = btrfs_update_inode(trans, root, inode);
3271 BUG_ON(ret); 3618 BUG_ON(ret);
@@ -3308,17 +3655,19 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3308 if (err) 3655 if (err)
3309 return err; 3656 return err;
3310 } 3657 }
3311 attr->ia_valid &= ~ATTR_SIZE;
3312 3658
3313 if (attr->ia_valid) 3659 if (attr->ia_valid) {
3314 err = inode_setattr(inode, attr); 3660 setattr_copy(inode, attr);
3661 mark_inode_dirty(inode);
3662
3663 if (attr->ia_valid & ATTR_MODE)
3664 err = btrfs_acl_chmod(inode);
3665 }
3315 3666
3316 if (!err && ((attr->ia_valid & ATTR_MODE)))
3317 err = btrfs_acl_chmod(inode);
3318 return err; 3667 return err;
3319} 3668}
3320 3669
3321void btrfs_delete_inode(struct inode *inode) 3670void btrfs_evict_inode(struct inode *inode)
3322{ 3671{
3323 struct btrfs_trans_handle *trans; 3672 struct btrfs_trans_handle *trans;
3324 struct btrfs_root *root = BTRFS_I(inode)->root; 3673 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3326,10 +3675,14 @@ void btrfs_delete_inode(struct inode *inode)
3326 int ret; 3675 int ret;
3327 3676
3328 truncate_inode_pages(&inode->i_data, 0); 3677 truncate_inode_pages(&inode->i_data, 0);
3678 if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
3679 goto no_delete;
3680
3329 if (is_bad_inode(inode)) { 3681 if (is_bad_inode(inode)) {
3330 btrfs_orphan_del(NULL, inode); 3682 btrfs_orphan_del(NULL, inode);
3331 goto no_delete; 3683 goto no_delete;
3332 } 3684 }
3685 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
3333 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3686 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3334 3687
3335 if (root->fs_info->log_root_recovering) { 3688 if (root->fs_info->log_root_recovering) {
@@ -3345,10 +3698,21 @@ void btrfs_delete_inode(struct inode *inode)
3345 btrfs_i_size_write(inode, 0); 3698 btrfs_i_size_write(inode, 0);
3346 3699
3347 while (1) { 3700 while (1) {
3348 trans = btrfs_start_transaction(root, 1); 3701 trans = btrfs_start_transaction(root, 0);
3702 BUG_ON(IS_ERR(trans));
3349 btrfs_set_trans_block_group(trans, inode); 3703 btrfs_set_trans_block_group(trans, inode);
3350 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3704 trans->block_rsv = root->orphan_block_rsv;
3705
3706 ret = btrfs_block_rsv_check(trans, root,
3707 root->orphan_block_rsv, 0, 5);
3708 if (ret) {
3709 BUG_ON(ret != -EAGAIN);
3710 ret = btrfs_commit_transaction(trans, root);
3711 BUG_ON(ret);
3712 continue;
3713 }
3351 3714
3715 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3352 if (ret != -EAGAIN) 3716 if (ret != -EAGAIN)
3353 break; 3717 break;
3354 3718
@@ -3356,6 +3720,7 @@ void btrfs_delete_inode(struct inode *inode)
3356 btrfs_end_transaction(trans, root); 3720 btrfs_end_transaction(trans, root);
3357 trans = NULL; 3721 trans = NULL;
3358 btrfs_btree_balance_dirty(root, nr); 3722 btrfs_btree_balance_dirty(root, nr);
3723
3359 } 3724 }
3360 3725
3361 if (ret == 0) { 3726 if (ret == 0) {
@@ -3367,7 +3732,7 @@ void btrfs_delete_inode(struct inode *inode)
3367 btrfs_end_transaction(trans, root); 3732 btrfs_end_transaction(trans, root);
3368 btrfs_btree_balance_dirty(root, nr); 3733 btrfs_btree_balance_dirty(root, nr);
3369no_delete: 3734no_delete:
3370 clear_inode(inode); 3735 end_writeback(inode);
3371 return; 3736 return;
3372} 3737}
3373 3738
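The rename from btrfs_delete_inode() to btrfs_evict_inode() follows the 2.6.36 VFS change that folded ->delete_inode and ->clear_inode into a single ->evict_inode, terminated by end_writeback(). Minimal shape of the new method (sketch of the hunk above):

void btrfs_evict_inode(struct inode *inode)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;

        truncate_inode_pages(&inode->i_data, 0);
        if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
                goto no_delete;
        /* ... truncate the items, drop the orphan entry ... */
no_delete:
        end_writeback(inode);   /* replaces clear_inode() */
}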
@@ -3498,7 +3863,7 @@ again:
3498 p = &parent->rb_right; 3863 p = &parent->rb_right;
3499 else { 3864 else {
3500 WARN_ON(!(entry->vfs_inode.i_state & 3865 WARN_ON(!(entry->vfs_inode.i_state &
3501 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3866 (I_WILL_FREE | I_FREEING)));
3502 rb_erase(parent, &root->inode_tree); 3867 rb_erase(parent, &root->inode_tree);
3503 RB_CLEAR_NODE(parent); 3868 RB_CLEAR_NODE(parent);
3504 spin_unlock(&root->inode_lock); 3869 spin_unlock(&root->inode_lock);
@@ -3577,7 +3942,7 @@ again:
3577 if (atomic_read(&inode->i_count) > 1) 3942 if (atomic_read(&inode->i_count) > 1)
3578 d_prune_aliases(inode); 3943 d_prune_aliases(inode);
3579 /* 3944 /*
3580 * btrfs_drop_inode will remove it from 3945 * btrfs_drop_inode will have it removed from
3581 * the inode cache when its usage count 3946 * the inode cache when its usage count
3582 * hits zero. 3947 * hits zero.
3583 */ 3948 */
@@ -3596,40 +3961,10 @@ again:
3596 return 0; 3961 return 0;
3597} 3962}
3598 3963
3599static noinline void init_btrfs_i(struct inode *inode)
3600{
3601 struct btrfs_inode *bi = BTRFS_I(inode);
3602
3603 bi->generation = 0;
3604 bi->sequence = 0;
3605 bi->last_trans = 0;
3606 bi->last_sub_trans = 0;
3607 bi->logged_trans = 0;
3608 bi->delalloc_bytes = 0;
3609 bi->reserved_bytes = 0;
3610 bi->disk_i_size = 0;
3611 bi->flags = 0;
3612 bi->index_cnt = (u64)-1;
3613 bi->last_unlink_trans = 0;
3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3618 inode->i_mapping, GFP_NOFS);
3619 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3620 inode->i_mapping, GFP_NOFS);
3621 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3622 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3623 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3624 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3625 mutex_init(&BTRFS_I(inode)->log_mutex);
3626}
3627
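With init_btrfs_i() removed from both the iget and new-inode paths, the per-inode fields it set are presumably initialized once where the inode is allocated. A hedged sketch of that consolidation point (btrfs_alloc_inode() in fs/btrfs/super.c is an assumption, as is the cache name):

/* hypothetical consolidation point for the removed init_btrfs_i() */
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
        struct btrfs_inode *ei;

        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
        ei->generation = 0;
        ei->delalloc_bytes = 0;
        ei->index_cnt = (u64)-1;
        atomic_set(&ei->outstanding_extents, 0);
        extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
        RB_CLEAR_NODE(&ei->rb_node);
        /* ... remaining fields from the removed helper ... */
        return &ei->vfs_inode;
}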
3628static int btrfs_init_locked_inode(struct inode *inode, void *p) 3964static int btrfs_init_locked_inode(struct inode *inode, void *p)
3629{ 3965{
3630 struct btrfs_iget_args *args = p; 3966 struct btrfs_iget_args *args = p;
3631 inode->i_ino = args->ino; 3967 inode->i_ino = args->ino;
3632 init_btrfs_i(inode);
3633 BTRFS_I(inode)->root = args->root; 3968 BTRFS_I(inode)->root = args->root;
3634 btrfs_set_inode_space_info(args->root, inode); 3969 btrfs_set_inode_space_info(args->root, inode);
3635 return 0; 3970 return 0;
@@ -3692,8 +4027,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3692 if (!inode) 4027 if (!inode)
3693 return ERR_PTR(-ENOMEM); 4028 return ERR_PTR(-ENOMEM);
3694 4029
3695 init_btrfs_i(inode);
3696
3697 BTRFS_I(inode)->root = root; 4030 BTRFS_I(inode)->root = root;
3698 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4031 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3699 BTRFS_I(inode)->dummy_inode = 1; 4032 BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4283,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3950 struct btrfs_trans_handle *trans; 4283 struct btrfs_trans_handle *trans;
3951 int ret = 0; 4284 int ret = 0;
3952 4285
3953 if (root->fs_info->btree_inode == inode) 4286 if (BTRFS_I(inode)->dummy_inode)
3954 return 0; 4287 return 0;
3955 4288
3956 if (wbc->sync_mode == WB_SYNC_ALL) { 4289 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4304,38 @@ void btrfs_dirty_inode(struct inode *inode)
3971{ 4304{
3972 struct btrfs_root *root = BTRFS_I(inode)->root; 4305 struct btrfs_root *root = BTRFS_I(inode)->root;
3973 struct btrfs_trans_handle *trans; 4306 struct btrfs_trans_handle *trans;
4307 int ret;
4308
4309 if (BTRFS_I(inode)->dummy_inode)
4310 return;
3974 4311
3975 trans = btrfs_join_transaction(root, 1); 4312 trans = btrfs_join_transaction(root, 1);
3976 btrfs_set_trans_block_group(trans, inode); 4313 btrfs_set_trans_block_group(trans, inode);
3977 btrfs_update_inode(trans, root, inode); 4314
4315 ret = btrfs_update_inode(trans, root, inode);
	4316	 if (ret == -ENOSPC) {
	4317	 /* whoops, let's try again with the full transaction */
4318 btrfs_end_transaction(trans, root);
4319 trans = btrfs_start_transaction(root, 1);
4320 if (IS_ERR(trans)) {
4321 if (printk_ratelimit()) {
	4322	 printk(KERN_ERR "btrfs: failed to "
4323 "dirty inode %lu error %ld\n",
4324 inode->i_ino, PTR_ERR(trans));
4325 }
4326 return;
4327 }
4328 btrfs_set_trans_block_group(trans, inode);
4329
4330 ret = btrfs_update_inode(trans, root, inode);
4331 if (ret) {
4332 if (printk_ratelimit()) {
	4333	 printk(KERN_ERR "btrfs: failed to "
4334 "dirty inode %lu error %d\n",
4335 inode->i_ino, ret);
4336 }
4337 }
4338 }
3978 btrfs_end_transaction(trans, root); 4339 btrfs_end_transaction(trans, root);
3979} 4340}
3980 4341
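The hunk above teaches btrfs_dirty_inode() to piggy-back on the running transaction first and to retry with a fully reserved transaction only when the cheap path returns -ENOSPC. A minimal sketch of that pattern, assuming only the kernel APIs already visible in this hunk (the helper name is hypothetical):

	/* sketch_update_inode: hypothetical helper showing the join-then-reserve retry */
	static void sketch_update_inode(struct btrfs_root *root, struct inode *inode)
	{
		struct btrfs_trans_handle *trans;
		int ret;

		trans = btrfs_join_transaction(root, 1);	/* no reservation cost */
		ret = btrfs_update_inode(trans, root, inode);
		if (ret == -ENOSPC) {
			btrfs_end_transaction(trans, root);
			trans = btrfs_start_transaction(root, 1);	/* reserve 1 item */
			if (IS_ERR(trans))
				return;
			btrfs_update_inode(trans, root, inode);
		}
		btrfs_end_transaction(trans, root);
	}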
@@ -4092,7 +4453,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	 * btrfs_get_inode_index_count has an explanation for the magic
 	 * number
 	 */
-	init_btrfs_i(inode);
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
@@ -4121,16 +4481,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail;
 
-	inode->i_uid = current_fsuid();
-
-	if (dir && (dir->i_mode & S_ISGID)) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4256,26 +4607,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto fail;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4304,13 +4650,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-fail:
-	btrfs_unreserve_metadata_space(root, 5);
+	btrfs_btree_balance_dirty(root, nr);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
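Throughout this patch the worst-case item count from the comment now goes straight into btrfs_start_transaction(), which reserves the metadata and opens the handle in one step, replacing the separate btrfs_reserve_metadata_space()/btrfs_unreserve_metadata_space() pair. A sketch of the idiom (the wrapper name is hypothetical):

	/* 2 (inode item + ref) + 2 (dir items) + 1 (selinux xattr) = 5 items */
	static struct btrfs_trans_handle *start_create_trans(struct btrfs_root *root)
	{
		return btrfs_start_transaction(root, 5);	/* ERR_PTR() on failure */
	}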
@@ -4320,32 +4664,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
-	int err;
 	int drop_inode = 0;
+	int err;
 	unsigned long nr = 0;
 	u64 objectid;
 	u64 index = 0;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto fail;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino,
@@ -4377,8 +4715,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4405,21 +4741,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EPERM;
 
-	/*
-	 * 1 item for inode ref
-	 * 2 items for dir items
-	 */
-	err = btrfs_reserve_metadata_space(root, 3);
-	if (err)
-		return err;
-
 	btrfs_inc_nlink(inode);
 
 	err = btrfs_set_inode_index(dir, &index);
 	if (err)
 		goto fail;
 
-	trans = btrfs_start_transaction(root, 1);
+	/*
+	 * 1 item for inode ref
+	 * 2 items for dir items
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto fail;
+	}
 
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
@@ -4438,7 +4774,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 fail:
-	btrfs_unreserve_metadata_space(root, 3);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4458,28 +4793,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 items for inode and ref
 	 * 2 items for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_fail;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4519,9 +4846,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -4779,6 +5103,7 @@ again:
 		}
 		flush_dcache_page(page);
 	} else if (create && PageUptodate(page)) {
+		WARN_ON(1);
 		if (!trans) {
 			kunmap(page);
 			free_extent_map(em);
@@ -4875,11 +5200,651 @@ out:
 	return em;
 }
 
5203static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5204 u64 start, u64 len)
5205{
5206 struct btrfs_root *root = BTRFS_I(inode)->root;
5207 struct btrfs_trans_handle *trans;
5208 struct extent_map *em;
5209 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5210 struct btrfs_key ins;
5211 u64 alloc_hint;
5212 int ret;
5213
5214 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5215
5216 trans = btrfs_join_transaction(root, 0);
5217 if (!trans)
5218 return ERR_PTR(-ENOMEM);
5219
5220 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5221
5222 alloc_hint = get_extent_allocation_hint(inode, start, len);
5223 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5224 alloc_hint, (u64)-1, &ins, 1);
5225 if (ret) {
5226 em = ERR_PTR(ret);
5227 goto out;
5228 }
5229
5230 em = alloc_extent_map(GFP_NOFS);
5231 if (!em) {
5232 em = ERR_PTR(-ENOMEM);
5233 goto out;
5234 }
5235
5236 em->start = start;
5237 em->orig_start = em->start;
5238 em->len = ins.offset;
5239
5240 em->block_start = ins.objectid;
5241 em->block_len = ins.offset;
5242 em->bdev = root->fs_info->fs_devices->latest_bdev;
5243 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5244
5245 while (1) {
5246 write_lock(&em_tree->lock);
5247 ret = add_extent_mapping(em_tree, em);
5248 write_unlock(&em_tree->lock);
5249 if (ret != -EEXIST)
5250 break;
5251 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5252 }
5253
5254 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5255 ins.offset, ins.offset, 0);
5256 if (ret) {
5257 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5258 em = ERR_PTR(ret);
5259 }
5260out:
5261 btrfs_end_transaction(trans, root);
5262 return em;
5263}
5264
5265/*
5266 * returns 1 when the nocow is safe, < 0 on error, 0 if the
5267 * block must be cow'd
5268 */
5269static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5270 struct inode *inode, u64 offset, u64 len)
5271{
5272 struct btrfs_path *path;
5273 int ret;
5274 struct extent_buffer *leaf;
5275 struct btrfs_root *root = BTRFS_I(inode)->root;
5276 struct btrfs_file_extent_item *fi;
5277 struct btrfs_key key;
5278 u64 disk_bytenr;
5279 u64 backref_offset;
5280 u64 extent_end;
5281 u64 num_bytes;
5282 int slot;
5283 int found_type;
5284
5285 path = btrfs_alloc_path();
5286 if (!path)
5287 return -ENOMEM;
5288
5289 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5290 offset, 0);
5291 if (ret < 0)
5292 goto out;
5293
5294 slot = path->slots[0];
5295 if (ret == 1) {
5296 if (slot == 0) {
5297 /* can't find the item, must cow */
5298 ret = 0;
5299 goto out;
5300 }
5301 slot--;
5302 }
5303 ret = 0;
5304 leaf = path->nodes[0];
5305 btrfs_item_key_to_cpu(leaf, &key, slot);
5306 if (key.objectid != inode->i_ino ||
5307 key.type != BTRFS_EXTENT_DATA_KEY) {
5308 /* not our file or wrong item type, must cow */
5309 goto out;
5310 }
5311
5312 if (key.offset > offset) {
5313 /* Wrong offset, must cow */
5314 goto out;
5315 }
5316
5317 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5318 found_type = btrfs_file_extent_type(leaf, fi);
5319 if (found_type != BTRFS_FILE_EXTENT_REG &&
5320 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5321 /* not a regular extent, must cow */
5322 goto out;
5323 }
5324 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5325 backref_offset = btrfs_file_extent_offset(leaf, fi);
5326
5327 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5328 if (extent_end < offset + len) {
5329 /* extent doesn't include our full range, must cow */
5330 goto out;
5331 }
5332
5333 if (btrfs_extent_readonly(root, disk_bytenr))
5334 goto out;
5335
5336 /*
5337 * look for other files referencing this extent, if we
5338 * find any we must cow
5339 */
5340 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5341 key.offset - backref_offset, disk_bytenr))
5342 goto out;
5343
5344 /*
5345 * adjust disk_bytenr and num_bytes to cover just the bytes
5346 * in this extent we are about to write. If there
5347 * are any csums in that range we have to cow in order
5348 * to keep the csums correct
5349 */
5350 disk_bytenr += backref_offset;
5351 disk_bytenr += offset - key.offset;
5352 num_bytes = min(offset + len, extent_end) - offset;
5353 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5354 goto out;
5355 /*
5356 * all of the above have passed, it is safe to overwrite this extent
5357 * without cow
5358 */
5359 ret = 1;
5360out:
5361 btrfs_free_path(path);
5362 return ret;
5363}
5364
5365static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5366 struct buffer_head *bh_result, int create)
5367{
5368 struct extent_map *em;
5369 struct btrfs_root *root = BTRFS_I(inode)->root;
5370 u64 start = iblock << inode->i_blkbits;
5371 u64 len = bh_result->b_size;
5372 struct btrfs_trans_handle *trans;
5373
5374 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5375 if (IS_ERR(em))
5376 return PTR_ERR(em);
5377
5378 /*
5379 * OK, for INLINE and COMPRESSED extents we need to fall back on buffered
5380 * I/O. INLINE is special, and we could probably kludge it in here, but
5381 * it's still buffered so for safety let's just fall back to the generic
5382 * buffered path.
5383 *
5384 * For COMPRESSED we _have_ to read the entire extent in so we can
5385 * decompress it, so there will be buffering required no matter what we
5386 * do, so go ahead and fall back to buffered.
5387 *
5388 * We return -ENOTBLK because that's what makes DIO go ahead and go back
5389 * to buffered I/O. Don't blame me, this is the price we pay for using
5390 * the generic code.
5391 */
5392 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5393 em->block_start == EXTENT_MAP_INLINE) {
5394 free_extent_map(em);
5395 return -ENOTBLK;
5396 }
5397
5398 /* Just a good old fashioned hole, return */
5399 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5400 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5401 free_extent_map(em);
5402 /* DIO will do one hole at a time, so just unlock a sector */
5403 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5404 start + root->sectorsize - 1, GFP_NOFS);
5405 return 0;
5406 }
5407
5408 /*
5409 * We don't allocate a new extent in the following cases
5410 *
5411 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5412 * existing extent.
5413 * 2) The extent is marked as PREALLOC. We're good to go here and can
5414 * just use the extent.
5415 *
5416 */
5417 if (!create) {
5418 len = em->len - (start - em->start);
5419 goto map;
5420 }
5421
5422 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5423 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5424 em->block_start != EXTENT_MAP_HOLE)) {
5425 int type;
5426 int ret;
5427 u64 block_start;
5428
5429 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5430 type = BTRFS_ORDERED_PREALLOC;
5431 else
5432 type = BTRFS_ORDERED_NOCOW;
5433 len = min(len, em->len - (start - em->start));
5434 block_start = em->block_start + (start - em->start);
5435
5436 /*
5437 * we're not going to log anything, but we do need
5438 * to make sure the current transaction stays open
5439 * while we look for nocow cross refs
5440 */
5441 trans = btrfs_join_transaction(root, 0);
5442 if (!trans)
5443 goto must_cow;
5444
5445 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5446 ret = btrfs_add_ordered_extent_dio(inode, start,
5447 block_start, len, len, type);
5448 btrfs_end_transaction(trans, root);
5449 if (ret) {
5450 free_extent_map(em);
5451 return ret;
5452 }
5453 goto unlock;
5454 }
5455 btrfs_end_transaction(trans, root);
5456 }
5457must_cow:
5458 /*
5459 * this will cow the extent, reset the len in case we changed
5460 * it above
5461 */
5462 len = bh_result->b_size;
5463 free_extent_map(em);
5464 em = btrfs_new_extent_direct(inode, start, len);
5465 if (IS_ERR(em))
5466 return PTR_ERR(em);
5467 len = min(len, em->len - (start - em->start));
5468unlock:
5469 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5470 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5471 0, NULL, GFP_NOFS);
5472map:
5473 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5474 inode->i_blkbits;
5475 bh_result->b_size = len;
5476 bh_result->b_bdev = em->bdev;
5477 set_buffer_mapped(bh_result);
5478 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5479 set_buffer_new(bh_result);
5480
5481 free_extent_map(em);
5482
5483 return 0;
5484}
5485
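btrfs_get_blocks_direct() above picks one of three outcomes for every mapping request (ignoring the cross-ref and csum checks in can_nocow_odirect() that can still force a cow). A compact restatement of that decision, as a hypothetical helper for reading convenience only:

	/* 0 = report hole, 1 = map/reuse the existing extent, 2 = cow a new one */
	static int dio_strategy(int create, int prealloc, int nodatacow, int hole)
	{
		if (!create)
			return hole ? 0 : 1;	/* reads never allocate */
		if (prealloc || (nodatacow && !hole))
			return 1;	/* nocow write straight into the extent */
		return 2;	/* must_cow: btrfs_new_extent_direct() */
	}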
5486struct btrfs_dio_private {
5487 struct inode *inode;
5488 u64 logical_offset;
5489 u64 disk_bytenr;
5490 u64 bytes;
5491 u32 *csums;
5492 void *private;
5493};
5494
5495static void btrfs_endio_direct_read(struct bio *bio, int err)
5496{
5497 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5498 struct bio_vec *bvec = bio->bi_io_vec;
5499 struct btrfs_dio_private *dip = bio->bi_private;
5500 struct inode *inode = dip->inode;
5501 struct btrfs_root *root = BTRFS_I(inode)->root;
5502 u64 start;
5503 u32 *private = dip->csums;
5504
5505 start = dip->logical_offset;
5506 do {
5507 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5508 struct page *page = bvec->bv_page;
5509 char *kaddr;
5510 u32 csum = ~(u32)0;
5511 unsigned long flags;
5512
5513 local_irq_save(flags);
5514 kaddr = kmap_atomic(page, KM_IRQ0);
5515 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5516 csum, bvec->bv_len);
5517 btrfs_csum_final(csum, (char *)&csum);
5518 kunmap_atomic(kaddr, KM_IRQ0);
5519 local_irq_restore(flags);
5520
5521 flush_dcache_page(bvec->bv_page);
5522 if (csum != *private) {
5523 printk(KERN_ERR "btrfs csum failed ino %lu off"
5524 " %llu csum %u private %u\n",
5525 inode->i_ino, (unsigned long long)start,
5526 csum, *private);
5527 err = -EIO;
5528 }
5529 }
5530
5531 start += bvec->bv_len;
5532 private++;
5533 bvec++;
5534 } while (bvec <= bvec_end);
5535
5536 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5537 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5538 bio->bi_private = dip->private;
5539
5540 kfree(dip->csums);
5541 kfree(dip);
5542 dio_end_io(bio, err);
5543}
5544
5545static void btrfs_endio_direct_write(struct bio *bio, int err)
5546{
5547 struct btrfs_dio_private *dip = bio->bi_private;
5548 struct inode *inode = dip->inode;
5549 struct btrfs_root *root = BTRFS_I(inode)->root;
5550 struct btrfs_trans_handle *trans;
5551 struct btrfs_ordered_extent *ordered = NULL;
5552 struct extent_state *cached_state = NULL;
5553 int ret;
5554
5555 if (err)
5556 goto out_done;
5557
5558 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5559 dip->logical_offset, dip->bytes);
5560 if (!ret)
5561 goto out_done;
5562
5563 BUG_ON(!ordered);
5564
5565 trans = btrfs_join_transaction(root, 1);
5566 if (!trans) {
5567 err = -ENOMEM;
5568 goto out;
5569 }
5570 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5571
5572 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5573 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5574 if (!ret)
5575 ret = btrfs_update_inode(trans, root, inode);
5576 err = ret;
5577 goto out;
5578 }
5579
5580 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5581 ordered->file_offset + ordered->len - 1, 0,
5582 &cached_state, GFP_NOFS);
5583
5584 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5585 ret = btrfs_mark_extent_written(trans, inode,
5586 ordered->file_offset,
5587 ordered->file_offset +
5588 ordered->len);
5589 if (ret) {
5590 err = ret;
5591 goto out_unlock;
5592 }
5593 } else {
5594 ret = insert_reserved_file_extent(trans, inode,
5595 ordered->file_offset,
5596 ordered->start,
5597 ordered->disk_len,
5598 ordered->len,
5599 ordered->len,
5600 0, 0, 0,
5601 BTRFS_FILE_EXTENT_REG);
5602 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5603 ordered->file_offset, ordered->len);
5604 if (ret) {
5605 err = ret;
5606 WARN_ON(1);
5607 goto out_unlock;
5608 }
5609 }
5610
5611 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5612 btrfs_ordered_update_i_size(inode, 0, ordered);
5613 btrfs_update_inode(trans, root, inode);
5614out_unlock:
5615 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5616 ordered->file_offset + ordered->len - 1,
5617 &cached_state, GFP_NOFS);
5618out:
5619 btrfs_delalloc_release_metadata(inode, ordered->len);
5620 btrfs_end_transaction(trans, root);
5621 btrfs_put_ordered_extent(ordered);
5622 btrfs_put_ordered_extent(ordered);
5623out_done:
5624 bio->bi_private = dip->private;
5625
5626 kfree(dip->csums);
5627 kfree(dip);
5628 dio_end_io(bio, err);
5629}
5630
5631static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5632 struct bio *bio, int mirror_num,
5633 unsigned long bio_flags, u64 offset)
5634{
5635 int ret;
5636 struct btrfs_root *root = BTRFS_I(inode)->root;
5637 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5638 BUG_ON(ret);
5639 return 0;
5640}
5641
5642static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5643 loff_t file_offset)
5644{
5645 struct btrfs_root *root = BTRFS_I(inode)->root;
5646 struct btrfs_dio_private *dip;
5647 struct bio_vec *bvec = bio->bi_io_vec;
5648 u64 start;
5649 int skip_sum;
5650 int write = rw & REQ_WRITE;
5651 int ret = 0;
5652
5653 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5654
5655 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5656 if (!dip) {
5657 ret = -ENOMEM;
5658 goto free_ordered;
5659 }
5660 dip->csums = NULL;
5661
5662 if (!skip_sum) {
5663 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5664 if (!dip->csums) {
5665 ret = -ENOMEM;
5666 goto free_ordered;
5667 }
5668 }
5669
5670 dip->private = bio->bi_private;
5671 dip->inode = inode;
5672 dip->logical_offset = file_offset;
5673
5674 start = dip->logical_offset;
5675 dip->bytes = 0;
5676 do {
5677 dip->bytes += bvec->bv_len;
5678 bvec++;
5679 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5680
5681 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5682 bio->bi_private = dip;
5683
5684 if (write)
5685 bio->bi_end_io = btrfs_endio_direct_write;
5686 else
5687 bio->bi_end_io = btrfs_endio_direct_read;
5688
5689 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5690 if (ret)
5691 goto out_err;
5692
5693 if (write && !skip_sum) {
5694 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5695 inode, rw, bio, 0, 0,
5696 dip->logical_offset,
5697 __btrfs_submit_bio_start_direct_io,
5698 __btrfs_submit_bio_done);
5699 if (ret)
5700 goto out_err;
5701 return;
5702 } else if (!skip_sum)
5703 btrfs_lookup_bio_sums_dio(root, inode, bio,
5704 dip->logical_offset, dip->csums);
5705
5706 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5707 if (ret)
5708 goto out_err;
5709 return;
5710out_err:
5711 kfree(dip->csums);
5712 kfree(dip);
5713free_ordered:
5714 /*
5715 * If this is a write, we need to clean up the reserved space and kill
5716 * the ordered extent.
5717 */
5718 if (write) {
5719 struct btrfs_ordered_extent *ordered;
5720 ordered = btrfs_lookup_ordered_extent(inode,
5721 dip->logical_offset);
5722 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5723 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5724 btrfs_free_reserved_extent(root, ordered->start,
5725 ordered->disk_len);
5726 btrfs_put_ordered_extent(ordered);
5727 btrfs_put_ordered_extent(ordered);
5728 }
5729 bio_endio(bio, ret);
5730}
5731
5732static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5733 const struct iovec *iov, loff_t offset,
5734 unsigned long nr_segs)
5735{
5736 int seg;
5737 size_t size;
5738 unsigned long addr;
5739 unsigned blocksize_mask = root->sectorsize - 1;
5740 ssize_t retval = -EINVAL;
5741 loff_t end = offset;
5742
5743 if (offset & blocksize_mask)
5744 goto out;
5745
5746 /* Check the memory alignment. Blocks cannot straddle pages */
5747 for (seg = 0; seg < nr_segs; seg++) {
5748 addr = (unsigned long)iov[seg].iov_base;
5749 size = iov[seg].iov_len;
5750 end += size;
5751 if ((addr & blocksize_mask) || (size & blocksize_mask))
5752 goto out;
5753 }
5754 retval = 0;
5755out:
5756 return retval;
5757}
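check_direct_IO() above rejects any request whose file offset or memory segments are not sector aligned, in which case btrfs_direct_IO() returns 0 and the VFS falls back to buffered I/O. A userspace illustration of meeting those constraints (an assumption for illustration, not part of the patch; 4096 stands in for root->sectorsize):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>

	static ssize_t dio_read_one_block(const char *path)
	{
		const size_t block = 4096;	/* assumed sector size */
		void *buf;
		ssize_t n;
		int fd = open(path, O_RDONLY | O_DIRECT);

		if (fd < 0)
			return -1;
		if (posix_memalign(&buf, block, block)) {	/* aligned buffer */
			close(fd);
			return -1;
		}
		n = pread(fd, buf, block, 0);	/* offset and length both aligned */
		free(buf);
		close(fd);
		return n;
	}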
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
5763 struct inode *inode = file->f_mapping->host;
5764 struct btrfs_ordered_extent *ordered;
5765 struct extent_state *cached_state = NULL;
5766 u64 lockstart, lockend;
5767 ssize_t ret;
5768 int writing = rw & WRITE;
5769 int write_bits = 0;
5770 size_t count = iov_length(iov, nr_segs);
5771
5772 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5773 offset, nr_segs)) {
5774 return 0;
5775 }
5776
5777 lockstart = offset;
5778 lockend = offset + count - 1;
5779
5780 if (writing) {
5781 ret = btrfs_delalloc_reserve_space(inode, count);
5782 if (ret)
5783 goto out;
5784 }
5785
5786 while (1) {
5787 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5788 0, &cached_state, GFP_NOFS);
5789 /*
5790 * We're concerned with the entire range that we're going to be
5791 * doing DIO to, so we need to make sure there are no ordered
5792 * extents in this range.
5793 */
5794 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5795 lockend - lockstart + 1);
5796 if (!ordered)
5797 break;
5798 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5799 &cached_state, GFP_NOFS);
5800 btrfs_start_ordered_extent(inode, ordered, 1);
5801 btrfs_put_ordered_extent(ordered);
5802 cond_resched();
5803 }
5804
5805 /*
5806 * we don't use btrfs_set_extent_delalloc because we don't want
5807 * the dirty or uptodate bits
5808 */
5809 if (writing) {
5810 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5811 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5812 EXTENT_DELALLOC, 0, NULL, &cached_state,
5813 GFP_NOFS);
5814 if (ret) {
5815 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5816 lockend, EXTENT_LOCKED | write_bits,
5817 1, 0, &cached_state, GFP_NOFS);
5818 goto out;
5819 }
5820 }
5821
5822 free_extent_state(cached_state);
5823 cached_state = NULL;
5824
5825 ret = __blockdev_direct_IO(rw, iocb, inode,
5826 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5827 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5828 btrfs_submit_direct, 0);
5829
5830 if (ret < 0 && ret != -EIOCBQUEUED) {
5831 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5832 offset + iov_length(iov, nr_segs) - 1,
5833 EXTENT_LOCKED | write_bits, 1, 0,
5834 &cached_state, GFP_NOFS);
5835 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5836 /*
5837 * We're falling back to buffered, unlock the section we didn't
5838 * do IO on.
5839 */
5840 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5841 offset + iov_length(iov, nr_segs) - 1,
5842 EXTENT_LOCKED | write_bits, 1, 0,
5843 &cached_state, GFP_NOFS);
5844 }
5845out:
5846 free_extent_state(cached_state);
5847 return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5043,7 +6008,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
@@ -5052,13 +6017,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 	}
 
-	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (ret) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
@@ -5068,7 +6026,6 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -5109,7 +6066,6 @@ again:
 		unlock_extent_cached(io_tree, page_start, page_end,
 				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
 	}
 	ret = 0;
@@ -5136,10 +6092,10 @@ again:
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	if (!ret)
 		return VM_FAULT_LOCKED;
 	unlock_page(page);
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
 	return ret;
 }
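The fault path above now pairs one btrfs_delalloc_reserve_space() call, which as used here covers both the data space and the delalloc metadata for the page, with one btrfs_delalloc_release_space() on the paths that do not keep the reservation. A minimal sketch of the pairing; try_to_dirty_page() is a hypothetical stand-in for the real work:

	static int sketch_reserve_page(struct inode *inode, struct page *page)
	{
		int ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);

		if (ret)
			return ret;	/* e.g. -ENOSPC: nothing to undo */
		ret = try_to_dirty_page(page);	/* hypothetical */
		if (ret)	/* failed: give the reservation back */
			btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
		return ret;
	}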
@@ -5164,8 +6120,10 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = root->orphan_block_rsv;
 
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
@@ -5188,6 +6146,23 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
+		if (!trans) {
+			trans = btrfs_start_transaction(root, 0);
+			BUG_ON(IS_ERR(trans));
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = root->orphan_block_rsv;
+		}
+
+		ret = btrfs_block_rsv_check(trans, root,
+					    root->orphan_block_rsv, 0, 5);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			trans = NULL;
+			continue;
+		}
+
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
@@ -5199,10 +6174,8 @@ static void btrfs_truncate(struct inode *inode)
 
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
+		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
-
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
@@ -5263,21 +6236,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
 	struct btrfs_inode *ei;
+	struct inode *inode;
 
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
+	ei->root = NULL;
+	ei->space_info = NULL;
+	ei->generation = 0;
+	ei->sequence = 0;
 	ei->last_trans = 0;
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
-	ei->outstanding_extents = 0;
-	ei->reserved_extents = 0;
-	ei->root = NULL;
+	ei->delalloc_bytes = 0;
+	ei->reserved_bytes = 0;
+	ei->disk_i_size = 0;
+	ei->flags = 0;
+	ei->index_cnt = (u64)-1;
+	ei->last_unlink_trans = 0;
+
 	spin_lock_init(&ei->accounting_lock);
+	atomic_set(&ei->outstanding_extents, 0);
+	ei->reserved_extents = 0;
+
+	ei->ordered_data_close = 0;
+	ei->orphan_meta_reserved = 0;
+	ei->dummy_inode = 0;
+	ei->force_compress = 0;
+
+	inode = &ei->vfs_inode;
+	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+	mutex_init(&ei->log_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->ordered_operations);
-	return &ei->vfs_inode;
+	RB_CLEAR_NODE(&ei->rb_node);
+
+	return inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
@@ -5287,6 +6286,8 @@ void btrfs_destroy_inode(struct inode *inode)
 
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+	WARN_ON(BTRFS_I(inode)->reserved_extents);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
@@ -5307,13 +6308,13 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
-	spin_lock(&root->list_lock);
+	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
 		       inode->i_ino);
 		list_del_init(&BTRFS_I(inode)->i_orphan);
 	}
-	spin_unlock(&root->list_lock);
+	spin_unlock(&root->orphan_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5335,13 +6336,14 @@ free:
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-void btrfs_drop_inode(struct inode *inode)
+int btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
-		generic_delete_inode(inode);
+
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return 1;
 	else
-		generic_drop_inode(inode);
+		return generic_drop_inode(inode);
 }
 
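btrfs_drop_inode() now returns an int so iput() can choose between caching the inode and evicting it immediately; returning 1 for a root whose refs have dropped to zero forces the eviction. A simplified model of how the VFS consumes that return value (illustrative only, not the actual fs/inode.c text):

	static int should_evict(struct inode *inode)	/* models iput_final() */
	{
		const struct super_operations *op = inode->i_sb->s_op;

		if (op && op->drop_inode)
			return op->drop_inode(inode);	/* btrfs_drop_inode() here */
		return generic_drop_inode(inode);
	}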
 static void init_once(void *foo)
@@ -5434,19 +6436,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
-
-	/*
-	 * We want to reserve the absolute worst case amount of items. So if
-	 * both inodes are subvols and we need to unlink them then that would
-	 * require 4 item modifications, but if they are both normal inodes it
-	 * would require 5 item modifications, so we'll assume their normal
-	 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
-	 * should cover the worst case number of items we'll modify.
-	 */
-	ret = btrfs_reserve_metadata_space(root, 11);
-	if (ret)
-		return ret;
-
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large. Start IO on it now so
@@ -5459,8 +6448,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* close the racy window with snapshot create/destroy ioctl */
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		down_read(&root->fs_info->subvol_sem);
+	/*
+	 * We want to reserve the absolute worst case amount of items. So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume they're normal
+	 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 20);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, new_dir);
 
 	if (dest != root)
@@ -5559,7 +6558,6 @@ out_fail:
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
 
-	btrfs_unreserve_metadata_space(root, 11);
 	return ret;
 }
 
@@ -5611,6 +6609,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return 0;
 }
 
6612int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6613{
6614 struct btrfs_inode *binode;
6615 struct inode *inode = NULL;
6616
6617 spin_lock(&root->fs_info->delalloc_lock);
6618 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6619 binode = list_entry(root->fs_info->delalloc_inodes.next,
6620 struct btrfs_inode, delalloc_inodes);
6621 inode = igrab(&binode->vfs_inode);
6622 if (inode) {
6623 list_move_tail(&binode->delalloc_inodes,
6624 &root->fs_info->delalloc_inodes);
6625 break;
6626 }
6627
6628 list_del_init(&binode->delalloc_inodes);
6629 cond_resched_lock(&root->fs_info->delalloc_lock);
6630 }
6631 spin_unlock(&root->fs_info->delalloc_lock);
6632
6633 if (inode) {
6634 write_inode_now(inode, 0);
6635 if (delay_iput)
6636 btrfs_add_delayed_iput(inode);
6637 else
6638 iput(inode);
6639 return 1;
6640 }
6641 return 0;
6642}
6643
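btrfs_start_one_delalloc_inode() above writes back a single delalloc inode and returns 1, or 0 once the list is empty, so callers can flush incrementally instead of all at once. A hypothetical caller:

	static void flush_delalloc_one_by_one(struct btrfs_root *root)
	{
		while (btrfs_start_one_delalloc_inode(root, 0))
			cond_resched();	/* flushed one inode; yield and repeat */
	}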
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 			 const char *symname)
 {
@@ -5634,26 +6664,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 items for inode item and ref
 	 * 2 items for dir items
 	 * 1 item for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto out_fail;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -5725,8 +6749,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-out_fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -5735,33 +6757,28 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-			       u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
-	u64 num_bytes = end - start;
 	int ret = 0;
-	u64 i_size;
 
 	while (num_bytes > 0) {
-		trans = btrfs_start_transaction(root, 1);
-
-		ret = btrfs_reserve_extent(trans, root, num_bytes,
-					   root->sectorsize, 0, alloc_hint,
-					   (u64)-1, &ins, 1);
-		if (ret) {
-			WARN_ON(1);
-			goto stop_trans;
-		}
-
-		ret = btrfs_reserve_metadata_space(root, 3);
-		if (ret) {
-			btrfs_free_reserved_extent(root, ins.objectid,
-						   ins.offset);
-			goto stop_trans;
-		}
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+
+		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+					   0, *alloc_hint, (u64)-1, &ins, 1);
+		if (ret) {
+			btrfs_end_transaction(trans, root);
+			break;
+		}
 
 		ret = insert_reserved_file_extent(trans, inode,
@@ -5775,34 +6792,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
-		alloc_hint = ins.objectid + ins.offset;
+		*alloc_hint = ins.objectid + ins.offset;
 
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    (actual_len > inode->i_size) &&
 		    (cur_offset > inode->i_size)) {
-
 			if (cur_offset > actual_len)
-				i_size = actual_len;
+				i_size_write(inode, actual_len);
 			else
-				i_size = cur_offset;
-			i_size_write(inode, i_size);
-			btrfs_ordered_update_i_size(inode, i_size, NULL);
+				i_size_write(inode, cur_offset);
+			btrfs_ordered_update_i_size(inode, cur_offset, NULL);
 		}
 
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
 		btrfs_end_transaction(trans, root);
-		btrfs_unreserve_metadata_space(root, 3);
 	}
 	return ret;
-
-stop_trans:
-	btrfs_end_transaction(trans, root);
-	return ret;
-
 }
 
 static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5835,8 +6845,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		goto out;
 	}
 
-	ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
-					  alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		goto out;
 
@@ -5881,16 +6890,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = prealloc_file_range(inode,
-						  cur_offset, last_byte,
-						  alloc_hint, mode, offset+len);
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
 			}
 		}
-		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-			alloc_hint = em->block_start;
 		free_extent_map(em);
 
 		cur_offset = last_byte;
@@ -5902,8 +6911,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
 
-	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
-				       alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
 
+	ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+				       0, &objectid);
+	if (ret)
+		return ret;
 	/*
 	 * 1 - inode item
 	 * 2 - refs
 	 * 1 - root item
 	 * 2 - dir items
 	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		return ret;
-
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	if (ret)
-		goto fail;
+	trans = btrfs_start_transaction(root, 6);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
 				      0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
-
-	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-			   char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	/*
-	 * 1 - inode item
-	 * 2 - refs
-	 * 1 - root item
-	 * 2 - dir items
-	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		goto fail;
-
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot) {
-		ret = -ENOMEM;
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
-	if (!pending_snapshot->name) {
-		ret = -ENOMEM;
-		kfree(pending_snapshot);
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	memcpy(pending_snapshot->name, name, namelen);
-	pending_snapshot->name[namelen] = '\0';
+	if (!pending_snapshot)
+		return -ENOMEM;
+
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
 	pending_snapshot->dentry = dentry;
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
 	pending_snapshot->root = root;
+
+	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto fail;
+	}
+
+	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+	BUG_ON(ret);
+
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	ret = btrfs_commit_transaction(trans, root);
+	ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
 	BUG_ON(ret);
-	btrfs_unreserve_metadata_space(root, 6);
+
+	ret = pending_snapshot->error;
+	if (ret)
+		goto fail;
+
+	btrfs_orphan_cleanup(pending_snapshot->snap);
 
 	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
 	if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	ret = 0;
 fail:
+	kfree(pending_snapshot);
 	return ret;
 }
 
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry,
-					name, namelen);
+		error = create_snapshot(snap_src, dentry);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
 				      name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
 			BTRFS_I(inode)->force_compress = 1;
 
-		ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-		if (ret) {
-			ret = -ENOSPC;
-			break;
-		}
-
-		ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       PAGE_CACHE_SIZE);
-			ret = -ENOSPC;
-			break;
-		}
+		ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto err_unlock;
 again:
 		if (inode->i_size == 0 ||
 		    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
 		}
 
 		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
+		if (!page) {
+			ret = -ENOMEM;
 			goto err_reservations;
+		}
 
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				ret = -EIO;
 				goto err_reservations;
 			}
 		}
@@ -644,8 +623,7 @@ again:
 		wait_on_page_writeback(page);
 
 		if (PageDirty(page)) {
-			btrfs_free_reserved_data_space(root, inode,
-						       PAGE_CACHE_SIZE);
+			btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 			goto loop_unlock;
 		}
 
@@ -683,7 +661,6 @@ loop_unlock:
 		page_cache_release(page);
 		mutex_unlock(&inode->i_mutex);
 
-		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 		i++;
 	}
@@ -713,9 +690,9 @@ loop_unlock:
 	return 0;
 
 err_reservations:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
 	mutex_unlock(&inode->i_mutex);
-	btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	return ret;
 }
 
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		       device->name, (unsigned long long)new_size);
 
 	if (new_size > old_size) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	if (err)
 		goto out_up_write;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_up_write;
+	}
+	trans->block_rsv = &root->fs_info->global_block_rsv;
+
 	ret = btrfs_unlink_subvol(trans, root, dir,
 				  dest->root_key.objectid,
 				  dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1314 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
1315 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
1316 1299
1317 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
1318 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
1319 dest->root_key.objectid); 1302 root->fs_info->tree_root,
1320 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
1321 1306
1322 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
1323 BUG_ON(ret); 1308 BUG_ON(ret);
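The new !xchg(&dest->orphan_item_inserted, 1) guard makes the orphan-item insertion a once-only operation: xchg atomically stores 1 and returns the previous value, so only the caller that observes 0 performs the insert. A small C11 sketch of the same pattern, using atomic_exchange in place of the kernel's xchg (the flag and function names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

/* models the per-root orphan_item_inserted latch in the diff */
static atomic_int orphan_item_inserted;

static void insert_orphan_item(void)
{
	puts("orphan item inserted");
}

static void snap_destroy(void)
{
	/*
	 * atomic_exchange() stores 1 and returns the old value, so
	 * only the first caller to flip 0 -> 1 does the insertion --
	 * the same effect as the kernel's !xchg(&flag, 1).
	 */
	if (!atomic_exchange(&orphan_item_inserted, 1))
		insert_orphan_item();
}

int main(void)
{
	snap_destroy();		/* inserts */
	snap_destroy();		/* no-op on the second call */
	return 0;
}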
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1358 ret = -EPERM; 1343 ret = -EPERM;
1359 goto out; 1344 goto out;
1360 } 1345 }
1361 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
1362 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
1363 break; 1350 break;
1364 case S_IFREG: 1351 case S_IFREG:
1365 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1389 /* the rest are all set to zero by kzalloc */ 1376 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1; 1377 range->len = (u64)-1;
1391 } 1378 }
1392 btrfs_defrag_file(file, range); 1379 ret = btrfs_defrag_file(file, range);
1393 kfree(range); 1380 kfree(range);
1394 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
1395 } 1384 }
1396out: 1385out:
1397 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
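The hunk above makes two behavioral fixes: the return values of btrfs_defrag_root()/btrfs_defrag_file() are now propagated to the caller, and an explicit default case rejects inode types that are neither directories nor regular files. A compact sketch of that dispatch shape (the defrag calls are stubbed out; only the switch structure mirrors the diff):

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>

static int defrag_dispatch(mode_t mode)
{
	int ret;

	switch (mode & S_IFMT) {
	case S_IFDIR:
		ret = 0;		/* stand-in for btrfs_defrag_root() */
		break;
	case S_IFREG:
		ret = 0;		/* stand-in for btrfs_defrag_file() */
		break;
	default:
		ret = -EINVAL;		/* anything else is rejected */
	}
	return ret;
}

int main(void)
{
	printf("%d\n", defrag_dispatch(S_IFREG));	/* 0 */
	printf("%d\n", defrag_dispatch(S_IFLNK));	/* -EINVAL */
	return 0;
}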
@@ -1469,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1469 */ 1458 */
1470 1459
1471 /* the destination must be opened for writing */ 1460 /* the destination must be opened for writing */
1472 if (!(file->f_mode & FMODE_WRITE)) 1461 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1473 return -EINVAL; 1462 return -EINVAL;
1474 1463
1475 ret = mnt_want_write(file->f_path.mnt); 1464 ret = mnt_want_write(file->f_path.mnt);
@@ -1522,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1522 1511
1523 /* determine range to clone */ 1512 /* determine range to clone */
1524 ret = -EINVAL; 1513 ret = -EINVAL;
1525 if (off >= src->i_size || off + len > src->i_size) 1514 if (off + len > src->i_size || off + len < off)
1526 goto out_unlock; 1515 goto out_unlock;
1527 if (len == 0) 1516 if (len == 0)
1528 olen = len = src->i_size - off; 1517 olen = len = src->i_size - off;
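The rewritten bounds check is worth a second look: "off + len > src->i_size" alone is defeated when off + len wraps around the 64-bit space, so the extra "off + len < off" clause catches exactly the overflow case, since unsigned addition wraps. A self-contained sketch:

#include <stdint.h>
#include <stdio.h>

/*
 * 0 if the range runs past i_size or the unsigned addition wraps;
 * "off + len < off" can only be true after a 64-bit overflow.
 */
static int clone_range_ok(uint64_t off, uint64_t len, uint64_t i_size)
{
	if (off + len > i_size || off + len < off)
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", clone_range_ok(0, 4096, 8192));			/* 1 */
	printf("%d\n", clone_range_ok(4096, 8192, 8192));		/* 0 */
	printf("%d\n", clone_range_ok(UINT64_MAX - 10, 100, 8192));	/* 0: wraps */
	return 0;
}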
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1550 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1551 } 1540 }
1552 1541
1553 trans = btrfs_start_transaction(root, 1);
1554 BUG_ON(!trans);
1555
1556 /* punch hole in destination first */
1557 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1558
1559 /* clone data */ 1542 /* clone data */
1560 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1561 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1566 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1567 * tree. 1550 * tree.
1568 */ 1551 */
1569 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1570 if (ret < 0) 1553 if (ret < 0)
1571 goto out; 1554 goto out;
1572 1555
@@ -1595,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1595 u64 disko = 0, diskl = 0; 1578 u64 disko = 0, diskl = 0;
1596 u64 datao = 0, datal = 0; 1579 u64 datao = 0, datal = 0;
1597 u8 comp; 1580 u8 comp;
1581 u64 endoff;
1598 1582
1599 size = btrfs_item_size_nr(leaf, slot); 1583 size = btrfs_item_size_nr(leaf, slot);
1600 read_extent_buffer(leaf, buf, 1584 read_extent_buffer(leaf, buf,
@@ -1629,12 +1613,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1629 new_key.objectid = inode->i_ino; 1613 new_key.objectid = inode->i_ino;
1630 new_key.offset = key.offset + destoff - off; 1614 new_key.offset = key.offset + destoff - off;
1631 1615
1616 trans = btrfs_start_transaction(root, 1);
1617 if (IS_ERR(trans)) {
1618 ret = PTR_ERR(trans);
1619 goto out;
1620 }
1621
1632 if (type == BTRFS_FILE_EXTENT_REG || 1622 if (type == BTRFS_FILE_EXTENT_REG ||
1633 type == BTRFS_FILE_EXTENT_PREALLOC) { 1623 type == BTRFS_FILE_EXTENT_PREALLOC) {
1624 if (off > key.offset) {
1625 datao += off - key.offset;
1626 datal -= off - key.offset;
1627 }
1628
1629 if (key.offset + datal > off + len)
1630 datal = off + len - key.offset;
1631
1632 ret = btrfs_drop_extents(trans, inode,
1633 new_key.offset,
1634 new_key.offset + datal,
1635 &hint_byte, 1);
1636 BUG_ON(ret);
1637
1634 ret = btrfs_insert_empty_item(trans, root, path, 1638 ret = btrfs_insert_empty_item(trans, root, path,
1635 &new_key, size); 1639 &new_key, size);
1636 if (ret) 1640 BUG_ON(ret);
1637 goto out;
1638 1641
1639 leaf = path->nodes[0]; 1642 leaf = path->nodes[0];
1640 slot = path->slots[0]; 1643 slot = path->slots[0];
@@ -1645,14 +1648,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1645 extent = btrfs_item_ptr(leaf, slot, 1648 extent = btrfs_item_ptr(leaf, slot,
1646 struct btrfs_file_extent_item); 1649 struct btrfs_file_extent_item);
1647 1650
1648 if (off > key.offset) {
1649 datao += off - key.offset;
1650 datal -= off - key.offset;
1651 }
1652
1653 if (key.offset + datal > off + len)
1654 datal = off + len - key.offset;
1655
1656 /* disko == 0 means it's a hole */ 1651 /* disko == 0 means it's a hole */
1657 if (!disko) 1652 if (!disko)
1658 datao = 0; 1653 datao = 0;
@@ -1683,14 +1678,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1683 1678
1684 if (comp && (skip || trim)) { 1679 if (comp && (skip || trim)) {
1685 ret = -EINVAL; 1680 ret = -EINVAL;
1681 btrfs_end_transaction(trans, root);
1686 goto out; 1682 goto out;
1687 } 1683 }
1688 size -= skip + trim; 1684 size -= skip + trim;
1689 datal -= skip + trim; 1685 datal -= skip + trim;
1686
1687 ret = btrfs_drop_extents(trans, inode,
1688 new_key.offset,
1689 new_key.offset + datal,
1690 &hint_byte, 1);
1691 BUG_ON(ret);
1692
1690 ret = btrfs_insert_empty_item(trans, root, path, 1693 ret = btrfs_insert_empty_item(trans, root, path,
1691 &new_key, size); 1694 &new_key, size);
1692 if (ret) 1695 BUG_ON(ret);
1693 goto out;
1694 1696
1695 if (skip) { 1697 if (skip) {
1696 u32 start = 1698 u32 start =
@@ -1708,8 +1710,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1708 } 1710 }
1709 1711
1710 btrfs_mark_buffer_dirty(leaf); 1712 btrfs_mark_buffer_dirty(leaf);
1711 } 1713 btrfs_release_path(root, path);
1714
1715 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1712 1716
1717 /*
1718 * we round up to the block size at eof when
1719 * determining which extents to clone above,
1720 * but shouldn't round up the file size
1721 */
1722 endoff = new_key.offset + datal;
1723 if (endoff > off+olen)
1724 endoff = off+olen;
1725 if (endoff > inode->i_size)
1726 btrfs_i_size_write(inode, endoff);
1727
1728 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1729 ret = btrfs_update_inode(trans, root, inode);
1730 BUG_ON(ret);
1731 btrfs_end_transaction(trans, root);
1732 }
1713next: 1733next:
1714 btrfs_release_path(root, path); 1734 btrfs_release_path(root, path);
1715 key.offset++; 1735 key.offset++;
@@ -1717,17 +1737,7 @@ next:
1717 ret = 0; 1737 ret = 0;
1718out: 1738out:
1719 btrfs_release_path(root, path); 1739 btrfs_release_path(root, path);
1720 if (ret == 0) {
1721 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1722 if (destoff + olen > inode->i_size)
1723 btrfs_i_size_write(inode, destoff + olen);
1724 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1725 ret = btrfs_update_inode(trans, root, inode);
1726 }
1727 btrfs_end_transaction(trans, root);
1728 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1740 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1729 if (ret)
1730 vmtruncate(inode, 0);
1731out_unlock: 1741out_unlock:
1732 mutex_unlock(&src->i_mutex); 1742 mutex_unlock(&src->i_mutex);
1733 mutex_unlock(&inode->i_mutex); 1743 mutex_unlock(&inode->i_mutex);
@@ -1845,7 +1855,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 1855 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 1856 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1847 dir_id, "default", 7, 1); 1857 dir_id, "default", 7, 1);
1848 if (!di) { 1858 if (IS_ERR_OR_NULL(di)) {
1849 btrfs_free_path(path); 1859 btrfs_free_path(path);
1850 btrfs_end_transaction(trans, root); 1860 btrfs_end_transaction(trans, root);
1851 printk(KERN_ERR "Umm, you don't have the default dir item, " 1861 printk(KERN_ERR "Umm, you don't have the default dir item, "
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
124 return 1; 124 return 1;
125} 125}
126 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
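range_overlaps() above is the standard half-open interval test: [a, a+alen) and [b, b+blen) are disjoint precisely when one ends at or before the other begins, and overlap otherwise. A small standalone version with the two boundary cases exercised:

#include <stdint.h>
#include <stdio.h>

/* half-open [start, start + len) overlap, mirroring range_overlaps() */
static int ranges_overlap(uint64_t a, uint64_t alen,
			  uint64_t b, uint64_t blen)
{
	if (a + alen <= b || b + blen <= a)
		return 0;	/* one range ends before the other begins */
	return 1;
}

int main(void)
{
	printf("%d\n", ranges_overlap(0, 10, 10, 5));	/* 0: touching only */
	printf("%d\n", ranges_overlap(0, 11, 10, 5));	/* 1: one unit shared */
	return 0;
}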
127/* 136/*
128 * find the first ordered struct that has this offset, otherwise 137 * find the first ordered struct that has this offset, otherwise
129 * the first one less than this offset 138 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
161 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
162 * inserted. 171 * inserted.
163 */ 172 */
164int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
165 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
166{ 176{
167 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
168 struct rb_node *node; 178 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
182 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
183 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
184 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
185 /* one ref for the tree */ 198 /* one ref for the tree */
186 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
187 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
203 return 0; 216 return 0;
204} 217}
205 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
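The refactoring above funnels both entry points through one internal __btrfs_add_ordered_extent() that takes a dio flag, so buffered and direct-IO callers share the setup code and cannot disagree on the flag. A skeletal model of that wrapper pattern (names and printf bodies are placeholders):

#include <stdio.h>

/*
 * one internal helper carries the full parameter list; the two
 * public wrappers pin the dio flag so callers cannot get it wrong.
 */
static int __add_ordered_extent(unsigned long long file_offset,
				unsigned long long len, int dio)
{
	printf("extent at %llu, len %llu%s\n",
	       file_offset, len, dio ? " (direct IO)" : "");
	return 0;
}

static int add_ordered_extent(unsigned long long file_offset,
			      unsigned long long len)
{
	return __add_ordered_extent(file_offset, len, 0);
}

static int add_ordered_extent_dio(unsigned long long file_offset,
				  unsigned long long len)
{
	return __add_ordered_extent(file_offset, len, 1);
}

int main(void)
{
	add_ordered_extent(0, 4096);
	add_ordered_extent_dio(4096, 4096);
	return 0;
}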
206/* 233/*
207 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
208 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
316 BTRFS_I(inode)->outstanding_extents--;
317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
319 inode, 1);
320
321 spin_lock(&root->fs_info->ordered_extent_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
322 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
323 343
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
491 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
492 * for pdflush to find them 512 * for pdflush to find them
493 */ 513 */
494 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
495 if (wait) { 516 if (wait) {
496 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
497 &entry->flags)); 518 &entry->flags));
@@ -588,6 +609,47 @@ out:
588 return entry; 609 return entry;
589} 610}
590 611
612/* Since the DIO code tries to lock a wide area, we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
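btrfs_lookup_ordered_range() above searches from the first candidate node and walks forward until it either finds an overlapping extent or passes the end of the query range. A simplified model of that walk over a sorted array instead of an rb-tree, with no locking or reference counting:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t off, len; };

/* sorted-by-offset stand-in for the rb-tree */
static struct extent tree[] = { {0, 64}, {128, 64}, {512, 64} };

static int overlaps(const struct extent *e, uint64_t off, uint64_t len)
{
	return !(off + len <= e->off || e->off + e->len <= off);
}

static struct extent *lookup_range(uint64_t off, uint64_t len)
{
	size_t i;

	for (i = 0; i < sizeof(tree) / sizeof(tree[0]); i++) {
		if (overlaps(&tree[i], off, len))
			return &tree[i];
		if (tree[i].off >= off + len)
			break;	/* every later entry is disjoint too */
	}
	return NULL;
}

int main(void)
{
	struct extent *e = lookup_range(100, 100);	/* hits {128, 64} */

	if (e)
		printf("found extent at %llu\n", (unsigned long long)e->off);
	return 0;
}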
591/* 653/*
592 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
593 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
44struct backref_node { 44struct backref_node {
45 struct rb_node rb_node; 45 struct rb_node rb_node;
46 u64 bytenr; 46 u64 bytenr;
47 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner; may not be up to date */
48 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
49 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
50 struct list_head upper; 54 struct list_head upper;
51 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
56 struct extent_buffer *eb; 60 struct extent_buffer *eb;
57 /* level of tree block */ 61 /* level of tree block */
58 unsigned int level:8; 62 unsigned int level:8;
59 /* 1 if the block is root of old snapshot */ 63 /* is the block in a non-reference-counted tree */
60 unsigned int old_root:1; 64 unsigned int cowonly:1;
61 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
62 unsigned int lowest:1; 66 unsigned int lowest:1;
63 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
64 unsigned int locked:1; 68 unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
66 unsigned int processed:1; 70 unsigned int processed:1;
67 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
68 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
69}; 83};
70 84
71/* 85/*
@@ -74,7 +88,6 @@ struct backref_node {
74struct backref_edge { 88struct backref_edge {
75 struct list_head list[2]; 89 struct list_head list[2];
76 struct backref_node *node[2]; 90 struct backref_node *node[2];
77 u64 blockptr;
78}; 91};
79 92
80#define LOWER 0 93#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
83struct backref_cache { 96struct backref_cache {
84 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
85 struct rb_root rb_root; 98 struct rb_root rb_root;
86 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
87 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
88 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref nodes */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
89}; 118};
90 119
91/* 120/*
@@ -113,15 +142,6 @@ struct tree_block {
113 unsigned int key_ready:1; 142 unsigned int key_ready:1;
114}; 143};
115 144
116/* inode vector */
117#define INODEVEC_SIZE 16
118
119struct inodevec {
120 struct list_head list;
121 struct inode *inode[INODEVEC_SIZE];
122 int nr;
123};
124
125#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
126 146
127struct file_extent_cluster { 147struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
138 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
139 /* inode for moving data */ 159 /* inode for moving data */
140 struct inode *data_inode; 160 struct inode *data_inode;
141 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
142 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
143 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
144 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
145 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
146 /* list of reloc trees */ 171 /* list of reloc trees */
147 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
148 u64 search_start; 178 u64 search_start;
149 u64 extents_found; 179 u64 extents_found;
150 u64 extents_skipped; 180
151 int stage; 181 int block_rsv_retries;
152 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
153 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
154 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
155}; 188};
156 189
157/* stages of data relocation */ 190/* stages of data relocation */
158#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
159#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
160 193
161/* 194static void remove_backref_node(struct backref_cache *cache,
162 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
163 */ 196static void __mark_block_processed(struct reloc_control *rc,
164struct async_merge { 197 struct backref_node *node);
165 struct btrfs_work work;
166 struct reloc_control *rc;
167 struct btrfs_root *root;
168 struct completion *done;
169 atomic_t *num_pending;
170};
171 198
172static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
173{ 200{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
181 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
182 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
184 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
185} 276}
186 277
187static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
188{ 280{
189 memset(node, 0, sizeof(*node)); 281 if (edge) {
190 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
191 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
192 RB_CLEAR_NODE(&node->rb_node); 284 }
193} 285}
194 286
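alloc_backref_node()/free_backref_node() and their edge counterparts pair every allocation with a per-cache counter, which is what lets backref_cache_cleanup() assert BUG_ON(cache->nr_nodes) to catch leaks at teardown. A minimal model of the counted-allocator pattern:

#include <stdio.h>
#include <stdlib.h>

struct node { int payload; };

struct cache { int nr_nodes; };	/* outstanding allocations */

static struct node *alloc_node(struct cache *c)
{
	struct node *n = calloc(1, sizeof(*n));	/* zeroed, like kzalloc */

	if (n)
		c->nr_nodes++;
	return n;
}

static void free_node(struct cache *c, struct node *n)
{
	if (n) {
		c->nr_nodes--;
		free(n);
	}
}

int main(void)
{
	struct cache c = { 0 };
	struct node *n = alloc_node(&c);

	free_node(&c, n);
	/* nonzero here would mean a leak; the diff asserts this
	 * with BUG_ON(cache->nr_nodes) in backref_cache_cleanup() */
	printf("outstanding nodes: %d\n", c.nr_nodes);
	return 0;
}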
195static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
250 edges[idx++] = edge; 342 edges[idx++] = edge;
251 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
252 } 344 }
345 BUG_ON(node->detached);
253 *index = idx; 346 *index = idx;
254 return node; 347 return node;
255} 348}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
281 return NULL; 374 return NULL;
282} 375}
283 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
284static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
285{ 386{
286 if (node->eb) { 387 if (node->eb) {
287 if (node->locked) { 388 unlock_node_buffer(node);
288 btrfs_tree_unlock(node->eb);
289 node->locked = 0;
290 }
291 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
292 node->eb = NULL; 390 node->eb = NULL;
293 } 391 }
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
296static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
297 struct backref_node *node) 395 struct backref_node *node)
298{ 396{
299 BUG_ON(!node->lowest);
300 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
301 398
302 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
303 list_del(&node->lower); 401 list_del(&node->lower);
304 402 if (!RB_EMPTY_NODE(&node->rb_node))
305 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
306 kfree(node); 404 free_backref_node(tree, node);
307} 405}
308 406
309/* 407/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
318 if (!node) 416 if (!node)
319 return; 417 return;
320 418
321 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
322 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
323 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
324 list[LOWER]); 422 list[LOWER]);
325 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
326 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
327 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
328 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
329 /* 435 /*
330 * add the node to pending list if no other 436 * add the node to leaf node list if no other
331 * child block cached. 437 * child block cached.
332 */ 438 */
333 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
334 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
335 &cache->pending[upper->level]);
336 upper->lowest = 1; 441 upper->lowest = 1;
337 } 442 }
338 } 443 }
444
339 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
340} 446}
341 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookups. a transaction commit changes the extent tree,
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors while processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
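update_backref_cache() treats the transaction id as a generation tag: last_trans == 0 means the cache hasn't been bound to a transaction yet, a matching transid means the cache is still valid, and a mismatch means a commit happened and stale state must be dropped. A stripped-down sketch of that three-way check:

#include <stdint.h>
#include <stdio.h>

struct cache { uint64_t last_trans; };

/* returns 1 when cached state is stale and must be dropped */
static int cache_is_stale(struct cache *c, uint64_t transid)
{
	if (c->last_trans == 0) {	/* first use in this transaction */
		c->last_trans = transid;
		return 0;
	}
	if (c->last_trans == transid)	/* same transaction: still valid */
		return 0;

	c->last_trans = 0;		/* commit happened: re-arm the tag */
	return 1;
}

int main(void)
{
	struct cache c = { 0 };

	printf("%d\n", cache_is_stale(&c, 7));	/* 0: bound to trans 7 */
	printf("%d\n", cache_is_stale(&c, 7));	/* 0: unchanged */
	printf("%d\n", cache_is_stale(&c, 8));	/* 1: invalidated by commit */
	return 0;
}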
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is a reloc tree and it was created in a previous
527 * transaction, backref lookup can find the reloc tree,
528 * so the backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
342/* 534/*
343 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
344 */ 536 */
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
453 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
454 * block are also cached. 646 * block are also cached.
455 */ 647 */
456static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
457 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
458 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
459 int level, u64 bytenr) 651 int level, u64 bytenr)
460{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
461 struct btrfs_path *path1; 654 struct btrfs_path *path1;
462 struct btrfs_path *path2; 655 struct btrfs_path *path2;
463 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
473 unsigned long end; 666 unsigned long end;
474 unsigned long ptr; 667 unsigned long ptr;
475 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
476 int ret; 671 int ret;
477 int err = 0; 672 int err = 0;
478 673
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
483 goto out; 678 goto out;
484 } 679 }
485 680
486 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
487 if (!node) { 682 if (!node) {
488 err = -ENOMEM; 683 err = -ENOMEM;
489 goto out; 684 goto out;
490 } 685 }
491 686
492 backref_node_init(node);
493 node->bytenr = bytenr; 687 node->bytenr = bytenr;
494 node->owner = 0;
495 node->level = level; 688 node->level = level;
496 node->lowest = 1; 689 node->lowest = 1;
497 cur = node; 690 cur = node;
@@ -587,17 +780,21 @@ again:
587#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 key.type == BTRFS_EXTENT_REF_V0_KEY) {
592 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
593 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
594 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
595 root = find_tree_root(rc, eb, ref0); 787 if (key.objectid == key.offset) {
596 if (root) 788 root = find_tree_root(rc, eb, ref0);
597 cur->root = root; 789 if (root && !should_ignore_root(root))
598 else 790 cur->root = root;
599 cur->old_root = 1; 791 else
600 break; 792 list_add(&cur->list, &useless);
793 break;
794 }
795 if (is_cowonly_root(btrfs_ref_root_v0(eb,
796 ref0)))
797 cur->cowonly = 1;
601 } 798 }
602#else 799#else
603 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 800 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +811,20 @@ again:
614 break; 811 break;
615 } 812 }
616 813
617 edge = kzalloc(sizeof(*edge), GFP_NOFS); 814 edge = alloc_backref_edge(cache);
618 if (!edge) { 815 if (!edge) {
619 err = -ENOMEM; 816 err = -ENOMEM;
620 goto out; 817 goto out;
621 } 818 }
622 rb_node = tree_search(&cache->rb_root, key.offset); 819 rb_node = tree_search(&cache->rb_root, key.offset);
623 if (!rb_node) { 820 if (!rb_node) {
624 upper = kmalloc(sizeof(*upper), GFP_NOFS); 821 upper = alloc_backref_node(cache);
625 if (!upper) { 822 if (!upper) {
626 kfree(edge); 823 free_backref_edge(cache, edge);
627 err = -ENOMEM; 824 err = -ENOMEM;
628 goto out; 825 goto out;
629 } 826 }
630 backref_node_init(upper);
631 upper->bytenr = key.offset; 827 upper->bytenr = key.offset;
632 upper->owner = 0;
633 upper->level = cur->level + 1; 828 upper->level = cur->level + 1;
634 /* 829 /*
635 * backrefs for the upper level block aren't 830 * backrefs for the upper level block aren't
@@ -639,11 +834,12 @@ again:
639 } else { 834 } else {
640 upper = rb_entry(rb_node, struct backref_node, 835 upper = rb_entry(rb_node, struct backref_node,
641 rb_node); 836 rb_node);
837 BUG_ON(!upper->checked);
642 INIT_LIST_HEAD(&edge->list[UPPER]); 838 INIT_LIST_HEAD(&edge->list[UPPER]);
643 } 839 }
644 list_add(&edge->list[LOWER], &cur->upper); 840 list_add_tail(&edge->list[LOWER], &cur->upper);
645 edge->node[UPPER] = upper;
646 edge->node[LOWER] = cur; 841 edge->node[LOWER] = cur;
842 edge->node[UPPER] = upper;
647 843
648 goto next; 844 goto next;
649 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 845 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +853,17 @@ again:
657 goto out; 853 goto out;
658 } 854 }
659 855
856 if (!root->ref_cows)
857 cur->cowonly = 1;
858
660 if (btrfs_root_level(&root->root_item) == cur->level) { 859 if (btrfs_root_level(&root->root_item) == cur->level) {
661 /* tree root */ 860 /* tree root */
662 BUG_ON(btrfs_root_bytenr(&root->root_item) != 861 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663 cur->bytenr); 862 cur->bytenr);
664 cur->root = root; 863 if (should_ignore_root(root))
864 list_add(&cur->list, &useless);
865 else
866 cur->root = root;
665 break; 867 break;
666 } 868 }
667 869
@@ -692,11 +894,14 @@ again:
692 if (!path2->nodes[level]) { 894 if (!path2->nodes[level]) {
693 BUG_ON(btrfs_root_bytenr(&root->root_item) != 895 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694 lower->bytenr); 896 lower->bytenr);
695 lower->root = root; 897 if (should_ignore_root(root))
898 list_add(&lower->list, &useless);
899 else
900 lower->root = root;
696 break; 901 break;
697 } 902 }
698 903
699 edge = kzalloc(sizeof(*edge), GFP_NOFS); 904 edge = alloc_backref_edge(cache);
700 if (!edge) { 905 if (!edge) {
701 err = -ENOMEM; 906 err = -ENOMEM;
702 goto out; 907 goto out;
@@ -705,16 +910,17 @@ again:
705 eb = path2->nodes[level]; 910 eb = path2->nodes[level];
706 rb_node = tree_search(&cache->rb_root, eb->start); 911 rb_node = tree_search(&cache->rb_root, eb->start);
707 if (!rb_node) { 912 if (!rb_node) {
708 upper = kmalloc(sizeof(*upper), GFP_NOFS); 913 upper = alloc_backref_node(cache);
709 if (!upper) { 914 if (!upper) {
710 kfree(edge); 915 free_backref_edge(cache, edge);
711 err = -ENOMEM; 916 err = -ENOMEM;
712 goto out; 917 goto out;
713 } 918 }
714 backref_node_init(upper);
715 upper->bytenr = eb->start; 919 upper->bytenr = eb->start;
716 upper->owner = btrfs_header_owner(eb); 920 upper->owner = btrfs_header_owner(eb);
717 upper->level = lower->level + 1; 921 upper->level = lower->level + 1;
922 if (!root->ref_cows)
923 upper->cowonly = 1;
718 924
719 /* 925 /*
720 * if we know the block isn't shared 926 * if we know the block isn't shared
@@ -744,10 +950,12 @@ again:
744 rb_node); 950 rb_node);
745 BUG_ON(!upper->checked); 951 BUG_ON(!upper->checked);
746 INIT_LIST_HEAD(&edge->list[UPPER]); 952 INIT_LIST_HEAD(&edge->list[UPPER]);
953 if (!upper->owner)
954 upper->owner = btrfs_header_owner(eb);
747 } 955 }
748 list_add_tail(&edge->list[LOWER], &lower->upper); 956 list_add_tail(&edge->list[LOWER], &lower->upper);
749 edge->node[UPPER] = upper;
750 edge->node[LOWER] = lower; 957 edge->node[LOWER] = lower;
958 edge->node[UPPER] = upper;
751 959
752 if (rb_node) 960 if (rb_node)
753 break; 961 break;
@@ -785,8 +993,13 @@ next:
785 * into the cache. 993 * into the cache.
786 */ 994 */
787 BUG_ON(!node->checked); 995 BUG_ON(!node->checked);
788 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 996 cowonly = node->cowonly;
789 BUG_ON(rb_node); 997 if (!cowonly) {
998 rb_node = tree_insert(&cache->rb_root, node->bytenr,
999 &node->rb_node);
1000 BUG_ON(rb_node);
1001 list_add_tail(&node->lower, &cache->leaves);
1002 }
790 1003
791 list_for_each_entry(edge, &node->upper, list[LOWER]) 1004 list_for_each_entry(edge, &node->upper, list[LOWER])
792 list_add_tail(&edge->list[UPPER], &list); 1005 list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1008,14 @@ next:
795 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1008 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796 list_del_init(&edge->list[UPPER]); 1009 list_del_init(&edge->list[UPPER]);
797 upper = edge->node[UPPER]; 1010 upper = edge->node[UPPER];
1011 if (upper->detached) {
1012 list_del(&edge->list[LOWER]);
1013 lower = edge->node[LOWER];
1014 free_backref_edge(cache, edge);
1015 if (list_empty(&lower->upper))
1016 list_add(&lower->list, &useless);
1017 continue;
1018 }
798 1019
799 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1020 if (!RB_EMPTY_NODE(&upper->rb_node)) {
800 if (upper->lowest) { 1021 if (upper->lowest) {
@@ -807,25 +1028,69 @@ next:
807 } 1028 }
808 1029
809 BUG_ON(!upper->checked); 1030 BUG_ON(!upper->checked);
810 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1031 BUG_ON(cowonly != upper->cowonly);
811 &upper->rb_node); 1032 if (!cowonly) {
812 BUG_ON(rb_node); 1033 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1034 &upper->rb_node);
1035 BUG_ON(rb_node);
1036 }
813 1037
814 list_add_tail(&edge->list[UPPER], &upper->lower); 1038 list_add_tail(&edge->list[UPPER], &upper->lower);
815 1039
816 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1040 list_for_each_entry(edge, &upper->upper, list[LOWER])
817 list_add_tail(&edge->list[UPPER], &list); 1041 list_add_tail(&edge->list[UPPER], &list);
818 } 1042 }
1043 /*
1044 * process useless backref nodes. backref nodes for tree leaves
1045 * are deleted from the cache. backref nodes for upper level
1046 * tree blocks are left in the cache to avoid unnecessary backref
1047 * lookups.
1048 */
1049 while (!list_empty(&useless)) {
1050 upper = list_entry(useless.next, struct backref_node, list);
1051 list_del_init(&upper->list);
1052 BUG_ON(!list_empty(&upper->upper));
1053 if (upper == node)
1054 node = NULL;
1055 if (upper->lowest) {
1056 list_del_init(&upper->lower);
1057 upper->lowest = 0;
1058 }
1059 while (!list_empty(&upper->lower)) {
1060 edge = list_entry(upper->lower.next,
1061 struct backref_edge, list[UPPER]);
1062 list_del(&edge->list[UPPER]);
1063 list_del(&edge->list[LOWER]);
1064 lower = edge->node[LOWER];
1065 free_backref_edge(cache, edge);
1066
1067 if (list_empty(&lower->upper))
1068 list_add(&lower->list, &useless);
1069 }
1070 __mark_block_processed(rc, upper);
1071 if (upper->level > 0) {
1072 list_add(&upper->list, &cache->detached);
1073 upper->detached = 1;
1074 } else {
1075 rb_erase(&upper->rb_node, &cache->rb_root);
1076 free_backref_node(cache, upper);
1077 }
1078 }
819out: 1079out:
820 btrfs_free_path(path1); 1080 btrfs_free_path(path1);
821 btrfs_free_path(path2); 1081 btrfs_free_path(path2);
822 if (err) { 1082 if (err) {
823 INIT_LIST_HEAD(&list); 1083 while (!list_empty(&useless)) {
1084 lower = list_entry(useless.next,
1085 struct backref_node, upper);
1086 list_del_init(&lower->upper);
1087 }
824 upper = node; 1088 upper = node;
1089 INIT_LIST_HEAD(&list);
825 while (upper) { 1090 while (upper) {
826 if (RB_EMPTY_NODE(&upper->rb_node)) { 1091 if (RB_EMPTY_NODE(&upper->rb_node)) {
827 list_splice_tail(&upper->upper, &list); 1092 list_splice_tail(&upper->upper, &list);
828 kfree(upper); 1093 free_backref_node(cache, upper);
829 } 1094 }
830 1095
831 if (list_empty(&list)) 1096 if (list_empty(&list))
@@ -833,15 +1098,104 @@ out:
833 1098
834 edge = list_entry(list.next, struct backref_edge, 1099 edge = list_entry(list.next, struct backref_edge,
835 list[LOWER]); 1100 list[LOWER]);
1101 list_del(&edge->list[LOWER]);
836 upper = edge->node[UPPER]; 1102 upper = edge->node[UPPER];
837 kfree(edge); 1103 free_backref_edge(cache, edge);
838 } 1104 }
839 return ERR_PTR(err); 1105 return ERR_PTR(err);
840 } 1106 }
1107 BUG_ON(node && node->detached);
841 return node; 1108 return node;
842} 1109}
843 1110
844/* 1111/*
1112 * helper to add a backref node for the newly created snapshot.
1113 * the backref node is created by cloning the backref node that
1114 * corresponds to the root of the source tree
1115 */
1116static int clone_backref_node(struct btrfs_trans_handle *trans,
1117 struct reloc_control *rc,
1118 struct btrfs_root *src,
1119 struct btrfs_root *dest)
1120{
1121 struct btrfs_root *reloc_root = src->reloc_root;
1122 struct backref_cache *cache = &rc->backref_cache;
1123 struct backref_node *node = NULL;
1124 struct backref_node *new_node;
1125 struct backref_edge *edge;
1126 struct backref_edge *new_edge;
1127 struct rb_node *rb_node;
1128
1129 if (cache->last_trans > 0)
1130 update_backref_cache(trans, cache);
1131
1132 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1133 if (rb_node) {
1134 node = rb_entry(rb_node, struct backref_node, rb_node);
1135 if (node->detached)
1136 node = NULL;
1137 else
1138 BUG_ON(node->new_bytenr != reloc_root->node->start);
1139 }
1140
1141 if (!node) {
1142 rb_node = tree_search(&cache->rb_root,
1143 reloc_root->commit_root->start);
1144 if (rb_node) {
1145 node = rb_entry(rb_node, struct backref_node,
1146 rb_node);
1147 BUG_ON(node->detached);
1148 }
1149 }
1150
1151 if (!node)
1152 return 0;
1153
1154 new_node = alloc_backref_node(cache);
1155 if (!new_node)
1156 return -ENOMEM;
1157
1158 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level;
1160 new_node->lowest = node->lowest;
1161 new_node->root = dest;
1162
1163 if (!node->lowest) {
1164 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1165 new_edge = alloc_backref_edge(cache);
1166 if (!new_edge)
1167 goto fail;
1168
1169 new_edge->node[UPPER] = new_node;
1170 new_edge->node[LOWER] = edge->node[LOWER];
1171 list_add_tail(&new_edge->list[UPPER],
1172 &new_node->lower);
1173 }
1174 }
1175
1176 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1177 &new_node->rb_node);
1178 BUG_ON(rb_node);
1179
1180 if (!new_node->lowest) {
1181 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1182 list_add_tail(&new_edge->list[LOWER],
1183 &new_edge->node[LOWER]->upper);
1184 }
1185 }
1186 return 0;
1187fail:
1188 while (!list_empty(&new_node->lower)) {
1189 new_edge = list_entry(new_node->lower.next,
1190 struct backref_edge, list[UPPER]);
1191 list_del(&new_edge->list[UPPER]);
1192 free_backref_edge(cache, new_edge);
1193 }
1194 free_backref_node(cache, new_node);
1195 return -ENOMEM;
1196}
1197
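clone_backref_node() above builds the new node's edge list incrementally and, if any allocation fails, walks back over the partially built list freeing what was created before returning -ENOMEM. A standalone sketch of that build-then-unwind shape:

#include <stdio.h>
#include <stdlib.h>

struct edge { int payload; };

/*
 * allocate n edges; on failure free only what was already built,
 * then report failure -- the same shape as the fail: path above.
 */
static int clone_edges(struct edge **edges, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		edges[i] = calloc(1, sizeof(struct edge));
		if (!edges[i])
			goto fail;
	}
	return 0;

fail:
	while (i-- > 0) {	/* unwind the partial list */
		free(edges[i]);
		edges[i] = NULL;
	}
	return -1;
}

int main(void)
{
	struct edge *edges[4] = { 0 };
	int i;

	if (clone_edges(edges, 4) == 0)
		puts("clone complete");
	for (i = 0; i < 4; i++)
		free(edges[i]);
	return 0;
}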
1198/*
845 * helper to add 'address of tree root -> reloc tree' mapping 1199 * helper to add 'address of tree root -> reloc tree' mapping
846 */ 1200 */
847static int __add_reloc_root(struct btrfs_root *root) 1201static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
901 return 0; 1255 return 0;
902} 1256}
903 1257
904/* 1258static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
905 * create reloc tree for a given fs tree. reloc tree is just a 1259 struct btrfs_root *root, u64 objectid)
906 * snapshot of the fs tree with special root objectid.
907 */
908int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909 struct btrfs_root *root)
910{ 1260{
911 struct btrfs_root *reloc_root; 1261 struct btrfs_root *reloc_root;
912 struct extent_buffer *eb; 1262 struct extent_buffer *eb;
@@ -914,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
914 struct btrfs_key root_key; 1264 struct btrfs_key root_key;
915 int ret; 1265 int ret;
916 1266
917 if (root->reloc_root) {
918 reloc_root = root->reloc_root;
919 reloc_root->last_trans = trans->transid;
920 return 0;
921 }
922
923 if (!root->fs_info->reloc_ctl ||
924 !root->fs_info->reloc_ctl->create_reloc_root ||
925 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926 return 0;
927
928 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1267 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929 BUG_ON(!root_item); 1268 BUG_ON(!root_item);
930 1269
931 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1270 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932 root_key.type = BTRFS_ROOT_ITEM_KEY; 1271 root_key.type = BTRFS_ROOT_ITEM_KEY;
933 root_key.offset = root->root_key.objectid; 1272 root_key.offset = objectid;
934 1273
935 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1274 if (root->root_key.objectid == objectid) {
936 BTRFS_TREE_RELOC_OBJECTID); 1275 /* called by btrfs_init_reloc_root */
937 BUG_ON(ret); 1276 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1277 BTRFS_TREE_RELOC_OBJECTID);
1278 BUG_ON(ret);
1279
1280 btrfs_set_root_last_snapshot(&root->root_item,
1281 trans->transid - 1);
1282 } else {
1283 /*
1284 * called by btrfs_reloc_post_snapshot_hook.
1285 * the source tree is a reloc tree; all tree blocks
1286 * modified after it was created have the RELOC flag
1287 * set in their headers, so it's OK not to update
1288 * the 'last_snapshot'.
1289 */
1290 ret = btrfs_copy_root(trans, root, root->node, &eb,
1291 BTRFS_TREE_RELOC_OBJECTID);
1292 BUG_ON(ret);
1293 }
938 1294
939 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1295 memcpy(root_item, &root->root_item, sizeof(*root_item));
941 btrfs_set_root_refs(root_item, 1);
942 btrfs_set_root_bytenr(root_item, eb->start); 1296 btrfs_set_root_bytenr(root_item, eb->start);
943 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1297 btrfs_set_root_level(root_item, btrfs_header_level(eb));
944 btrfs_set_root_generation(root_item, trans->transid); 1298 btrfs_set_root_generation(root_item, trans->transid);
945 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1299
946 root_item->drop_level = 0; 1300 if (root->root_key.objectid == objectid) {
1301 btrfs_set_root_refs(root_item, 0);
1302 memset(&root_item->drop_progress, 0,
1303 sizeof(struct btrfs_disk_key));
1304 root_item->drop_level = 0;
1305 }
947 1306
948 btrfs_tree_unlock(eb); 1307 btrfs_tree_unlock(eb);
949 free_extent_buffer(eb); 1308 free_extent_buffer(eb);
@@ -957,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
957 &root_key); 1316 &root_key);
958 BUG_ON(IS_ERR(reloc_root)); 1317 BUG_ON(IS_ERR(reloc_root));
959 reloc_root->last_trans = trans->transid; 1318 reloc_root->last_trans = trans->transid;
1319 return reloc_root;
1320}
1321
1322/*
1323 * create reloc tree for a given fs tree. reloc tree is just a
1324 * snapshot of the fs tree with special root objectid.
1325 */
1326int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 struct btrfs_root *reloc_root;
1330 struct reloc_control *rc = root->fs_info->reloc_ctl;
1331 int clear_rsv = 0;
1332
1333 if (root->reloc_root) {
1334 reloc_root = root->reloc_root;
1335 reloc_root->last_trans = trans->transid;
1336 return 0;
1337 }
1338
1339 if (!rc || !rc->create_reloc_tree ||
1340 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1341 return 0;
1342
1343 if (!trans->block_rsv) {
1344 trans->block_rsv = rc->block_rsv;
1345 clear_rsv = 1;
1346 }
1347 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1348 if (clear_rsv)
1349 trans->block_rsv = NULL;
960 1350
961 __add_reloc_root(reloc_root); 1351 __add_reloc_root(reloc_root);
962 root->reloc_root = reloc_root; 1352 root->reloc_root = reloc_root;
@@ -980,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
980 reloc_root = root->reloc_root; 1370 reloc_root = root->reloc_root;
981 root_item = &reloc_root->root_item; 1371 root_item = &reloc_root->root_item;
982 1372
983 if (btrfs_root_refs(root_item) == 0) { 1373 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1374 btrfs_root_refs(root_item) == 0) {
984 root->reloc_root = NULL; 1375 root->reloc_root = NULL;
985 del = 1; 1376 del = 1;
986 } 1377 }
@@ -1102,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1102 goto out; 1493 goto out;
1103 } 1494 }
1104 1495
1105 if (new_bytenr) 1496 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107 ret = 0; 1497 ret = 0;
1108out: 1498out:
1109 btrfs_free_path(path); 1499 btrfs_free_path(path);
@@ -1114,19 +1504,18 @@ out:
1114 * update file extent items in the tree leaf to point to 1504 * update file extent items in the tree leaf to point to
1115 * the new locations. 1505 * the new locations.
1116 */ 1506 */
1117static int replace_file_extents(struct btrfs_trans_handle *trans, 1507static noinline_for_stack
1118 struct reloc_control *rc, 1508int replace_file_extents(struct btrfs_trans_handle *trans,
1119 struct btrfs_root *root, 1509 struct reloc_control *rc,
1120 struct extent_buffer *leaf, 1510 struct btrfs_root *root,
1121 struct list_head *inode_list) 1511 struct extent_buffer *leaf)
1122{ 1512{
1123 struct btrfs_key key; 1513 struct btrfs_key key;
1124 struct btrfs_file_extent_item *fi; 1514 struct btrfs_file_extent_item *fi;
1125 struct inode *inode = NULL; 1515 struct inode *inode = NULL;
1126 struct inodevec *ivec = NULL;
1127 u64 parent; 1516 u64 parent;
1128 u64 bytenr; 1517 u64 bytenr;
1129 u64 new_bytenr; 1518 u64 new_bytenr = 0;
1130 u64 num_bytes; 1519 u64 num_bytes;
1131 u64 end; 1520 u64 end;
1132 u32 nritems; 1521 u32 nritems;
@@ -1166,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1166 * to complete and drop the extent cache 1555 * to complete and drop the extent cache
1167 */ 1556 */
1168 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1557 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171 BUG_ON(!ivec);
1172 ivec->nr = 0;
1173 list_add_tail(&ivec->list, inode_list);
1174 }
1175 if (first) { 1558 if (first) {
1176 inode = find_next_inode(root, key.objectid); 1559 inode = find_next_inode(root, key.objectid);
1177 if (inode)
1178 ivec->inode[ivec->nr++] = inode;
1179 first = 0; 1560 first = 0;
1180 } else if (inode && inode->i_ino < key.objectid) { 1561 } else if (inode && inode->i_ino < key.objectid) {
1562 btrfs_add_delayed_iput(inode);
1181 inode = find_next_inode(root, key.objectid); 1563 inode = find_next_inode(root, key.objectid);
1182 if (inode)
1183 ivec->inode[ivec->nr++] = inode;
1184 } 1564 }
1185 if (inode && inode->i_ino == key.objectid) { 1565 if (inode && inode->i_ino == key.objectid) {
1186 end = key.offset + 1566 end = key.offset +
@@ -1204,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1204 1584
1205 ret = get_new_location(rc->data_inode, &new_bytenr, 1585 ret = get_new_location(rc->data_inode, &new_bytenr,
1206 bytenr, num_bytes); 1586 bytenr, num_bytes);
1207 if (ret > 0) 1587 if (ret > 0) {
1588 WARN_ON(1);
1208 continue; 1589 continue;
1590 }
1209 BUG_ON(ret < 0); 1591 BUG_ON(ret < 0);
1210 1592
1211 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1593 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1225 } 1607 }
1226 if (dirty) 1608 if (dirty)
1227 btrfs_mark_buffer_dirty(leaf); 1609 btrfs_mark_buffer_dirty(leaf);
1610 if (inode)
1611 btrfs_add_delayed_iput(inode);
1228 return 0; 1612 return 0;
1229} 1613}
1230 1614
@@ -1248,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1248 * if no block got replaced, 0 is returned. if there are other 1632 * if no block got replaced, 0 is returned. if there are other
1249 * errors, a negative error number is returned. 1633 * errors, a negative error number is returned.
1250 */ 1634 */
1251static int replace_path(struct btrfs_trans_handle *trans, 1635static noinline_for_stack
1252 struct btrfs_root *dest, struct btrfs_root *src, 1636int replace_path(struct btrfs_trans_handle *trans,
1253 struct btrfs_path *path, struct btrfs_key *next_key, 1637 struct btrfs_root *dest, struct btrfs_root *src,
1254 struct extent_buffer **leaf, 1638 struct btrfs_path *path, struct btrfs_key *next_key,
1255 int lowest_level, int max_level) 1639 int lowest_level, int max_level)
1256{ 1640{
1257 struct extent_buffer *eb; 1641 struct extent_buffer *eb;
1258 struct extent_buffer *parent; 1642 struct extent_buffer *parent;
@@ -1263,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
 	u64 new_ptr_gen;
 	u64 last_snapshot;
 	u32 blocksize;
+	int cow = 0;
 	int level;
 	int ret;
 	int slot;
 
 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
-	BUG_ON(lowest_level > 1 && leaf);
 
 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
-
+again:
 	slot = path->slots[lowest_level];
 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
 
@@ -1286,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
-	BUG_ON(ret);
+	if (cow) {
+		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+		BUG_ON(ret);
+	}
 	btrfs_set_lock_blocking(eb);
 
 	if (next_key) {
@@ -1331,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
 
 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
 		    memcmp_node_keys(parent, slot, path, level)) {
-			if (level <= lowest_level && !leaf) {
+			if (level <= lowest_level) {
 				ret = 0;
 				break;
 			}
@@ -1339,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
 			eb = read_tree_block(dest, old_bytenr, blocksize,
 					     old_ptr_gen);
 			btrfs_tree_lock(eb);
-			ret = btrfs_cow_block(trans, dest, eb, parent,
-					      slot, &eb);
-			BUG_ON(ret);
-			btrfs_set_lock_blocking(eb);
-
-			if (level <= lowest_level) {
-				*leaf = eb;
-				ret = 0;
-				break;
+			if (cow) {
+				ret = btrfs_cow_block(trans, dest, eb, parent,
+						      slot, &eb);
+				BUG_ON(ret);
 			}
+			btrfs_set_lock_blocking(eb);
 
 			btrfs_tree_unlock(parent);
 			free_extent_buffer(parent);
@@ -1357,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		if (!cow) {
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+			cow = 1;
+			goto again;
+		}
+
 		btrfs_node_key_to_cpu(path->nodes[level], &key,
 				      path->slots[level]);
 		btrfs_release_path(src, path);
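
The cow flag added above turns replace_path() into a two-pass walk: a cheap read-only pass first, restarted with COW enabled only once a pointer actually needs replacing. The control-flow shape in isolation (illustrative stand-ins, not the btrfs functions):

#include <stdbool.h>
#include <stdio.h>

static bool needs_swap(int slot) { return slot == 3; }	/* stand-in */
static void cow_and_swap(int slot) { printf("swapped slot %d\n", slot); }

/* First pass is read-only and cheap; the COW pass only runs when the
 * read-only pass proves there is real work to do. */
static int replace_with_retry(int nr_slots)
{
	bool cow = false;
	int replaced = 0;

again:
	for (int slot = 0; slot < nr_slots; slot++) {
		if (!needs_swap(slot))
			continue;
		if (!cow) {
			cow = true;	/* drop read-only mode, redo with COW */
			goto again;
		}
		cow_and_swap(slot);
		replaced++;
	}
	return replaced;
}

int main(void)
{
	printf("replaced %d\n", replace_with_retry(8));
	return 0;
}
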
@@ -1562,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
 	return 0;
 }
 
-static void put_inodes(struct list_head *list)
-{
-	struct inodevec *ivec;
-	while (!list_empty(list)) {
-		ivec = list_entry(list->next, struct inodevec, list);
-		list_del(&ivec->list);
-		while (ivec->nr > 0) {
-			ivec->nr--;
-			iput(ivec->inode[ivec->nr]);
-		}
-		kfree(ivec);
-	}
-}
-
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key)
 
@@ -1608,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	struct btrfs_root *reloc_root;
 	struct btrfs_root_item *root_item;
 	struct btrfs_path *path;
-	struct extent_buffer *leaf = NULL;
+	struct extent_buffer *leaf;
 	unsigned long nr;
 	int level;
 	int max_level;
 	int replaced = 0;
 	int ret;
 	int err = 0;
+	u32 min_reserved;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1648,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		btrfs_unlock_up_safe(path, 0);
 	}
 
-	if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
-		trans = btrfs_start_transaction(root, 1);
-
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &key, 0);
-		btrfs_release_path(reloc_root, path);
-
-		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-		if (ret < 0) {
-			err = ret;
-			goto out;
-		}
-
-		leaf = path->nodes[0];
-		btrfs_unlock_up_safe(path, 1);
-		ret = replace_file_extents(trans, rc, root, leaf,
-					   &inode_list);
-		if (ret < 0)
-			err = ret;
-		goto out;
-	}
-
-	memset(&next_key, 0, sizeof(next_key));
+	min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+	memset(&next_key, 0, sizeof(next_key));
 
 	while (1) {
-		leaf = NULL;
+		trans = btrfs_start_transaction(root, 0);
+		trans->block_rsv = rc->block_rsv;
+
+		ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
+					    min_reserved, 0);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			continue;
+		}
+
 		replaced = 0;
-		trans = btrfs_start_transaction(root, 1);
 		max_level = level;
 
 		ret = walk_down_reloc_tree(reloc_root, path, &level);
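
Note the new loop head above: -EAGAIN from btrfs_block_rsv_check() means "commit to refill the reservation, then retry", not failure. Reduced to a sketch with a mock reservation counter (numbers are illustrative):

#include <errno.h>
#include <stdio.h>

static long reserved;	/* mock metadata reservation, in bytes */

static int rsv_check(long min_reserved)
{
	return reserved >= min_reserved ? 0 : -EAGAIN;
}

static void commit_transaction(void)
{
	reserved += 65536;	/* committing frees space to refill from */
	printf("committed, reserved now %ld\n", reserved);
}

int main(void)
{
	/* mirrors nodesize * (BTRFS_MAX_LEVEL - 1) * 2 with 4 KiB nodes */
	const long min_reserved = 4096 * 7 * 2;

	for (int pass = 0; pass < 3; pass++) {
		while (rsv_check(min_reserved) == -EAGAIN)
			commit_transaction();
		reserved -= 8192;	/* one merge step consumes some */
		printf("merge step %d done\n", pass);
	}
	return 0;
}
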
@@ -1689,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		if (!find_next_key(path, level, &key) &&
 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
 			ret = 0;
-		} else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
-			ret = replace_path(trans, root, reloc_root,
-					   path, &next_key, &leaf,
-					   level, max_level);
 		} else {
-			ret = replace_path(trans, root, reloc_root,
-					   path, &next_key, NULL,
-					   level, max_level);
+			ret = replace_path(trans, root, reloc_root, path,
+					   &next_key, level, max_level);
 		}
 		if (ret < 0) {
 			err = ret;
@@ -1708,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 			btrfs_node_key_to_cpu(path->nodes[level], &key,
 					      path->slots[level]);
 			replaced = 1;
-		} else if (leaf) {
-			/*
-			 * no block got replaced, try replacing file extents
-			 */
-			btrfs_item_key_to_cpu(leaf, &key, 0);
-			ret = replace_file_extents(trans, rc, root, leaf,
-						   &inode_list);
-			btrfs_tree_unlock(leaf);
-			free_extent_buffer(leaf);
-			BUG_ON(ret < 0);
 		}
 
 		ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		root_item->drop_level = level;
 
 		nr = trans->blocks_used;
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction_throttle(trans, root);
 
 		btrfs_btree_balance_dirty(root, nr);
 
-		/*
-		 * put inodes outside transaction, otherwise we may deadlock.
-		 */
-		put_inodes(&inode_list);
-
 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
 			invalidate_extent_cache(root, &key, &next_key);
 	}
@@ -1765,87 +2110,125 @@ out:
 		       sizeof(root_item->drop_progress));
 		root_item->drop_level = 0;
 		btrfs_set_root_refs(root_item, 0);
+		btrfs_update_reloc_root(trans, root);
 	}
 
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 
 	btrfs_btree_balance_dirty(root, nr);
 
-	put_inodes(&inode_list);
-
 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
 		invalidate_extent_cache(root, &key, &next_key);
 
 	return err;
 }
 
-/*
- * callback for the work threads.
- * this function merges reloc tree with corresponding fs tree,
- * and then drops the reloc tree.
- */
-static void merge_func(struct btrfs_work *work)
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
 {
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root;
+	struct btrfs_root *root = rc->extent_root;
 	struct btrfs_root *reloc_root;
-	struct async_merge *async;
+	struct btrfs_trans_handle *trans;
+	LIST_HEAD(reloc_roots);
+	u64 num_bytes = 0;
+	int ret;
+	int retries = 0;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+	rc->merging_rsv_size += rc->nodes_relocated * 2;
+	mutex_unlock(&root->fs_info->trans_mutex);
+again:
+	if (!err) {
+		num_bytes = rc->merging_rsv_size;
+		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
+					  num_bytes, &retries);
+		if (ret)
+			err = ret;
+	}
+
+	trans = btrfs_join_transaction(rc->extent_root, 1);
+
+	if (!err) {
+		if (num_bytes != rc->merging_rsv_size) {
+			btrfs_end_transaction(trans, rc->extent_root);
+			btrfs_block_rsv_release(rc->extent_root,
+						rc->block_rsv, num_bytes);
+			retries = 0;
+			goto again;
+		}
+	}
 
-	async = container_of(work, struct async_merge, work);
-	reloc_root = async->root;
+	rc->merge_reloc_tree = 1;
+
+	while (!list_empty(&rc->reloc_roots)) {
+		reloc_root = list_entry(rc->reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del_init(&reloc_root->root_list);
 
-	if (btrfs_root_refs(&reloc_root->root_item) > 0) {
 		root = read_fs_root(reloc_root->fs_info,
 				    reloc_root->root_key.offset);
 		BUG_ON(IS_ERR(root));
 		BUG_ON(root->reloc_root != reloc_root);
 
-		merge_reloc_root(async->rc, root);
-
-		trans = btrfs_start_transaction(root, 1);
+		/*
+		 * set reference count to 1, so btrfs_recover_relocation
+		 * knows it should resume merging
+		 */
+		if (!err)
+			btrfs_set_root_refs(&reloc_root->root_item, 1);
 		btrfs_update_reloc_root(trans, root);
-		btrfs_end_transaction(trans, root);
-	}
 
-	btrfs_drop_snapshot(reloc_root, 0);
+		list_add(&reloc_root->root_list, &reloc_roots);
+	}
 
-	if (atomic_dec_and_test(async->num_pending))
-		complete(async->done);
+	list_splice(&reloc_roots, &rc->reloc_roots);
 
-	kfree(async);
+	if (!err)
+		btrfs_commit_transaction(trans, rc->extent_root);
+	else
+		btrfs_end_transaction(trans, rc->extent_root);
+	return err;
 }
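
prepare_to_merge() above reserves an estimate of merging_rsv_size, joins the transaction, and re-checks the estimate; if it grew in the meantime the reservation is released and retried. The same optimistic reserve-then-revalidate shape as a tiny standalone program (simulating one concurrent growth):

#include <stdbool.h>
#include <stdio.h>

static long merging_rsv_size = 100;

static void join_transaction(void)
{
	static bool grown;

	if (!grown) {		/* simulate one concurrent size increase */
		grown = true;
		merging_rsv_size = 140;
	}
}

int main(void)
{
	long num_bytes;

again:
	num_bytes = merging_rsv_size;		/* reserve this much */
	printf("reserved %ld\n", num_bytes);

	join_transaction();			/* size may change here */

	if (num_bytes != merging_rsv_size) {	/* stale: release and retry */
		printf("stale (need %ld), retrying\n", merging_rsv_size);
		goto again;
	}
	printf("reservation matches, proceeding\n");
	return 0;
}
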
 
-static int merge_reloc_roots(struct reloc_control *rc)
+static noinline_for_stack
+int merge_reloc_roots(struct reloc_control *rc)
 {
-	struct async_merge *async;
 	struct btrfs_root *root;
-	struct completion done;
-	atomic_t num_pending;
+	struct btrfs_root *reloc_root;
+	LIST_HEAD(reloc_roots);
+	int found = 0;
+	int ret;
+again:
+	root = rc->extent_root;
+	mutex_lock(&root->fs_info->trans_mutex);
+	list_splice_init(&rc->reloc_roots, &reloc_roots);
+	mutex_unlock(&root->fs_info->trans_mutex);
 
-	init_completion(&done);
-	atomic_set(&num_pending, 1);
+	while (!list_empty(&reloc_roots)) {
+		found = 1;
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
 
-	while (!list_empty(&rc->reloc_roots)) {
-		root = list_entry(rc->reloc_roots.next,
-				  struct btrfs_root, root_list);
-		list_del_init(&root->root_list);
+		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+			root = read_fs_root(reloc_root->fs_info,
+					    reloc_root->root_key.offset);
+			BUG_ON(IS_ERR(root));
+			BUG_ON(root->reloc_root != reloc_root);
 
-		async = kmalloc(sizeof(*async), GFP_NOFS);
-		BUG_ON(!async);
-		async->work.func = merge_func;
-		async->work.flags = 0;
-		async->rc = rc;
-		async->root = root;
-		async->done = &done;
-		async->num_pending = &num_pending;
-		atomic_inc(&num_pending);
-		btrfs_queue_worker(&rc->workers, &async->work);
+			ret = merge_reloc_root(rc, root);
+			BUG_ON(ret);
+		} else {
+			list_del_init(&reloc_root->root_list);
+		}
+		btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
 	}
 
-	if (!atomic_dec_and_test(&num_pending))
-		wait_for_completion(&done);
-
+	if (found) {
+		found = 0;
+		goto again;
+	}
 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
 	return 0;
 }
@@ -1876,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
 	return btrfs_record_root_in_trans(trans, root);
 }
 
-/*
- * select one tree from trees that references the block.
- * for blocks in reference counted trees, we prefer reloc tree.
- * if no reloc tree found and reloc_only is true, NULL is returned.
- */
-static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
-					    struct backref_node *node,
-					    struct backref_edge *edges[],
-					    int *nr, int reloc_only)
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+				     struct reloc_control *rc,
+				     struct backref_node *node,
+				     struct backref_edge *edges[], int *nr)
 {
 	struct backref_node *next;
 	struct btrfs_root *root;
-	int index;
-	int loop = 0;
-again:
-	index = 0;
+	int index = 0;
+
 	next = node;
 	while (1) {
 		cond_resched();
 		next = walk_up_backref(next, edges, &index);
 		root = next->root;
-		if (!root) {
-			BUG_ON(!node->old_root);
-			goto skip;
-		}
-
-		/* no other choice for non-reference counted tree */
-		if (!root->ref_cows) {
-			BUG_ON(reloc_only);
-			break;
-		}
+		BUG_ON(!root);
+		BUG_ON(!root->ref_cows);
 
 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
 			record_reloc_root_in_trans(trans, root);
 			break;
 		}
 
-		if (loop) {
-			btrfs_record_root_in_trans(trans, root);
+		btrfs_record_root_in_trans(trans, root);
+		root = root->reloc_root;
+
+		if (next->new_bytenr != root->node->start) {
+			BUG_ON(next->new_bytenr);
+			BUG_ON(!list_empty(&next->list));
+			next->new_bytenr = root->node->start;
+			next->root = root;
+			list_add_tail(&next->list,
+				      &rc->backref_cache.changed);
+			__mark_block_processed(rc, next);
 			break;
 		}
 
-		if (reloc_only || next != node) {
-			if (!root->reloc_root)
-				btrfs_record_root_in_trans(trans, root);
-			root = root->reloc_root;
-			/*
-			 * if the reloc tree was created in current
-			 * transaction, there is no node in backref tree
-			 * corresponds to the root of the reloc tree.
-			 */
-			if (btrfs_root_last_snapshot(&root->root_item) ==
-			    trans->transid - 1)
-				break;
-		}
-skip:
+		WARN_ON(1);
 		root = NULL;
 		next = walk_down_backref(edges, &index);
 		if (!next || next->level <= node->level)
 			break;
 	}
+	if (!root)
+		return NULL;
 
-	if (!root && !loop && !reloc_only) {
-		loop = 1;
-		goto again;
+	*nr = index;
+	next = node;
+	/* setup backref node path for btrfs_reloc_cow_block */
+	while (1) {
+		rc->backref_cache.path[next->level] = next;
+		if (--index < 0)
+			break;
+		next = edges[index]->node[UPPER];
 	}
-
-	if (root)
-		*nr = index;
-	else
-		*nr = 0;
-
 	return root;
 }
 
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted. we should use do_relocation() in this
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return -ENOENT if the block is root of reloc tree.
+ */
 static noinline_for_stack
 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
 				   struct backref_node *node)
 {
+	struct backref_node *next;
+	struct btrfs_root *root;
+	struct btrfs_root *fs_root = NULL;
 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
-	int nr;
-	return __select_one_root(trans, node, edges, &nr, 0);
+	int index = 0;
+
+	next = node;
+	while (1) {
+		cond_resched();
+		next = walk_up_backref(next, edges, &index);
+		root = next->root;
+		BUG_ON(!root);
+
+		/* no other choice for non-reference counted tree */
+		if (!root->ref_cows)
+			return root;
+
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+			fs_root = root;
+
+		if (next != node)
+			return NULL;
+
+		next = walk_down_backref(edges, &index);
+		if (!next || next->level <= node->level)
+			break;
+	}
+
+	if (!fs_root)
+		return ERR_PTR(-ENOENT);
+	return fs_root;
 }
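
Both select_reloc_root() and the new select_one_root() above rely on walk_up_backref()/walk_down_backref() to enumerate paths from a block up to its tree roots with an explicit edge stack. A toy version of that traversal over a two-root DAG (illustrative structures only, not the btrfs ones):

#include <stdio.h>

#define MAX_PARENTS 2
#define MAX_DEPTH   8

struct node {
	const char *name;
	int nr_parents;
	struct node *parent[MAX_PARENTS];
};

/* Visit every root reachable via "upper" (parent) edges, using an
 * explicit per-level edge index instead of recursion. */
static void visit_roots(struct node *start)
{
	struct node *path[MAX_DEPTH];
	int slot[MAX_DEPTH];		/* next parent edge to try per level */
	int depth = 0;

	path[0] = start;
	slot[0] = 0;

	while (depth >= 0) {
		struct node *cur = path[depth];

		if (cur->nr_parents == 0)
			printf("root: %s\n", cur->name);

		if (slot[depth] < cur->nr_parents) {
			/* walk up the next unexplored edge */
			path[depth + 1] = cur->parent[slot[depth]++];
			slot[depth + 1] = 0;
			depth++;
		} else {
			depth--;	/* all edges done: walk back down */
		}
	}
}

int main(void)
{
	struct node root_a = { "fs tree",    0, { 0 } };
	struct node root_b = { "reloc tree", 0, { 0 } };
	struct node mid    = { "mid",  2, { &root_a, &root_b } };
	struct node leaf   = { "leaf", 1, { &mid } };

	visit_roots(&leaf);
	return 0;
}
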
 
 static noinline_for_stack
-struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
-				     struct backref_node *node,
-				     struct backref_edge *edges[], int *nr)
+u64 calcu_metadata_size(struct reloc_control *rc,
+			struct backref_node *node, int reserve)
 {
-	return __select_one_root(trans, node, edges, nr, 1);
+	struct backref_node *next = node;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	u64 num_bytes = 0;
+	int index = 0;
+
+	BUG_ON(reserve && node->processed);
+
+	while (next) {
+		cond_resched();
+		while (1) {
+			if (next->processed && (reserve || next != node))
+				break;
+
+			num_bytes += btrfs_level_size(rc->extent_root,
+						      next->level);
+
+			if (list_empty(&next->upper))
+				break;
+
+			edge = list_entry(next->upper.next,
+					  struct backref_edge, list[LOWER]);
+			edges[index++] = edge;
+			next = edge->node[UPPER];
+		}
+		next = walk_down_backref(edges, &index);
+	}
+	return num_bytes;
 }
 
-static void grab_path_buffers(struct btrfs_path *path,
-			      struct backref_node *node,
-			      struct backref_edge *edges[], int nr)
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+				  struct reloc_control *rc,
+				  struct backref_node *node)
 {
-	int i = 0;
-	while (1) {
-		drop_node_buffer(node);
-		node->eb = path->nodes[node->level];
-		BUG_ON(!node->eb);
-		if (path->locks[node->level])
-			node->locked = 1;
-		path->nodes[node->level] = NULL;
-		path->locks[node->level] = 0;
-
-		if (i >= nr)
-			break;
+	struct btrfs_root *root = rc->extent_root;
+	u64 num_bytes;
+	int ret;
+
+	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
-		edges[i]->blockptr = node->eb->start;
-		node = edges[i]->node[UPPER];
-		i++;
+	trans->block_rsv = rc->block_rsv;
+	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
+				  &rc->block_rsv_retries);
+	if (ret) {
+		if (ret == -EAGAIN)
+			rc->commit_transaction = 1;
+		return ret;
 	}
+
+	rc->block_rsv_retries = 0;
+	return 0;
+}
+
+static void release_metadata_space(struct reloc_control *rc,
+				   struct backref_node *node)
+{
+	u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
+	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
 }
 
 /*
@@ -1999,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
  * in that case this function just updates pointers.
  */
 static int do_relocation(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
 			 struct backref_node *node,
 			 struct btrfs_key *key,
 			 struct btrfs_path *path, int lowest)
@@ -2019,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 	BUG_ON(lowest && node->eb);
 
 	path->lowest_level = node->level + 1;
+	rc->backref_cache.path[node->level] = node;
 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
 		cond_resched();
-		if (node->eb && node->eb->start == edge->blockptr)
-			continue;
 
 		upper = edge->node[UPPER];
-		root = select_reloc_root(trans, upper, edges, &nr);
-		if (!root)
-			continue;
-
-		if (upper->eb && !upper->locked)
+		root = select_reloc_root(trans, rc, upper, edges, &nr);
+		BUG_ON(!root);
+
+		if (upper->eb && !upper->locked) {
+			if (!lowest) {
+				ret = btrfs_bin_search(upper->eb, key,
+						       upper->level, &slot);
+				BUG_ON(ret);
+				bytenr = btrfs_node_blockptr(upper->eb, slot);
+				if (node->eb->start == bytenr)
+					goto next;
+			}
 			drop_node_buffer(upper);
+		}
 
 		if (!upper->eb) {
 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 			}
 			BUG_ON(ret > 0);
 
-			slot = path->slots[upper->level];
+			if (!upper->eb) {
+				upper->eb = path->nodes[upper->level];
+				path->nodes[upper->level] = NULL;
+			} else {
+				BUG_ON(upper->eb != path->nodes[upper->level]);
+			}
 
-			btrfs_unlock_up_safe(path, upper->level + 1);
-			grab_path_buffers(path, upper, edges, nr);
+			upper->locked = 1;
+			path->locks[upper->level] = 0;
 
+			slot = path->slots[upper->level];
 			btrfs_release_path(NULL, path);
 		} else {
 			ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		}
 
 		bytenr = btrfs_node_blockptr(upper->eb, slot);
-		if (!lowest) {
-			if (node->eb->start == bytenr) {
-				btrfs_tree_unlock(upper->eb);
-				upper->locked = 0;
-				continue;
-			}
+		if (lowest) {
+			BUG_ON(bytenr != node->bytenr);
 		} else {
-			BUG_ON(node->bytenr != bytenr);
+			if (node->eb->start == bytenr)
+				goto next;
 		}
 
 		blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		if (!node->eb) {
 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
 					      slot, &eb);
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
 			if (ret < 0) {
 				err = ret;
-				break;
+				goto next;
 			}
-			btrfs_set_lock_blocking(eb);
-			node->eb = eb;
-			node->locked = 1;
+			BUG_ON(node->eb != eb);
 		} else {
 			btrfs_set_node_blockptr(upper->eb, slot,
 						node->eb->start);
@@ -2096,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
 			BUG_ON(ret);
 		}
-		if (!lowest) {
-			btrfs_tree_unlock(upper->eb);
-			upper->locked = 0;
-		}
+next:
+		if (!upper->pending)
+			drop_node_buffer(upper);
+		else
+			unlock_node_buffer(upper);
+		if (err)
+			break;
 	}
+
+	if (!err && node->pending) {
+		drop_node_buffer(node);
+		list_move_tail(&node->list, &rc->backref_cache.changed);
+		node->pending = 0;
+	}
+
 	path->lowest_level = 0;
+	BUG_ON(err == -ENOSPC);
 	return err;
 }
 
 static int link_to_upper(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
 			 struct backref_node *node,
 			 struct btrfs_path *path)
 {
 	struct btrfs_key key;
-	if (!node->eb || list_empty(&node->upper))
-		return 0;
 
 	btrfs_node_key_to_cpu(node->eb, &key, 0);
-	return do_relocation(trans, node, &key, path, 0);
+	return do_relocation(trans, rc, node, &key, path, 0);
 }
 
 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
-				struct backref_cache *cache,
-				struct btrfs_path *path)
+				struct reloc_control *rc,
+				struct btrfs_path *path, int err)
 {
+	LIST_HEAD(list);
+	struct backref_cache *cache = &rc->backref_cache;
 	struct backref_node *node;
 	int level;
 	int ret;
-	int err = 0;
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 		while (!list_empty(&cache->pending[level])) {
 			node = list_entry(cache->pending[level].next,
-					  struct backref_node, lower);
-			BUG_ON(node->level != level);
+					  struct backref_node, list);
+			list_move_tail(&node->list, &list);
+			BUG_ON(!node->pending);
 
-			ret = link_to_upper(trans, node, path);
-			if (ret < 0)
-				err = ret;
-			/*
-			 * this removes the node from the pending list and
-			 * may add some other nodes to the level + 1
-			 * pending list
-			 */
-			remove_backref_node(cache, node);
+			if (!err) {
+				ret = link_to_upper(trans, rc, node, path);
+				if (ret < 0)
+					err = ret;
+			}
 		}
+		list_splice_init(&list, &cache->pending[level]);
 	}
-	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
 	return err;
 }
 
 static void mark_block_processed(struct reloc_control *rc,
-				 struct backref_node *node)
+				 u64 bytenr, u32 blocksize)
+{
+	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+			EXTENT_DIRTY, GFP_NOFS);
+}
+
+static void __mark_block_processed(struct reloc_control *rc,
+				   struct backref_node *node)
 {
 	u32 blocksize;
 	if (node->level == 0 ||
 	    in_block_group(node->bytenr, rc->block_group)) {
 		blocksize = btrfs_level_size(rc->extent_root, node->level);
-		set_extent_bits(&rc->processed_blocks, node->bytenr,
-				node->bytenr + blocksize - 1, EXTENT_DIRTY,
-				GFP_NOFS);
+		mark_block_processed(rc, node->bytenr, blocksize);
 	}
 	node->processed = 1;
 }
@@ -2179,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
 		if (next->processed)
 			break;
 
-		mark_block_processed(rc, next);
+		__mark_block_processed(rc, next);
 
 		if (list_empty(&next->upper))
 			break;
@@ -2202,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
 	return 0;
 }
 
-/*
- * check if there are any file extent pointers in the leaf point to
- * data that requires processing
- */
-static int check_file_extents(struct reloc_control *rc,
-			      u64 bytenr, u32 blocksize, u64 ptr_gen)
-{
-	struct btrfs_key found_key;
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *leaf;
-	u32 nritems;
-	int i;
-	int ret = 0;
-
-	leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
-
-	nritems = btrfs_header_nritems(leaf);
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		btrfs_item_key_to_cpu(leaf, &found_key, i);
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			continue;
-		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-		if (btrfs_file_extent_type(leaf, fi) ==
-		    BTRFS_FILE_EXTENT_INLINE)
-			continue;
-		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-		if (bytenr == 0)
-			continue;
-		if (in_block_group(bytenr, rc->block_group)) {
-			ret = 1;
-			break;
-		}
-	}
-	free_extent_buffer(leaf);
-	return ret;
-}
-
-/*
- * scan child blocks of a given block to find blocks that require processing
- */
-static int add_child_blocks(struct btrfs_trans_handle *trans,
-			    struct reloc_control *rc,
-			    struct backref_node *node,
-			    struct rb_root *blocks)
-{
-	struct tree_block *block;
-	struct rb_node *rb_node;
-	u64 bytenr;
-	u64 ptr_gen;
-	u32 blocksize;
-	u32 nritems;
-	int i;
-	int err = 0;
-
-	nritems = btrfs_header_nritems(node->eb);
-	blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		bytenr = btrfs_node_blockptr(node->eb, i);
-		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
-		if (ptr_gen == trans->transid)
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
-			continue;
-		if (tree_block_processed(bytenr, blocksize, rc))
-			continue;
-
-		readahead_tree_block(rc->extent_root,
-				     bytenr, blocksize, ptr_gen);
-	}
-
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		bytenr = btrfs_node_blockptr(node->eb, i);
-		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
-		if (ptr_gen == trans->transid)
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
-			continue;
-		if (tree_block_processed(bytenr, blocksize, rc))
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
-			continue;
-
-		block = kmalloc(sizeof(*block), GFP_NOFS);
-		if (!block) {
-			err = -ENOMEM;
-			break;
-		}
-		block->bytenr = bytenr;
-		btrfs_node_key_to_cpu(node->eb, &block->key, i);
-		block->level = node->level - 1;
-		block->key_ready = 1;
-		rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
-		BUG_ON(rb_node);
-	}
-	if (err)
-		free_block_list(blocks);
-	return err;
-}
-
-/*
- * find adjacent blocks that require processing
- */
-static noinline_for_stack
-int add_adjacent_blocks(struct btrfs_trans_handle *trans,
-			struct reloc_control *rc,
-			struct backref_cache *cache,
-			struct rb_root *blocks, int level,
-			struct backref_node **upper)
-{
-	struct backref_node *node;
-	int ret = 0;
-
-	WARN_ON(!list_empty(&cache->pending[level]));
-
-	if (list_empty(&cache->pending[level + 1]))
-		return 1;
-
-	node = list_entry(cache->pending[level + 1].next,
-			  struct backref_node, lower);
-	if (node->eb)
-		ret = add_child_blocks(trans, rc, node, blocks);
-
-	*upper = node;
-	return ret;
-}
-
 static int get_tree_block_key(struct reloc_control *rc,
 			      struct tree_block *block)
 {
@@ -2371,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
 			       struct btrfs_path *path)
 {
 	struct btrfs_root *root;
-	int ret;
+	int release = 0;
+	int ret = 0;
 
+	if (!node)
+		return 0;
+
+	BUG_ON(node->processed);
 	root = select_one_root(trans, node);
-	if (unlikely(!root)) {
-		rc->found_old_snapshot = 1;
+	if (root == ERR_PTR(-ENOENT)) {
 		update_processed_blocks(rc, node);
-		return 0;
+		goto out;
 	}
 
-	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		ret = do_relocation(trans, node, key, path, 1);
-		if (ret < 0)
-			goto out;
-		if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
-			ret = replace_file_extents(trans, rc, root,
-						   node->eb, NULL);
-			if (ret < 0)
-				goto out;
-		}
-		drop_node_buffer(node);
-	} else if (!root->ref_cows) {
-		path->lowest_level = node->level;
-		ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-		btrfs_release_path(root, path);
-		if (ret < 0)
+	if (!root || root->ref_cows) {
+		ret = reserve_metadata_space(trans, rc, node);
+		if (ret)
 			goto out;
-	} else if (root != node->root) {
-		WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+		release = 1;
 	}
 
-	update_processed_blocks(rc, node);
-	ret = 0;
+	if (root) {
+		if (root->ref_cows) {
+			BUG_ON(node->new_bytenr);
+			BUG_ON(!list_empty(&node->list));
+			btrfs_record_root_in_trans(trans, root);
+			root = root->reloc_root;
+			node->new_bytenr = root->node->start;
+			node->root = root;
+			list_add_tail(&node->list, &rc->backref_cache.changed);
+		} else {
+			path->lowest_level = node->level;
+			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+			btrfs_release_path(root, path);
+			if (ret > 0)
+				ret = 0;
+		}
+		if (!ret)
+			update_processed_blocks(rc, node);
+	} else {
+		ret = do_relocation(trans, rc, node, key, path, 1);
+	}
 out:
-	drop_node_buffer(node);
+	if (ret || node->level == 0 || node->cowonly) {
+		if (release)
+			release_metadata_space(rc, node);
+		remove_backref_node(&rc->backref_cache, node);
+	}
 	return ret;
 }
@@ -2415,12 +2753,10 @@ static noinline_for_stack
 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 			 struct reloc_control *rc, struct rb_root *blocks)
 {
-	struct backref_cache *cache;
 	struct backref_node *node;
 	struct btrfs_path *path;
 	struct tree_block *block;
 	struct rb_node *rb_node;
-	int level = -1;
 	int ret;
 	int err = 0;
 
@@ -2428,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	cache = kmalloc(sizeof(*cache), GFP_NOFS);
-	if (!cache) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
-
-	backref_cache_init(cache);
-
 	rb_node = rb_first(blocks);
 	while (rb_node) {
 		block = rb_entry(rb_node, struct tree_block, rb_node);
-		if (level == -1)
-			level = block->level;
-		else
-			BUG_ON(level != block->level);
 		if (!block->key_ready)
 			reada_tree_block(rc, block);
 		rb_node = rb_next(rb_node);
@@ -2460,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 	while (rb_node) {
 		block = rb_entry(rb_node, struct tree_block, rb_node);
 
-		node = build_backref_tree(rc, cache, &block->key,
+		node = build_backref_tree(rc, &block->key,
 					  block->level, block->bytenr);
 		if (IS_ERR(node)) {
 			err = PTR_ERR(node);
@@ -2470,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 		ret = relocate_tree_block(trans, rc, node, &block->key,
 					  path);
 		if (ret < 0) {
-			err = ret;
+			if (ret != -EAGAIN || rb_node == rb_first(blocks))
+				err = ret;
 			goto out;
 		}
-		remove_backref_node(cache, node);
 		rb_node = rb_next(rb_node);
 	}
-
-	if (level > 0)
-		goto out;
-
+out:
 	free_block_list(blocks);
+	err = finish_pending_nodes(trans, rc, path, err);
 
-	/*
-	 * now backrefs of some upper level tree blocks have been cached,
-	 * try relocating blocks referenced by these upper level blocks.
-	 */
-	while (1) {
-		struct backref_node *upper = NULL;
-		if (trans->transaction->in_commit ||
-		    trans->transaction->delayed_refs.flushing)
-			break;
-
-		ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
-					  &upper);
-		if (ret < 0)
-			err = ret;
-		if (ret != 0)
-			break;
-
-		rb_node = rb_first(blocks);
-		while (rb_node) {
-			block = rb_entry(rb_node, struct tree_block, rb_node);
-			if (trans->transaction->in_commit ||
-			    trans->transaction->delayed_refs.flushing)
-				goto out;
-			BUG_ON(!block->key_ready);
-			node = build_backref_tree(rc, cache, &block->key,
-						  level, block->bytenr);
-			if (IS_ERR(node)) {
-				err = PTR_ERR(node);
-				goto out;
-			}
-
-			ret = relocate_tree_block(trans, rc, node,
-						  &block->key, path);
-			if (ret < 0) {
-				err = ret;
-				goto out;
-			}
-			remove_backref_node(cache, node);
-			rb_node = rb_next(rb_node);
-		}
-		free_block_list(blocks);
-
-		if (upper) {
-			ret = link_to_upper(trans, upper, path);
-			if (ret < 0) {
-				err = ret;
-				break;
-			}
-			remove_backref_node(cache, upper);
-		}
-	}
-out:
-	free_block_list(blocks);
-
-	ret = finish_pending_nodes(trans, cache, path);
-	if (ret < 0)
-		err = ret;
-
-	kfree(cache);
-	btrfs_free_path(path);
-	return err;
+	btrfs_free_path(path);
+	return err;
+}
+
+static noinline_for_stack
+int prealloc_file_extent_cluster(struct inode *inode,
+				 struct file_extent_cluster *cluster)
+{
+	u64 alloc_hint = 0;
+	u64 start;
+	u64 end;
+	u64 offset = BTRFS_I(inode)->index_cnt;
+	u64 num_bytes;
+	int nr = 0;
+	int ret = 0;
+
+	BUG_ON(cluster->start != cluster->boundary[0]);
+	mutex_lock(&inode->i_mutex);
+
+	ret = btrfs_check_data_free_space(inode, cluster->end +
+					  1 - cluster->start);
+	if (ret)
+		goto out;
+
+	while (nr < cluster->nr) {
+		start = cluster->boundary[nr] - offset;
+		if (nr + 1 < cluster->nr)
+			end = cluster->boundary[nr + 1] - 1 - offset;
+		else
+			end = cluster->end - offset;
+
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		num_bytes = end + 1 - start;
+		ret = btrfs_prealloc_file_range(inode, 0, start,
+						num_bytes, num_bytes,
+						end + 1, &alloc_hint);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		if (ret)
+			break;
+		nr++;
+	}
+	btrfs_free_reserved_data_space(inode, cluster->end +
+				       1 - cluster->start);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
 }
 
 static noinline_for_stack
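
prealloc_file_extent_cluster() above slices the cluster at each boundary[]: a range runs from one boundary to just before the next, the last one to cluster->end. The index arithmetic on its own, with sample numbers:

#include <stdio.h>

int main(void)
{
	/* sample boundaries, already offset-adjusted; the inclusive-end
	 * math matches the end + 1 - start byte counts above */
	unsigned long long boundary[] = { 0, 4096, 12288 };
	unsigned long long cluster_end = 20479;
	int nr = sizeof(boundary) / sizeof(boundary[0]);

	for (int i = 0; i < nr; i++) {
		unsigned long long start = boundary[i];
		unsigned long long end = (i + 1 < nr)
			? boundary[i + 1] - 1 : cluster_end;

		printf("prealloc [%llu, %llu], %llu bytes\n",
		       start, end, end + 1 - start);
	}
	return 0;
}
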
@@ -2588,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	u64 offset = BTRFS_I(inode)->index_cnt;
 	unsigned long index;
 	unsigned long last_index;
-	unsigned int dirty_page = 0;
 	struct page *page;
 	struct file_ra_state *ra;
 	int nr = 0;
@@ -2601,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	if (!ra)
 		return -ENOMEM;
 
-	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
-	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+	ret = prealloc_file_extent_cluster(inode, cluster);
+	if (ret)
+		goto out;
 
-	mutex_lock(&inode->i_mutex);
+	file_ra_state_init(ra, inode->i_mapping);
 
-	i_size_write(inode, cluster->end + 1 - offset);
 	ret = setup_extent_mapping(inode, cluster->start - offset,
 				   cluster->end - offset, cluster->start);
 	if (ret)
-		goto out_unlock;
-
-	file_ra_state_init(ra, inode->i_mapping);
+		goto out;
 
-	WARN_ON(cluster->start != cluster->boundary[0]);
+	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
 	while (index <= last_index) {
+		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto out;
+
 		page = find_lock_page(inode->i_mapping, index);
 		if (!page) {
 			page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
 						  last_index + 1 - index);
 			page = grab_cache_page(inode->i_mapping, index);
 			if (!page) {
+				btrfs_delalloc_release_metadata(inode,
+							PAGE_CACHE_SIZE);
 				ret = -ENOMEM;
-				goto out_unlock;
+				goto out;
 			}
 		}
 
@@ -2640,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				btrfs_delalloc_release_metadata(inode,
+							PAGE_CACHE_SIZE);
 				ret = -EIO;
-				goto out_unlock;
+				goto out;
 			}
 		}
 
@@ -2660,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
 					EXTENT_BOUNDARY, GFP_NOFS);
 			nr++;
 		}
-		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
 
+		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
 		set_page_dirty(page);
-		dirty_page++;
 
 		unlock_extent(&BTRFS_I(inode)->io_tree,
 			      page_start, page_end, GFP_NOFS);
@@ -2671,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
 		page_cache_release(page);
 
 		index++;
-		if (nr < cluster->nr &&
-		    page_end + 1 + offset == cluster->boundary[nr]) {
-			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-							   dirty_page);
-			dirty_page = 0;
-		}
-	}
-	if (dirty_page) {
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-						   dirty_page);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		btrfs_throttle(BTRFS_I(inode)->root);
 	}
 	WARN_ON(nr != cluster->nr);
-out_unlock:
-	mutex_unlock(&inode->i_mutex);
+out:
 	kfree(ra);
 	return ret;
 }
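
The reworked page loop holds one metadata reservation per page: taken before the page is grabbed, handed off to the dirty page on success, and explicitly released on the -ENOMEM/-EIO exits. The invariant as a sketch (mock reserve/release, hypothetical names):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static long reserved;

static int  reserve_page(void) { reserved++; return 0; }
static void release_page(void) { reserved--; }
static bool get_page(int index) { return index != 5; }	/* page 5 "fails" */

static int process_pages(int nr_pages)
{
	for (int index = 0; index < nr_pages; index++) {
		if (reserve_page())
			return -ENOSPC;
		if (!get_page(index)) {
			release_page();	/* error path must give it back */
			return -ENOMEM;
		}
		/* ...dirty the page; the reservation is now "owned" by
		 * the dirty page and released at writeback time... */
	}
	return 0;
}

int main(void)
{
	int ret = process_pages(10);

	printf("ret=%d, outstanding reservations=%ld\n", ret, reserved);
	return 0;
}
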
@@ -2870,9 +3173,6 @@ out:
 static int block_use_full_backref(struct reloc_control *rc,
 				  struct extent_buffer *eb)
 {
-	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
-	struct btrfs_key key;
 	u64 flags;
 	int ret;
 
@@ -2880,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
 		return 1;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
-	key.objectid = eb->start;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = eb->len;
-
-	path->search_commit_root = 1;
-	path->skip_locking = 1;
-	ret = btrfs_search_slot(NULL, rc->extent_root,
-				&key, path, 0, 0);
+	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+				       eb->start, eb->len, NULL, &flags);
 	BUG_ON(ret);
 
-	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
-			    struct btrfs_extent_item);
-	flags = btrfs_extent_flags(path->nodes[0], ei);
-	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
 		ret = 1;
 	else
 		ret = 0;
-	btrfs_free_path(path);
 	return ret;
 }
 
@@ -3074,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
 	struct btrfs_extent_inline_ref *iref;
 	unsigned long ptr;
 	unsigned long end;
-	u32 blocksize;
+	u32 blocksize = btrfs_level_size(rc->extent_root, 0);
 	int ret;
 	int err = 0;
 
-	ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
-			       extent_key->offset);
-	BUG_ON(ret < 0);
-	if (ret > 0) {
-		/* the relocated data is fragmented */
-		rc->extents_skipped++;
-		btrfs_release_path(rc->extent_root, path);
-		return 0;
-	}
-
-	blocksize = btrfs_level_size(rc->extent_root, 0);
-
 	eb = path->nodes[0];
 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
  */
 static noinline_for_stack
 int find_next_extent(struct btrfs_trans_handle *trans,
-		     struct reloc_control *rc, struct btrfs_path *path)
+		     struct reloc_control *rc, struct btrfs_path *path,
+		     struct btrfs_key *extent_key)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -3225,6 +3500,7 @@ next:
 			rc->search_start = end + 1;
 		} else {
 			rc->search_start = key.objectid + key.offset;
+			memcpy(extent_key, &key, sizeof(key));
 			return 0;
 		}
 	}
@@ -3262,12 +3538,49 @@ static int check_extent_flags(u64 flags)
 	return 0;
 }
 
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+	if (!rc->block_rsv)
+		return -ENOMEM;
+
+	/*
+	 * reserve some space for creating reloc trees.
+	 * btrfs_init_reloc_root will use them when there
+	 * is no reservation in transaction handle.
+	 */
+	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+				  rc->extent_root->nodesize * 256,
+				  &rc->block_rsv_retries);
+	if (ret)
+		return ret;
+
+	rc->block_rsv->refill_used = 1;
+	btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
+
+	memset(&rc->cluster, 0, sizeof(rc->cluster));
+	rc->search_start = rc->block_group->key.objectid;
+	rc->extents_found = 0;
+	rc->nodes_relocated = 0;
+	rc->merging_rsv_size = 0;
+	rc->block_rsv_retries = 0;
+
+	rc->create_reloc_tree = 1;
+	set_reloc_control(rc);
+
+	trans = btrfs_join_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+	return 0;
+}
 
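
For a sense of scale of the two reservations above (assuming the 4 KiB nodesize that was the mkfs default at the time; nodesize is per-filesystem, and BTRFS_MAX_LEVEL is 8):

#include <stdio.h>

int main(void)
{
	const unsigned long nodesize = 4096;	/* assumed; per-fs in reality */
	const unsigned long max_level = 8;	/* BTRFS_MAX_LEVEL */

	/* merge_reloc_root(): one full tree path, COWed twice */
	printf("min_reserved = %lu bytes\n",
	       nodesize * (max_level - 1) * 2);		/* 57344 */

	/* prepare_to_relocate(): up-front pool for creating reloc roots */
	printf("initial reserve = %lu bytes\n",
	       nodesize * 256);				/* 1 MiB */
	return 0;
}
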
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
 	struct rb_root blocks = RB_ROOT;
 	struct btrfs_key key;
-	struct file_extent_cluster *cluster;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_path *path;
 	struct btrfs_extent_item *ei;
@@ -3277,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	int ret;
 	int err = 0;
 
-	cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
-	if (!cluster)
-		return -ENOMEM;
-
 	path = btrfs_alloc_path();
-	if (!path) {
-		kfree(cluster);
+	if (!path)
 		return -ENOMEM;
-	}
-
-	rc->extents_found = 0;
-	rc->extents_skipped = 0;
-
-	rc->search_start = rc->block_group->key.objectid;
-	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
-			  GFP_NOFS);
-
-	rc->create_reloc_root = 1;
-	set_reloc_control(rc);
 
-	trans = btrfs_start_transaction(rc->extent_root, 1);
-	btrfs_commit_transaction(trans, rc->extent_root);
+	ret = prepare_to_relocate(rc);
+	if (ret) {
+		err = ret;
+		goto out_free;
+	}
 
 	while (1) {
-		trans = btrfs_start_transaction(rc->extent_root, 1);
+		trans = btrfs_start_transaction(rc->extent_root, 0);
+
+		if (update_backref_cache(trans, &rc->backref_cache)) {
+			btrfs_end_transaction(trans, rc->extent_root);
+			continue;
+		}
 
-		ret = find_next_extent(trans, rc, path);
+		ret = find_next_extent(trans, rc, path, &key);
 		if (ret < 0)
 			err = ret;
 		if (ret != 0)
@@ -3313,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				    struct btrfs_extent_item);
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		item_size = btrfs_item_size_nr(path->nodes[0],
-					       path->slots[0]);
+		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 		if (item_size >= sizeof(*ei)) {
 			flags = btrfs_extent_flags(path->nodes[0], ei);
 			ret = check_extent_flags(flags);
@@ -3356,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3356 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3659 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357 ret = add_tree_block(rc, &key, path, &blocks); 3660 ret = add_tree_block(rc, &key, path, &blocks);
3358 } else if (rc->stage == UPDATE_DATA_PTRS && 3661 } else if (rc->stage == UPDATE_DATA_PTRS &&
3359 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3662 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360 ret = add_data_references(rc, &key, path, &blocks); 3663 ret = add_data_references(rc, &key, path, &blocks);
3361 } else { 3664 } else {
3362 btrfs_release_path(rc->extent_root, path); 3665 btrfs_release_path(rc->extent_root, path);
3363 ret = 0; 3666 ret = 0;
3364 } 3667 }
3365 if (ret < 0) { 3668 if (ret < 0) {
3366 err = 0; 3669 err = ret;
3367 break; 3670 break;
3368 } 3671 }
3369 3672
3370 if (!RB_EMPTY_ROOT(&blocks)) { 3673 if (!RB_EMPTY_ROOT(&blocks)) {
3371 ret = relocate_tree_blocks(trans, rc, &blocks); 3674 ret = relocate_tree_blocks(trans, rc, &blocks);
3372 if (ret < 0) { 3675 if (ret < 0) {
3676 if (ret != -EAGAIN) {
3677 err = ret;
3678 break;
3679 }
3680 rc->extents_found--;
3681 rc->search_start = key.objectid;
3682 }
3683 }
3684
3685 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3686 rc->block_rsv, 0, 5);
3687 if (ret < 0) {
3688 if (ret != -EAGAIN) {
3373 err = ret; 3689 err = ret;
3690 WARN_ON(1);
3374 break; 3691 break;
3375 } 3692 }
3693 rc->commit_transaction = 1;
3376 } 3694 }
3377 3695
3378 nr = trans->blocks_used; 3696 if (rc->commit_transaction) {
3379 btrfs_end_transaction(trans, rc->extent_root); 3697 rc->commit_transaction = 0;
3698 ret = btrfs_commit_transaction(trans, rc->extent_root);
3699 BUG_ON(ret);
3700 } else {
3701 nr = trans->blocks_used;
3702 btrfs_end_transaction_throttle(trans, rc->extent_root);
3703 btrfs_btree_balance_dirty(rc->extent_root, nr);
3704 }
3380 trans = NULL; 3705 trans = NULL;
3381 btrfs_btree_balance_dirty(rc->extent_root, nr);
3382 3706
3383 if (rc->stage == MOVE_DATA_EXTENTS && 3707 if (rc->stage == MOVE_DATA_EXTENTS &&
3384 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3708 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385 rc->found_file_extent = 1; 3709 rc->found_file_extent = 1;
3386 ret = relocate_data_extent(rc->data_inode, 3710 ret = relocate_data_extent(rc->data_inode,
3387 &key, cluster); 3711 &key, &rc->cluster);
3388 if (ret < 0) { 3712 if (ret < 0) {
3389 err = ret; 3713 err = ret;
3390 break; 3714 break;
3391 } 3715 }
3392 } 3716 }
3393 } 3717 }
3394 btrfs_free_path(path); 3718
3719 btrfs_release_path(rc->extent_root, path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS);
3395 3722
3396 if (trans) { 3723 if (trans) {
3397 nr = trans->blocks_used; 3724 nr = trans->blocks_used;
3398 btrfs_end_transaction(trans, rc->extent_root); 3725 btrfs_end_transaction_throttle(trans, rc->extent_root);
3399 btrfs_btree_balance_dirty(rc->extent_root, nr); 3726 btrfs_btree_balance_dirty(rc->extent_root, nr);
3400 } 3727 }
3401 3728
3402 if (!err) { 3729 if (!err) {
3403 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3730 ret = relocate_file_extent_cluster(rc->data_inode,
3731 &rc->cluster);
3404 if (ret < 0) 3732 if (ret < 0)
3405 err = ret; 3733 err = ret;
3406 } 3734 }
3407 3735
3408 kfree(cluster); 3736 rc->create_reloc_tree = 0;
3737 set_reloc_control(rc);
3409 3738
3410 rc->create_reloc_root = 0; 3739 backref_cache_cleanup(&rc->backref_cache);
3411 smp_mb(); 3740 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3412 3741
3413 if (rc->extents_found > 0) { 3742 err = prepare_to_merge(rc, err);
3414 trans = btrfs_start_transaction(rc->extent_root, 1);
3415 btrfs_commit_transaction(trans, rc->extent_root);
3416 }
3417 3743
3418 merge_reloc_roots(rc); 3744 merge_reloc_roots(rc);
3419 3745
3746 rc->merge_reloc_tree = 0;
3420 unset_reloc_control(rc); 3747 unset_reloc_control(rc);
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3421 3749
3422 /* get rid of pinned extents */ 3750 /* get rid of pinned extents */
3423 trans = btrfs_start_transaction(rc->extent_root, 1); 3751 trans = btrfs_join_transaction(rc->extent_root, 1);
3424 btrfs_commit_transaction(trans, rc->extent_root); 3752 btrfs_commit_transaction(trans, rc->extent_root);
3425 3753out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path);
3426 return err; 3756 return err;
3427} 3757}
3428 3758
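
The reworked loop above no longer ends every pass the same way: it calls btrfs_block_rsv_check() each iteration, treats -EAGAIN as "low on metadata space", and then either does a full btrfs_commit_transaction() (which frees pinned space) or the cheap btrfs_end_transaction_throttle() path. A minimal userspace model of that control flow — reserve_check(), commit_tx(), and end_tx() are hypothetical stand-ins, not btrfs APIs:

    #include <errno.h>
    #include <stdio.h>

    /* pretend reservation check: 0 = ok, -EAGAIN = pressure, other < 0 = hard error */
    static int reserve_check(int pass) { return pass == 2 ? -EAGAIN : 0; }
    static void commit_tx(void) { puts("commit"); }
    static void end_tx(void)    { puts("end");    }

    int main(void)
    {
        int commit_pending = 0;

        for (int pass = 0; pass < 4; pass++) {
            int ret = reserve_check(pass);
            if (ret < 0 && ret != -EAGAIN)
                return 1;            /* hard error: bail out of the loop */
            if (ret == -EAGAIN)
                commit_pending = 1;  /* low on space: schedule a commit */

            if (commit_pending) {
                commit_pending = 0;
                commit_tx();         /* full commit frees pinned space */
            } else {
                end_tx();            /* cheap throttled end of transaction */
            }
        }
        return 0;
    }

The one-shot flag mirrors rc->commit_transaction: the expensive commit happens only on the pass after pressure was detected, not on every iteration.
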
@@ -3448,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3448 btrfs_set_inode_generation(leaf, item, 1); 3778 btrfs_set_inode_generation(leaf, item, 1);
3449 btrfs_set_inode_size(leaf, item, 0); 3779 btrfs_set_inode_size(leaf, item, 0);
3450 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3780 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC);
3452 btrfs_mark_buffer_dirty(leaf); 3783 btrfs_mark_buffer_dirty(leaf);
3453 btrfs_release_path(root, path); 3784 btrfs_release_path(root, path);
3454out: 3785out:
@@ -3460,8 +3791,9 @@ out:
3460 * helper to create inode for data relocation. 3791 * helper to create inode for data relocation.
3461 * the inode is in data relocation tree and its link count is 0 3792 * the inode is in data relocation tree and its link count is 0
3462 */ 3793 */
3463static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3794static noinline_for_stack
3464 struct btrfs_block_group_cache *group) 3795struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3796 struct btrfs_block_group_cache *group)
3465{ 3797{
3466 struct inode *inode = NULL; 3798 struct inode *inode = NULL;
3467 struct btrfs_trans_handle *trans; 3799 struct btrfs_trans_handle *trans;
@@ -3475,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3475 if (IS_ERR(root)) 3807 if (IS_ERR(root))
3476 return ERR_CAST(root); 3808 return ERR_CAST(root);
3477 3809
3478 trans = btrfs_start_transaction(root, 1); 3810 trans = btrfs_start_transaction(root, 6);
3479 BUG_ON(!trans); 3811 if (IS_ERR(trans))
3812 return ERR_CAST(trans);
3480 3813
3481 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482 if (err) 3815 if (err)
@@ -3496,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3496out: 3829out:
3497 nr = trans->blocks_used; 3830 nr = trans->blocks_used;
3498 btrfs_end_transaction(trans, root); 3831 btrfs_end_transaction(trans, root);
3499
3500 btrfs_btree_balance_dirty(root, nr); 3832 btrfs_btree_balance_dirty(root, nr);
3501 if (err) { 3833 if (err) {
3502 if (inode) 3834 if (inode)
@@ -3506,6 +3838,21 @@ out:
3506 return inode; 3838 return inode;
3507} 3839}
3508 3840
3841static struct reloc_control *alloc_reloc_control(void)
3842{
3843 struct reloc_control *rc;
3844
3845 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3846 if (!rc)
3847 return NULL;
3848
3849 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3853 return rc;
3854}
3855
3509/* 3856/*
3510 * function to relocate all extents in a block group. 3857 * function to relocate all extents in a block group.
3511 */ 3858 */
@@ -3514,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3861 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515 struct reloc_control *rc; 3862 struct reloc_control *rc;
3516 int ret; 3863 int ret;
3864 int rw = 0;
3517 int err = 0; 3865 int err = 0;
3518 3866
3519 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3867 rc = alloc_reloc_control();
3520 if (!rc) 3868 if (!rc)
3521 return -ENOMEM; 3869 return -ENOMEM;
3522 3870
3523 mapping_tree_init(&rc->reloc_root_tree); 3871 rc->extent_root = extent_root;
3524 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3525 INIT_LIST_HEAD(&rc->reloc_roots);
3526 3872
3527 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3873 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3528 BUG_ON(!rc->block_group); 3874 BUG_ON(!rc->block_group);
3529 3875
3530 btrfs_init_workers(&rc->workers, "relocate", 3876 if (!rc->block_group->ro) {
3531 fs_info->thread_pool_size, NULL); 3877 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3532 3878 if (ret) {
3533 rc->extent_root = extent_root; 3879 err = ret;
3534 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3880 goto out;
3881 }
3882 rw = 1;
3883 }
3535 3884
3536 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3537 if (IS_ERR(rc->data_inode)) { 3886 if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3548 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3897 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3549 3898
3550 while (1) { 3899 while (1) {
3551 rc->extents_found = 0;
3552 rc->extents_skipped = 0;
3553
3554 mutex_lock(&fs_info->cleaner_mutex); 3900 mutex_lock(&fs_info->cleaner_mutex);
3555 3901
3556 btrfs_clean_old_snapshots(fs_info->tree_root); 3902 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3559 mutex_unlock(&fs_info->cleaner_mutex); 3905 mutex_unlock(&fs_info->cleaner_mutex);
3560 if (ret < 0) { 3906 if (ret < 0) {
3561 err = ret; 3907 err = ret;
3562 break; 3908 goto out;
3563 } 3909 }
3564 3910
3565 if (rc->extents_found == 0) 3911 if (rc->extents_found == 0)
@@ -3573,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3573 invalidate_mapping_pages(rc->data_inode->i_mapping, 3919 invalidate_mapping_pages(rc->data_inode->i_mapping,
3574 0, -1); 3920 0, -1);
3575 rc->stage = UPDATE_DATA_PTRS; 3921 rc->stage = UPDATE_DATA_PTRS;
3576 } else if (rc->stage == UPDATE_DATA_PTRS &&
3577 rc->extents_skipped >= rc->extents_found) {
3578 iput(rc->data_inode);
3579 rc->data_inode = create_reloc_inode(fs_info,
3580 rc->block_group);
3581 if (IS_ERR(rc->data_inode)) {
3582 err = PTR_ERR(rc->data_inode);
3583 rc->data_inode = NULL;
3584 break;
3585 }
3586 rc->stage = MOVE_DATA_EXTENTS;
3587 rc->found_file_extent = 0;
3588 } 3922 }
3589 } 3923 }
3590 3924
@@ -3597,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3597 WARN_ON(rc->block_group->reserved > 0); 3931 WARN_ON(rc->block_group->reserved > 0);
3598 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3932 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3599out: 3933out:
3934 if (err && rw)
3935 btrfs_set_block_group_rw(extent_root, rc->block_group);
3600 iput(rc->data_inode); 3936 iput(rc->data_inode);
3601 btrfs_stop_workers(&rc->workers);
3602 btrfs_put_block_group(rc->block_group); 3937 btrfs_put_block_group(rc->block_group);
3603 kfree(rc); 3938 kfree(rc);
3604 return err; 3939 return err;
@@ -3609,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3609 struct btrfs_trans_handle *trans; 3944 struct btrfs_trans_handle *trans;
3610 int ret; 3945 int ret;
3611 3946
3612 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3613 3948
3614 memset(&root->root_item.drop_progress, 0, 3949 memset(&root->root_item.drop_progress, 0,
3615 sizeof(root->root_item.drop_progress)); 3950 sizeof(root->root_item.drop_progress));
@@ -3702,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3702 if (list_empty(&reloc_roots)) 4037 if (list_empty(&reloc_roots))
3703 goto out; 4038 goto out;
3704 4039
3705 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4040 rc = alloc_reloc_control();
3706 if (!rc) { 4041 if (!rc) {
3707 err = -ENOMEM; 4042 err = -ENOMEM;
3708 goto out; 4043 goto out;
3709 } 4044 }
3710 4045
3711 mapping_tree_init(&rc->reloc_root_tree);
3712 INIT_LIST_HEAD(&rc->reloc_roots);
3713 btrfs_init_workers(&rc->workers, "relocate",
3714 root->fs_info->thread_pool_size, NULL);
3715 rc->extent_root = root->fs_info->extent_root; 4046 rc->extent_root = root->fs_info->extent_root;
3716 4047
3717 set_reloc_control(rc); 4048 set_reloc_control(rc);
3718 4049
4050 trans = btrfs_join_transaction(rc->extent_root, 1);
4051
4052 rc->merge_reloc_tree = 1;
4053
3719 while (!list_empty(&reloc_roots)) { 4054 while (!list_empty(&reloc_roots)) {
3720 reloc_root = list_entry(reloc_roots.next, 4055 reloc_root = list_entry(reloc_roots.next,
3721 struct btrfs_root, root_list); 4056 struct btrfs_root, root_list);
@@ -3735,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3735 fs_root->reloc_root = reloc_root; 4070 fs_root->reloc_root = reloc_root;
3736 } 4071 }
3737 4072
3738 trans = btrfs_start_transaction(rc->extent_root, 1);
3739 btrfs_commit_transaction(trans, rc->extent_root); 4073 btrfs_commit_transaction(trans, rc->extent_root);
3740 4074
3741 merge_reloc_roots(rc); 4075 merge_reloc_roots(rc);
3742 4076
3743 unset_reloc_control(rc); 4077 unset_reloc_control(rc);
3744 4078
3745 trans = btrfs_start_transaction(rc->extent_root, 1); 4079 trans = btrfs_join_transaction(rc->extent_root, 1);
3746 btrfs_commit_transaction(trans, rc->extent_root); 4080 btrfs_commit_transaction(trans, rc->extent_root);
3747out: 4081out:
3748 if (rc) { 4082 kfree(rc);
3749 btrfs_stop_workers(&rc->workers);
3750 kfree(rc);
3751 }
3752 while (!list_empty(&reloc_roots)) { 4083 while (!list_empty(&reloc_roots)) {
3753 reloc_root = list_entry(reloc_roots.next, 4084 reloc_root = list_entry(reloc_roots.next,
3754 struct btrfs_root, root_list); 4085 struct btrfs_root, root_list);
@@ -3814,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3814 btrfs_put_ordered_extent(ordered); 4145 btrfs_put_ordered_extent(ordered);
3815 return 0; 4146 return 0;
3816} 4147}
4148
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4150 struct btrfs_root *root, struct extent_buffer *buf,
4151 struct extent_buffer *cow)
4152{
4153 struct reloc_control *rc;
4154 struct backref_node *node;
4155 int first_cow = 0;
4156 int level;
4157 int ret;
4158
4159 rc = root->fs_info->reloc_ctl;
4160 if (!rc)
4161 return;
4162
4163 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4164 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4165
4166 level = btrfs_header_level(buf);
4167 if (btrfs_header_generation(buf) <=
4168 btrfs_root_last_snapshot(&root->root_item))
4169 first_cow = 1;
4170
4171 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4172 rc->create_reloc_tree) {
4173 WARN_ON(!first_cow && level == 0);
4174
4175 node = rc->backref_cache.path[level];
4176 BUG_ON(node->bytenr != buf->start &&
4177 node->new_bytenr != buf->start);
4178
4179 drop_node_buffer(node);
4180 extent_buffer_get(cow);
4181 node->eb = cow;
4182 node->new_bytenr = cow->start;
4183
4184 if (!node->pending) {
4185 list_move_tail(&node->list,
4186 &rc->backref_cache.pending[level]);
4187 node->pending = 1;
4188 }
4189
4190 if (first_cow)
4191 __mark_block_processed(rc, node);
4192
4193 if (first_cow && level > 0)
4194 rc->nodes_relocated += buf->len;
4195 }
4196
4197 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4198 ret = replace_file_extents(trans, rc, root, cow);
4199 BUG_ON(ret);
4200 }
4201}
4202
4203/*
4204 * called before creating snapshot. it calculates metadata reservation
4205 * required for relocating tree blocks in the snapshot
4206 */
4207void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4208 struct btrfs_pending_snapshot *pending,
4209 u64 *bytes_to_reserve)
4210{
4211 struct btrfs_root *root;
4212 struct reloc_control *rc;
4213
4214 root = pending->root;
4215 if (!root->reloc_root)
4216 return;
4217
4218 rc = root->fs_info->reloc_ctl;
4219 if (!rc->merge_reloc_tree)
4220 return;
4221
4222 root = root->reloc_root;
4223 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4224 /*
4225 * relocation is in the stage of merging trees. the space
4226 * used by merging a reloc tree is twice the size of
4227 * relocated tree nodes in the worst case. half for cowing
4228 * the reloc tree, half for cowing the fs tree. the space
4229 * used by cowing the reloc tree will be freed after the
4230 * tree is dropped. if we create snapshot, cowing the fs
4231 * tree may use more space than it frees. so we need
4232 * to reserve extra space.
4233 */
4234 *bytes_to_reserve += rc->nodes_relocated;
4235}
4236
4237/*
4238 * called after snapshot is created. migrate block reservation
4239 * and create reloc root for the newly created snapshot
4240 */
4241void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4242 struct btrfs_pending_snapshot *pending)
4243{
4244 struct btrfs_root *root = pending->root;
4245 struct btrfs_root *reloc_root;
4246 struct btrfs_root *new_root;
4247 struct reloc_control *rc;
4248 int ret;
4249
4250 if (!root->reloc_root)
4251 return;
4252
4253 rc = root->fs_info->reloc_ctl;
4254 rc->merging_rsv_size += rc->nodes_relocated;
4255
4256 if (rc->merge_reloc_tree) {
4257 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4258 rc->block_rsv,
4259 rc->nodes_relocated);
4260 BUG_ON(ret);
4261 }
4262
4263 new_root = pending->snap;
4264 reloc_root = create_reloc_root(trans, root->reloc_root,
4265 new_root->root_key.objectid);
4266
4267 __add_reloc_root(reloc_root);
4268 new_root->reloc_root = reloc_root;
4269
4270 if (rc->create_reloc_tree) {
4271 ret = clone_backref_node(trans, rc, root, reloc_root);
4272 BUG_ON(ret);
4273 }
4274}
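
The new btrfs_reloc_pre_snapshot()/btrfs_reloc_post_snapshot() pair above forms a reserve-then-migrate handshake: before the snapshot is created the caller's reservation is inflated by the worst-case COW cost (rc->nodes_relocated); after creation, btrfs_block_rsv_migrate() moves exactly that amount into the relocation reservation. A toy model of the invariant — struct rsv and rsv_migrate() here are illustrative, not kernel types:

    #include <assert.h>
    #include <stdint.h>

    struct rsv { uint64_t reserved; };

    /* move 'bytes' from one reservation to another, all-or-nothing */
    static int rsv_migrate(struct rsv *src, struct rsv *dst, uint64_t bytes)
    {
        if (src->reserved < bytes)
            return -1;
        src->reserved -= bytes;
        dst->reserved += bytes;
        return 0;
    }

    int main(void)
    {
        uint64_t nodes_relocated = 4096;
        struct rsv pending = { 0 }, reloc = { 0 };

        pending.reserved += nodes_relocated;    /* pre-snapshot: over-reserve */
        assert(rsv_migrate(&pending, &reloc, nodes_relocated) == 0);
        assert(pending.reserved == 0 && reloc.reserved == nodes_relocated);
        return 0;
    }

Because the migration never creates or destroys reservation, the snapshot path cannot leak space it did not set aside up front.
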
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
313{ 330{
314 struct btrfs_path *path; 331 struct btrfs_path *path;
315 int ret; 332 int ret;
316 u32 refs;
317 struct btrfs_root_item *ri; 333 struct btrfs_root_item *ri;
318 struct extent_buffer *leaf; 334 struct extent_buffer *leaf;
319 335
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
327 leaf = path->nodes[0]; 343 leaf = path->nodes[0];
328 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); 344 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
329 345
330 refs = btrfs_disk_root_refs(leaf, ri);
331 BUG_ON(refs != 0);
332 ret = btrfs_del_item(trans, root, path); 346 ret = btrfs_del_item(trans, root, path);
333out: 347out:
334 btrfs_free_path(path); 348 btrfs_free_path(path);
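
The btrfs_find_orphan_roots() change above keys off the IS_ERR()/PTR_ERR() idiom: a successful btrfs_read_fs_root_no_name() lookup means the root is still alive and the orphan item is skipped; only -ENOENT falls through to btrfs_find_dead_roots(), and any other error aborts the scan. A compact model of that three-way dispatch — lookup() and its error encoding are illustrative:

    #include <errno.h>
    #include <stdio.h>

    /* model of a pointer-or-errno return, like ERR_PTR()/PTR_ERR() */
    static void *lookup(int key)
    {
        static int alive = 42;
        if (key == 1) return &alive;                 /* found: still alive */
        if (key == 2) return (void *)(long)-ENOENT;  /* definitely gone    */
        return (void *)(long)-EIO;                   /* real failure       */
    }

    static int is_err(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-4095L;
    }

    int main(void)
    {
        for (int key = 1; key <= 3; key++) {
            void *root = lookup(key);
            if (!is_err(root)) { printf("key %d: alive, skip\n", key); continue; }
            long err = (long)root;
            if (err != -ENOENT) { printf("key %d: abort (%ld)\n", key, err); return 1; }
            printf("key %d: dead, reclaim\n", key);
        }
        return 0;
    }
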
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538e..1776dbd8dc98 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
360 */ 360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di))
364 return ERR_CAST(di);
363 if (!di) { 365 if (!di) {
364 /* 366 /*
365 * Ok the default dir item isn't there. This is weird since 367 * Ok the default dir item isn't there. This is weird since
@@ -390,8 +392,8 @@ setup_root:
390 location.offset = 0; 392 location.offset = 0;
391 393
392 inode = btrfs_iget(sb, &location, new_root, &new); 394 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode) 395 if (IS_ERR(inode))
394 return ERR_PTR(-ENOMEM); 396 return ERR_CAST(inode);
395 397
396 /* 398 /*
397 * If we're just mounting the root most subvol put the inode and return 399 * If we're just mounting the root most subvol put the inode and return
@@ -498,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
498 btrfs_start_delalloc_inodes(root, 0); 500 btrfs_start_delalloc_inodes(root, 0);
499 btrfs_wait_ordered_extents(root, 0, 0); 501 btrfs_wait_ordered_extents(root, 0, 0);
500 502
501 trans = btrfs_start_transaction(root, 1); 503 trans = btrfs_start_transaction(root, 0);
502 ret = btrfs_commit_transaction(trans, root); 504 ret = btrfs_commit_transaction(trans, root);
503 return ret; 505 return ret;
504} 506}
@@ -694,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 696 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
695 return -EINVAL; 697 return -EINVAL;
696 698
697 /* recover relocation */ 699 ret = btrfs_cleanup_fs_roots(root->fs_info);
698 ret = btrfs_recover_relocation(root);
699 WARN_ON(ret); 700 WARN_ON(ret);
700 701
701 ret = btrfs_cleanup_fs_roots(root->fs_info); 702 /* recover relocation */
703 ret = btrfs_recover_relocation(root);
702 WARN_ON(ret); 704 WARN_ON(ret);
703 705
704 sb->s_flags &= ~MS_RDONLY; 706 sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +716,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
714 struct list_head *head = &root->fs_info->space_info; 716 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found; 717 struct btrfs_space_info *found;
716 u64 total_used = 0; 718 u64 total_used = 0;
717 u64 data_used = 0;
718 int bits = dentry->d_sb->s_blocksize_bits; 719 int bits = dentry->d_sb->s_blocksize_bits;
719 __be32 *fsid = (__be32 *)root->fs_info->fsid; 720 __be32 *fsid = (__be32 *)root->fs_info->fsid;
720 721
721 rcu_read_lock(); 722 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) { 723 list_for_each_entry_rcu(found, head, list)
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 724 total_used += found->disk_used;
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock(); 725 rcu_read_unlock();
740 726
741 buf->f_namelen = BTRFS_NAME_LEN; 727 buf->f_namelen = BTRFS_NAME_LEN;
742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
743 buf->f_bfree = buf->f_blocks - (total_used >> bits); 729 buf->f_bfree = buf->f_blocks - (total_used >> bits);
744 buf->f_bavail = buf->f_blocks - (data_used >> bits); 730 buf->f_bavail = buf->f_bfree;
745 buf->f_bsize = dentry->d_sb->s_blocksize; 731 buf->f_bsize = dentry->d_sb->s_blocksize;
746 buf->f_type = BTRFS_SUPER_MAGIC; 732 buf->f_type = BTRFS_SUPER_MAGIC;
747 733
@@ -811,7 +797,7 @@ static int btrfs_unfreeze(struct super_block *sb)
811 797
812static const struct super_operations btrfs_super_ops = { 798static const struct super_operations btrfs_super_ops = {
813 .drop_inode = btrfs_drop_inode, 799 .drop_inode = btrfs_drop_inode,
814 .delete_inode = btrfs_delete_inode, 800 .evict_inode = btrfs_evict_inode,
815 .put_super = btrfs_put_super, 801 .put_super = btrfs_put_super,
816 .sync_fs = btrfs_sync_fs, 802 .sync_fs = btrfs_sync_fs,
817 .show_options = btrfs_show_options, 803 .show_options = btrfs_show_options,
@@ -832,11 +818,14 @@ static const struct file_operations btrfs_ctl_fops = {
832}; 818};
833 819
834static struct miscdevice btrfs_misc = { 820static struct miscdevice btrfs_misc = {
835 .minor = MISC_DYNAMIC_MINOR, 821 .minor = BTRFS_MINOR,
836 .name = "btrfs-control", 822 .name = "btrfs-control",
837 .fops = &btrfs_ctl_fops 823 .fops = &btrfs_ctl_fops
838}; 824};
839 825
826MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
827MODULE_ALIAS("devname:btrfs-control");
828
840static int btrfs_interface_init(void) 829static int btrfs_interface_init(void)
841{ 830{
842 return misc_register(&btrfs_misc); 831 return misc_register(&btrfs_misc);
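
After the btrfs_statfs() simplification above, free space comes from a single accumulation of per-space-info disk_used, and f_bavail is reported equal to f_bfree instead of being derived from a separate data-usage estimate. A worked example of the block arithmetic — the sizes are made up, and bits is the block-size shift as in the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t total_bytes = 8ULL << 30;                    /* 8 GiB fs       */
        uint64_t disk_used[] = { 1ULL << 30, 512ULL << 20 };  /* per space_info */
        int bits = 12;                                        /* 4 KiB blocks   */

        uint64_t total_used = 0;
        for (unsigned i = 0; i < sizeof(disk_used) / sizeof(disk_used[0]); i++)
            total_used += disk_used[i];

        uint64_t f_blocks = total_bytes >> bits;
        uint64_t f_bfree  = f_blocks - (total_used >> bits);
        uint64_t f_bavail = f_bfree;   /* no separate data estimate anymore */

        printf("blocks=%llu free=%llu avail=%llu\n",
               (unsigned long long)f_blocks,
               (unsigned long long)f_bfree,
               (unsigned long long)f_bavail);
        return 0;
    }
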
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166}; 166};
167 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
168static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
169 int num_blocks, int type) 178 u64 num_items, int type)
170{ 179{
171 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
172 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
173 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
174 188
175 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
176 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
177 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
178 type == TRANS_USERSPACE))
179 wait_current_trans(root); 191 wait_current_trans(root);
192
180 ret = join_transaction(root); 193 ret = join_transaction(root);
181 BUG_ON(ret); 194 BUG_ON(ret);
182 195
183 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
184 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
185 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
186 h->blocks_used = 0; 202 h->blocks_used = 0;
187 h->block_group = 0; 203 h->block_group = 0;
188 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
189 h->alloc_exclude_start = 0;
190 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
191 207
192 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
193 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
194 226
195 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
196 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
197 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
198 return h; 233 return h;
199} 234}
200 235
201struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
202 int num_blocks) 237 int num_items)
203{ 238{
204 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
205} 240}
206struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
207 int num_blocks) 242 int num_blocks)
208{ 243{
209 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
210} 245}
211 246
212struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
213 int num_blocks) 248 int num_blocks)
214{ 249{
215 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
216} 251}
217 252
218/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
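
The rewritten start_transaction() above allocates first, joins, and then may commit the running transaction and jump back to 'again:' when it is blocked or when btrfs_trans_reserve_metadata() returns -EAGAIN; callers now get an ERR_PTR() instead of a bare pointer or BUG. A stripped-down model of that retry shape — reserve() and commit_running() are invented for illustration:

    #include <errno.h>
    #include <stdio.h>

    static int attempts;
    static int reserve(void) { return ++attempts < 3 ? -EAGAIN : 0; }
    static void commit_running(void) { puts("commit + retry"); }

    /* returns 0 on success or a negative errno, as ERR_PTR(ret) would carry */
    static int start_tx(void)
    {
        int ret;
    again:
        /* ... allocate handle, join the running transaction ... */
        ret = reserve();
        if (ret == -EAGAIN) {
            commit_running();   /* make space, then start over from scratch */
            goto again;
        }
        return ret;             /* 0, or a hard error handed back to caller */
    }

    int main(void)
    {
        printf("start_tx() = %d after %d attempts\n", start_tx(), attempts);
        return 0;
    }

Committing inside the retry (rather than sleeping) is what lets a blocked starter make forward progress: the commit itself releases the space the reservation was waiting for.
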
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
286 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
287} 322}
288 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
289static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
291{ 352{
292 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
293 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
294 int count = 0; 355 int count = 0;
295 356
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
313 count++; 374 count++;
314 } 375 }
315 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
316 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
317 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
318 WARN_ON(cur_trans != trans->transaction);
319 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
320 cur_trans->num_writers--; 393 cur_trans->num_writers--;
321 394
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
603 676
604 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
605 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
606 680
607 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
608 switch_commit_root(root); 682 switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
627int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
628{ 702{
629 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
630 int ret;
631 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
632 unsigned long nr; 706 unsigned long nr;
633 707
634 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
635 if (root->defrag_running)
636 return 0; 709 return 0;
637 trans = btrfs_start_transaction(root, 1); 710
638 while (1) { 711 while (1) {
639 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
640 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
641 nr = trans->blocks_used; 718 nr = trans->blocks_used;
642 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
643 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
644 cond_resched(); 721 cond_resched();
645 722
646 trans = btrfs_start_transaction(root, 1);
647 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
648 break; 724 break;
649 } 725 }
650 root->defrag_running = 0; 726 root->defrag_running = 0;
651 smp_mb(); 727 return ret;
652 btrfs_end_transaction(trans, root);
653 return 0;
654} 728}
655 729
656#if 0 730#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
758 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root; 833 struct btrfs_root *parent_root;
760 struct inode *parent_inode; 834 struct inode *parent_inode;
835 struct dentry *dentry;
761 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
762 struct extent_buffer *old; 837 struct extent_buffer *old;
763 int ret; 838 int ret;
764 u64 objectid; 839 int retries = 0;
765 int namelen; 840 u64 to_reserve = 0;
766 u64 index = 0; 841 u64 index = 0;
767 842 u64 objectid;
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
770 843
771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
772 if (!new_root_item) { 845 if (!new_root_item) {
773 ret = -ENOMEM; 846 pending->error = -ENOMEM;
774 goto fail; 847 goto fail;
775 } 848 }
849
776 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
777 if (ret) 851 if (ret) {
852 pending->error = ret;
778 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
779 867
780 key.objectid = objectid; 868 key.objectid = objectid;
781 /* record when the snapshot was created in key.offset */ 869 key.offset = (u64)-1;
782 key.offset = trans->transid; 870 key.type = BTRFS_ROOT_ITEM_KEY;
783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
784 871
785 memcpy(&pending->root_key, &key, sizeof(key)); 872 trans->block_rsv = &pending->block_rsv;
786 pending->root_key.offset = (u64)-1;
787 873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
788 record_root_in_trans(trans, parent_root); 877 record_root_in_trans(trans, parent_root);
878
789 /* 879 /*
790 * insert the directory item 880 * insert the directory item
791 */ 881 */
792 namelen = strlen(pending->name);
793 ret = btrfs_set_inode_index(parent_inode, &index); 882 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret); 883 BUG_ON(ret);
795 ret = btrfs_insert_dir_item(trans, parent_root, 884 ret = btrfs_insert_dir_item(trans, parent_root,
796 pending->name, namelen, 885 dentry->d_name.name, dentry->d_name.len,
797 parent_inode->i_ino, 886 parent_inode->i_ino, &key,
798 &pending->root_key, BTRFS_FT_DIR, index); 887 BTRFS_FT_DIR, index);
799 BUG_ON(ret); 888 BUG_ON(ret);
800 889
801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
802 ret = btrfs_update_inode(trans, parent_root, parent_inode); 892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
803 BUG_ON(ret); 893 BUG_ON(ret);
804 894
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
815 free_extent_buffer(old); 905 free_extent_buffer(old);
816 906
817 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
819 new_root_item); 909 key.offset = trans->transid;
820 BUG_ON(ret); 910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
821 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
913 BUG_ON(ret);
823 914
824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 915 /*
825 pending->root_key.objectid, 916 * insert root back/forward references
917 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
826 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
828 namelen); 921 dentry->d_name.name, dentry->d_name.len);
829 BUG_ON(ret); 922 BUG_ON(ret);
830 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
831fail: 930fail:
832 kfree(new_root_item); 931 kfree(new_root_item);
833 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
834} 934}
835 935
836/* 936/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
878 return ret; 978 return ret;
879} 979}
880 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
881int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
882 struct btrfs_root *root) 992 struct btrfs_root *root)
883{ 993{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
899 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
900 BUG_ON(ret); 1010 BUG_ON(ret);
901 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
902 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
903 /* 1015 /*
904 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
951 snap_pending = 1; 1063 snap_pending = 1;
952 1064
953 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
954 prepare_to_wait(&cur_trans->writer_wait, &wait,
955 TASK_UNINTERRUPTIBLE);
956
957 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
958 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
959 else if (should_grow) 1068 else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
976 */ 1085 */
977 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
978 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
979 smp_mb(); 1091 smp_mb();
980 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
981 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1103 1215
1104 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1105 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1106 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1107 else 1219 else
1108 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1109 } 1221 }
1110 return 0; 1222 return 0;
1111} 1223}
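
The btrfs_defrag_root() hunk above also replaces a racy smp_mb()-plus-flag check with xchg(): whoever swaps defrag_running from 0 to 1 first owns the run, and every other caller backs off. The same guard in portable C11 atomics, as a sketch:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int defrag_running;

    static int try_defrag(const char *who)
    {
        if (atomic_exchange(&defrag_running, 1)) {
            printf("%s: already running, skip\n", who);
            return 0;
        }
        printf("%s: owns the defrag run\n", who);
        return 1;
    }

    int main(void)
    {
        try_defrag("A");                  /* swaps 0 -> 1, wins      */
        try_defrag("B");                  /* sees 1, backs off       */
        atomic_store(&defrag_running, 0); /* A done: release the flag */
        return 0;
    }

The exchange is a single atomic read-modify-write, so two concurrent callers can never both see the flag clear the way they could with a plain load followed by a store.
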
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
 63 /* extra metadata reservation for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
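
The header change above switches btrfs_start_transaction() from "blocks" to "items": callers state how many tree items they expect to touch (create_reloc_inode() above asks for 6) and the transaction code sizes the metadata reservation from that. A worked example of the idea — the per-item cost used here (nodesize times tree height) is a plausible stand-in, not the kernel's exact sizing formula:

    #include <stdint.h>
    #include <stdio.h>

    /* worst case: one full COW path per item, for every tree level */
    static uint64_t reservation_bytes(uint64_t num_items,
                                      uint64_t nodesize, int max_level)
    {
        return num_items * nodesize * (uint64_t)max_level;
    }

    int main(void)
    {
        printf("%llu bytes for 6 items\n",
               (unsigned long long)reservation_bytes(6, 4096, 8));
        return 0;
    }
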
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
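
With the tree-defrag.c change above, btrfs_defrag_leaves() signals "more work left" by saving its position in root->defrag_progress and returning -EAGAIN, and btrfs_defrag_root() keeps restarting it in fresh transactions until it stops doing so. A minimal model of that resumable-iteration contract — cursor and step() are illustrative:

    #include <errno.h>
    #include <stdio.h>

    static int step(int *cursor, int end)
    {
        /* process a small batch, then yield back to the caller */
        *cursor += 2;
        if (*cursor >= end)
            return 0;        /* finished            */
        return -EAGAIN;      /* call me again later */
    }

    int main(void)
    {
        int cursor = 0, ret;
        do {
            ret = step(&cursor, 7);   /* one short transaction's worth */
            printf("cursor=%d ret=%d\n", cursor, ret);
        } while (ret == -EAGAIN);
        return 0;
    }

Keeping each step inside its own short transaction is the point: the defragger never holds a transaction open across the whole tree, only across one batch.
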
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
135 struct btrfs_root *root) 135 struct btrfs_root *root)
136{ 136{
137 int ret; 137 int ret;
138 int err = 0;
138 139
139 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
140 if (root->log_root) { 141 if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
155 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
156 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
157 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
158 BUG_ON(ret); 159 if (ret)
160 err = ret;
159 } 161 }
160 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
161 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
162 BUG_ON(ret); 164 if (ret)
165 err = ret;
163 } 166 }
164 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
165 root->log_batch++; 168 root->log_batch++;
166 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
167 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
168 return 0; 171 return err;
169} 172}
170 173
171/* 174/*
@@ -376,7 +379,7 @@ insert:
376 BUG_ON(ret); 379 BUG_ON(ret);
377 } 380 }
378 } else if (ret) { 381 } else if (ret) {
379 BUG(); 382 return ret;
380 } 383 }
381 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
382 path->slots[0]); 385 path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1699 1702
1700 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1701 1704
1702 wc->process_func(root, next, wc, ptr_gen);
1703
1704 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1705 path->slots[*level]++; 1708 path->slots[*level]++;
1706 if (wc->free) { 1709 if (wc->free) {
1707 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1734 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1735 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1736 1739
1737 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1738 parent = path->nodes[*level];
1739 else
1740 parent = path->nodes[*level + 1];
1741
1742 bytenr = path->nodes[*level]->start;
1743
1744 blocksize = btrfs_level_size(root, *level);
1745 root_owner = btrfs_header_owner(parent);
1746 root_gen = btrfs_header_generation(parent);
1747
1748 wc->process_func(root, path->nodes[*level], wc,
1749 btrfs_header_generation(path->nodes[*level]));
1750
1751 if (wc->free) {
1752 next = path->nodes[*level];
1753 btrfs_tree_lock(next);
1754 clean_tree_block(trans, root, next);
1755 btrfs_set_lock_blocking(next);
1756 btrfs_wait_tree_block_writeback(next);
1757 btrfs_tree_unlock(next);
1758
1759 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1760 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1761 BUG_ON(ret);
1762 }
1763 free_extent_buffer(path->nodes[*level]);
1764 path->nodes[*level] = NULL;
1765 *level += 1;
1766 1741
1767 cond_resched(); 1742 cond_resched();
1768 return 0; 1743 return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1781 1756
1782 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1783 slot = path->slots[i]; 1758 slot = path->slots[i];
1784 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1785 struct extent_buffer *node; 1760 struct extent_buffer *node;
1786 node = path->nodes[i]; 1761 node = path->nodes[i];
1787 path->slots[i]++; 1762 path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2047 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2048 2023
2049 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2050 BUG_ON(ret);
2051 2025
2052 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2053 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2056 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2057 } 2031 }
2058 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2059 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2060 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2061 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
2129 return 0; 2112 return 0;
2130} 2113}
2131 2114
2132/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2133 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2134 * at commit time of the full transaction
2135 */
2136int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2137{ 2117{
2138 int ret; 2118 int ret;
2139 struct btrfs_root *log;
2140 struct key;
2141 u64 start; 2119 u64 start;
2142 u64 end; 2120 u64 end;
2143 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2145 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2146 }; 2124 };
2147 2125
2148 if (!root->log_root || root->fs_info->log_root_recovering)
2149 return 0;
2150
2151 log = root->log_root;
2152 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2153 BUG_ON(ret); 2127 BUG_ON(ret);
2154 2128
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2162 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2163 } 2137 }
2164 2138
2165 if (log->log_transid > 0) {
2166 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2167 &log->root_key);
2168 BUG_ON(ret);
2169 }
2170 root->log_root = NULL;
2171 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2172 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2173 return 0; 2163 return 0;
2174} 2164}
2175 2165
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2203 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2204 struct btrfs_path *path; 2194 struct btrfs_path *path;
2205 int ret; 2195 int ret;
2196 int err = 0;
2206 int bytes_del = 0; 2197 int bytes_del = 0;
2207 2198
2208 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2219 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2220 name, name_len, -1); 2211 name, name_len, -1);
2221 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2222 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2223 bytes_del += name_len; 2218 bytes_del += name_len;
2224 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2226 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2227 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2228 index, name, name_len, -1); 2223 index, name, name_len, -1);
2229 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2230 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2231 bytes_del += name_len; 2230 bytes_del += name_len;
2232 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2244 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2245 2244
2246 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2247 if (ret == 0) { 2250 if (ret == 0) {
2248 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2249 u64 i_size; 2252 u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2261 ret = 0; 2264 ret = 0;
2262 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2263 } 2266 }
2264 2267fail:
2265 btrfs_free_path(path); 2268 btrfs_free_path(path);
2266 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2267 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2268 2275
2269 return 0; 2276 return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2291 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2292 dirid, &index); 2299 dirid, &index);
2293 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2294 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2295 2306
2296 return ret; 2307 return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2318 else 2329 else
2319 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2320 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2321 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2322 2334
2323 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2324 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2343 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2344 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2345 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2346 int ret; 2359 int ret;
2347 int i; 2360 int i;
2348 int nritems; 2361 int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2405 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2406 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2407 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2408 } 2425 }
2409 } 2426 }
2410 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2432 goto done; 2449 goto done;
2433 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2434 &min_key); 2451 &min_key);
2435 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2436 } 2456 }
2437 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2438 2458
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2454 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2455 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2456 &tmp); 2476 &tmp);
2457 2477 if (ret)
2458 BUG_ON(ret); 2478 err = ret;
2459 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2460 goto done; 2481 goto done;
2461 } 2482 }
2462 } 2483 }
2463done: 2484done:
2464 *last_offset_ret = last_offset;
2465 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2466 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2467 2487
2468 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2469 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2470 first_offset, last_offset); 2490 /*
2471 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2472 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2473} 2501}
2474 2502
2475/* 2503/*
@@ -2501,7 +2529,8 @@ again:
2501 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2502 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2503 &max_key); 2531 &max_key);
2504 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2505 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2506 break; 2535 break;
2507 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2535 2564
2536 while (1) { 2565 while (1) {
2537 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2538 2567 BUG_ON(ret == 0);
2539 if (ret != 1) 2568 if (ret < 0)
2540 break; 2569 break;
2541 2570
2542 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2554 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2555 } 2584 }
2556 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2557 return 0; 2586 return ret;
2558} 2587}
2559 2588
2560static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2587 } 2616 }
2588 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2589 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2590 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2591 2623
2592 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2593 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2660 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2661 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2662 */ 2694 */
2695 ret = 0;
2663 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2664 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2665 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2666 list); 2699 list);
2667 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2668 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2669 list_del(&sums->list); 2702 list_del(&sums->list);
2670 kfree(sums); 2703 kfree(sums);
2671 } 2704 }
2672 return 0; 2705 return ret;
2673} 2706}
2674 2707
2675/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2697 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2698 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2699 u32 size; 2732 u32 size;
2733 int err = 0;
2700 int ret; 2734 int ret;
2701 int nritems; 2735 int nritems;
2702 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2739 } else { 2773 } else {
2740 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2741 } 2775 }
2742 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2743 path->keep_locks = 1; 2780 path->keep_locks = 1;
2744 2781
2745 while (1) { 2782 while (1) {
@@ -2768,7 +2805,10 @@ again:
2768 2805
2769 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2770 ins_nr, inode_only); 2807 ins_nr, inode_only);
2771 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2772 ins_nr = 1; 2812 ins_nr = 1;
2773 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2774next_slot: 2814next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
2784 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2785 ins_start_slot, 2825 ins_start_slot,
2786 ins_nr, inode_only); 2826 ins_nr, inode_only);
2787 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2788 ins_nr = 0; 2831 ins_nr = 0;
2789 } 2832 }
2790 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
2802 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2803 ins_start_slot, 2846 ins_start_slot,
2804 ins_nr, inode_only); 2847 ins_nr, inode_only);
2805 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2806 ins_nr = 0; 2852 ins_nr = 0;
2807 } 2853 }
2808 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
2810 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2811 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2812 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2813 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2814 } 2863 }
2815 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2816 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2817 2867
2818 btrfs_free_path(path); 2868 btrfs_free_path(path);
2819 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2820 return 0; 2870 return err;
2821} 2871}
2822 2872
2823/* 2873/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2942 goto end_no_trans; 2992 goto end_no_trans;
2943 } 2993 }
2944 2994
2945 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2946 2998
2947 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2948 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2949 3002
2950 /* 3003 /*
2951 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2955 */ 3008 */
2956 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2957 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2958 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2959 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2960 3015
2961 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2962 while (1) { 3017 while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2970 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2971 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2972 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2973 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2974 } 3030 }
2975 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2976 break; 3032 break;
2977 3033
2978 parent = parent->d_parent; 3034 parent = parent->d_parent;
2979 } 3035 }
2980no_parent:
2981 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2982 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2983end_no_trans: 3044end_no_trans:
2984 return ret; 3045 return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3020 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3021 BUG_ON(!path); 3082 BUG_ON(!path);
3022 3083
3023 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3024 3085
3025 wc.trans = trans; 3086 wc.trans = trans;
3026 wc.pin = 1; 3087 wc.pin = 1;
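
The tree-log.c hunks above all follow one pattern: each BUG_ON(ret) after a fallible call becomes a check that records the first failure in a local err and jumps to shared cleanup, and btrfs_log_inode_parent() finally maps any error (asserted there to be -ENOSPC) into forcing a full transaction commit. A minimal standalone sketch of that propagation idiom, with hypothetical step_one()/step_two() helpers standing in for the btrfs calls:

#include <stdio.h>

/* Hypothetical stand-ins for the fallible btrfs calls. */
static int step_one(void) { return 0; }
static int step_two(void) { return -28; } /* pretend -ENOSPC */

static int do_work(void)
{
	int err = 0;
	int ret;

	ret = step_one();
	if (ret) {		/* was: BUG_ON(ret); */
		err = ret;
		goto out;
	}
	ret = step_two();
	if (ret) {
		err = ret;
		goto out;
	}
out:
	/* shared cleanup (release paths, drop locks) would go here */
	return err;
}

int main(void)
{
	printf("do_work() = %d\n", do_work());
	return 0;
}

The goto-based exit keeps a single cleanup point, which is why the hunks above can add out_unlock/end_trans labels instead of duplicating the release calls at every failure site.
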
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..dd318ff280b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -258,7 +258,7 @@ loop_lock:
258 258
259 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
260 260
261 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 261 if (cur->bi_rw & REQ_SYNC)
262 num_sync_run++; 262 num_sync_run++;
263 263
264 submit_bio(cur->bi_rw, cur); 264 submit_bio(cur->bi_rw, cur);
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1097 if (!path) 1097 if (!path)
1098 return -ENOMEM; 1098 return -ENOMEM;
1099 1099
1100 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1486 goto error; 1486 goto error;
1487 } 1487 }
1488 1488
1489 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1490 lock_chunks(root);
1491 1491
1492 device->barriers = 1; 1492 device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1751 1751
1752 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1754 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1755 1756
1756 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1757 BUG_ON(!trans); 1758 BUG_ON(!trans);
1758 1759
1759 lock_chunks(root); 1760 lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1925 break; 1926 break;
1926 BUG_ON(ret); 1927 BUG_ON(ret);
1927 1928
1928 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1929 BUG_ON(!trans); 1930 BUG_ON(!trans);
1930 1931
1931 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
2094 } 2095 }
2095 2096
2096 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2097 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2098 if (!trans) {
2099 ret = -ENOMEM;
2100 goto done;
2101 }
2102 lock_chunks(root); 2099 lock_chunks(root);
2103 2100
2104 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
@@ -2654,7 +2651,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2654 int max_errors = 0; 2651 int max_errors = 0;
2655 struct btrfs_multi_bio *multi = NULL; 2652 struct btrfs_multi_bio *multi = NULL;
2656 2653
2657 if (multi_ret && !(rw & (1 << BIO_RW))) 2654 if (multi_ret && !(rw & REQ_WRITE))
2658 stripes_allocated = 1; 2655 stripes_allocated = 1;
2659again: 2656again:
2660 if (multi_ret) { 2657 if (multi_ret) {
@@ -2690,7 +2687,7 @@ again:
2690 mirror_num = 0; 2687 mirror_num = 0;
2691 2688
2692 /* if our multi bio struct is too small, back off and try again */ 2689 /* if our multi bio struct is too small, back off and try again */
2693 if (rw & (1 << BIO_RW)) { 2690 if (rw & REQ_WRITE) {
2694 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2691 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2695 BTRFS_BLOCK_GROUP_DUP)) { 2692 BTRFS_BLOCK_GROUP_DUP)) {
2696 stripes_required = map->num_stripes; 2693 stripes_required = map->num_stripes;
@@ -2700,7 +2697,7 @@ again:
2700 max_errors = 1; 2697 max_errors = 1;
2701 } 2698 }
2702 } 2699 }
2703 if (multi_ret && (rw & (1 << BIO_RW)) && 2700 if (multi_ret && (rw & REQ_WRITE) &&
2704 stripes_allocated < stripes_required) { 2701 stripes_allocated < stripes_required) {
2705 stripes_allocated = map->num_stripes; 2702 stripes_allocated = map->num_stripes;
2706 free_extent_map(em); 2703 free_extent_map(em);
@@ -2736,7 +2733,7 @@ again:
2736 num_stripes = 1; 2733 num_stripes = 1;
2737 stripe_index = 0; 2734 stripe_index = 0;
2738 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2735 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2739 if (unplug_page || (rw & (1 << BIO_RW))) 2736 if (unplug_page || (rw & REQ_WRITE))
2740 num_stripes = map->num_stripes; 2737 num_stripes = map->num_stripes;
2741 else if (mirror_num) 2738 else if (mirror_num)
2742 stripe_index = mirror_num - 1; 2739 stripe_index = mirror_num - 1;
@@ -2747,7 +2744,7 @@ again:
2747 } 2744 }
2748 2745
2749 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2746 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2750 if (rw & (1 << BIO_RW)) 2747 if (rw & REQ_WRITE)
2751 num_stripes = map->num_stripes; 2748 num_stripes = map->num_stripes;
2752 else if (mirror_num) 2749 else if (mirror_num)
2753 stripe_index = mirror_num - 1; 2750 stripe_index = mirror_num - 1;
@@ -2758,7 +2755,7 @@ again:
2758 stripe_index = do_div(stripe_nr, factor); 2755 stripe_index = do_div(stripe_nr, factor);
2759 stripe_index *= map->sub_stripes; 2756 stripe_index *= map->sub_stripes;
2760 2757
2761 if (unplug_page || (rw & (1 << BIO_RW))) 2758 if (unplug_page || (rw & REQ_WRITE))
2762 num_stripes = map->sub_stripes; 2759 num_stripes = map->sub_stripes;
2763 else if (mirror_num) 2760 else if (mirror_num)
2764 stripe_index += mirror_num - 1; 2761 stripe_index += mirror_num - 1;
@@ -2948,7 +2945,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2948 struct btrfs_pending_bios *pending_bios; 2945 struct btrfs_pending_bios *pending_bios;
2949 2946
2950 /* don't bother with additional async steps for reads, right now */ 2947 /* don't bother with additional async steps for reads, right now */
2951 if (!(rw & (1 << BIO_RW))) { 2948 if (!(rw & REQ_WRITE)) {
2952 bio_get(bio); 2949 bio_get(bio);
2953 submit_bio(rw, bio); 2950 submit_bio(rw, bio);
2954 bio_put(bio); 2951 bio_put(bio);
@@ -2967,7 +2964,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2967 bio->bi_rw |= rw; 2964 bio->bi_rw |= rw;
2968 2965
2969 spin_lock(&device->io_lock); 2966 spin_lock(&device->io_lock);
2970 if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) 2967 if (bio->bi_rw & REQ_SYNC)
2971 pending_bios = &device->pending_sync_bios; 2968 pending_bios = &device->pending_sync_bios;
2972 else 2969 else
2973 pending_bios = &device->pending_bios; 2970 pending_bios = &device->pending_bios;
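
The volumes.c hunks replace the old request tests, bio_rw_flagged(bio, BIO_RW_SYNCIO) and rw & (1 << BIO_RW), with direct tests of the unified REQ_SYNC and REQ_WRITE bits in bi_rw. A standalone sketch of the flag-test shape; the bit positions below are illustrative, not the kernel's actual values:

#include <stdio.h>

/* Illustrative bit positions only, not the kernel's REQ_* values. */
#define REQ_WRITE (1UL << 0)
#define REQ_SYNC  (1UL << 1)

static void classify(unsigned long bi_rw)
{
	if (bi_rw & REQ_WRITE)		/* was: rw & (1 << BIO_RW) */
		printf("write%s\n",
		       (bi_rw & REQ_SYNC) ? " (sync)" : "");
	else
		printf("read\n");
}

int main(void)
{
	classify(0);
	classify(REQ_WRITE);
	classify(REQ_WRITE | REQ_SYNC);	/* was: bio_rw_flagged(..., BIO_RW_SYNCIO) */
	return 0;
}
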
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
@@ -282,7 +276,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 276 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 277 * attributes are handled directly.
284 */ 278 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 279const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 280#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 281 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 282 &btrfs_xattr_acl_default_handler,
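
__btrfs_setxattr() above now reserves its metadata through btrfs_start_transaction(root, 2) and detects failure with IS_ERR()/PTR_ERR(), instead of a separate reserve/unreserve pair wrapped around a NULL-checked start. A minimal standalone sketch of the ERR_PTR convention that makes the single check possible, re-implementing the kernel's pointer-encoding helpers purely for illustration:

#include <stdio.h>
#include <stdint.h>

/* Userspace re-implementation of the kernel's pointer-error encoding. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

/* Hypothetical allocator in the shape of btrfs_start_transaction(). */
static void *start_transaction(int fail)
{
	static int dummy_trans;
	return fail ? ERR_PTR(-28) /* pretend -ENOSPC */ : (void *)&dummy_trans;
}

int main(void)
{
	void *trans = start_transaction(1);

	if (IS_ERR(trans)) {	/* one check replaces reserve + NULL test */
		printf("start failed: %ld\n", PTR_ERR(trans));
		return 1;
	}
	printf("transaction started\n");
	return 0;
}
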
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..3e7dca279d1c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
560 return err; 561 return err;
561} 562}
562 563
563static void do_thaw_all(struct work_struct *work) 564static void do_thaw_one(struct super_block *sb, void *unused)
564{ 565{
565 struct super_block *sb;
566 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570}
567 571
568 spin_lock(&sb_lock); 572static void do_thaw_all(struct work_struct *work)
569restart: 573{
570 list_for_each_entry(sb, &super_blocks, s_list) { 574 iterate_supers(do_thaw_one, NULL);
571 sb->s_count++;
572 spin_unlock(&sb_lock);
573 down_read(&sb->s_umount);
574 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
575 printk(KERN_WARNING "Emergency Thaw on %s\n",
576 bdevname(sb->s_bdev, b));
577 up_read(&sb->s_umount);
578 spin_lock(&sb_lock);
579 if (__put_super_and_need_restart(sb))
580 goto restart;
581 }
582 spin_unlock(&sb_lock);
583 kfree(work); 575 kfree(work);
584 printk(KERN_WARNING "Emergency Thaw complete\n"); 576 printk(KERN_WARNING "Emergency Thaw complete\n");
585} 577}
@@ -778,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
778 spin_unlock(lock); 770 spin_unlock(lock);
779 /* 771 /*
780 * Ensure any pending I/O completes so that 772 * Ensure any pending I/O completes so that
781 * ll_rw_block() actually writes the current 773 * write_dirty_buffer() actually writes the
782 * contents - it is a noop if I/O is still in 774 * current contents - it is a noop if I/O is
783 * flight on potentially older contents. 775 * still in flight on potentially older
776 * contents.
784 */ 777 */
785 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); 778 write_dirty_buffer(bh, WRITE_SYNC_PLUG);
786 779
787 /* 780 /*
788 * Kick off IO for the previous mapping. Note 781 * Kick off IO for the previous mapping. Note
@@ -1841,9 +1834,10 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1841} 1834}
1842EXPORT_SYMBOL(page_zero_new_buffers); 1835EXPORT_SYMBOL(page_zero_new_buffers);
1843 1836
1844static int __block_prepare_write(struct inode *inode, struct page *page, 1837int block_prepare_write(struct page *page, unsigned from, unsigned to,
1845 unsigned from, unsigned to, get_block_t *get_block) 1838 get_block_t *get_block)
1846{ 1839{
1840 struct inode *inode = page->mapping->host;
1847 unsigned block_start, block_end; 1841 unsigned block_start, block_end;
1848 sector_t block; 1842 sector_t block;
1849 int err = 0; 1843 int err = 0;
@@ -1916,10 +1910,13 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
1916 if (!buffer_uptodate(*wait_bh)) 1910 if (!buffer_uptodate(*wait_bh))
1917 err = -EIO; 1911 err = -EIO;
1918 } 1912 }
1919 if (unlikely(err)) 1913 if (unlikely(err)) {
1920 page_zero_new_buffers(page, from, to); 1914 page_zero_new_buffers(page, from, to);
1915 ClearPageUptodate(page);
1916 }
1921 return err; 1917 return err;
1922} 1918}
1919EXPORT_SYMBOL(block_prepare_write);
1923 1920
1924static int __block_commit_write(struct inode *inode, struct page *page, 1921static int __block_commit_write(struct inode *inode, struct page *page,
1925 unsigned from, unsigned to) 1922 unsigned from, unsigned to)
@@ -1956,62 +1953,40 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1956 return 0; 1953 return 0;
1957} 1954}
1958 1955
1956int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1957 get_block_t *get_block)
1958{
1959 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
1960
1961 return block_prepare_write(page, start, start + len, get_block);
1962}
1963EXPORT_SYMBOL(__block_write_begin);
1964
1959/* 1965/*
1960 * block_write_begin takes care of the basic task of block allocation and 1966 * block_write_begin takes care of the basic task of block allocation and
1961 * bringing partial write blocks uptodate first. 1967 * bringing partial write blocks uptodate first.
1962 * 1968 *
1963 * If *pagep is not NULL, then block_write_begin uses the locked page 1969 * The filesystem needs to handle block truncation upon failure.
1964 * at *pagep rather than allocating its own. In this case, the page will
1965 * not be unlocked or deallocated on failure.
1966 */ 1970 */
1967int block_write_begin(struct file *file, struct address_space *mapping, 1971int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1968 loff_t pos, unsigned len, unsigned flags, 1972 unsigned flags, struct page **pagep, get_block_t *get_block)
1969 struct page **pagep, void **fsdata,
1970 get_block_t *get_block)
1971{ 1973{
1972 struct inode *inode = mapping->host; 1974 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1973 int status = 0;
1974 struct page *page; 1975 struct page *page;
1975 pgoff_t index; 1976 int status;
1976 unsigned start, end;
1977 int ownpage = 0;
1978 1977
1979 index = pos >> PAGE_CACHE_SHIFT; 1978 page = grab_cache_page_write_begin(mapping, index, flags);
1980 start = pos & (PAGE_CACHE_SIZE - 1); 1979 if (!page)
1981 end = start + len; 1980 return -ENOMEM;
1982
1983 page = *pagep;
1984 if (page == NULL) {
1985 ownpage = 1;
1986 page = grab_cache_page_write_begin(mapping, index, flags);
1987 if (!page) {
1988 status = -ENOMEM;
1989 goto out;
1990 }
1991 *pagep = page;
1992 } else
1993 BUG_ON(!PageLocked(page));
1994 1981
1995 status = __block_prepare_write(inode, page, start, end, get_block); 1982 status = __block_write_begin(page, pos, len, get_block);
1996 if (unlikely(status)) { 1983 if (unlikely(status)) {
1997 ClearPageUptodate(page); 1984 unlock_page(page);
1998 1985 page_cache_release(page);
1999 if (ownpage) { 1986 page = NULL;
2000 unlock_page(page);
2001 page_cache_release(page);
2002 *pagep = NULL;
2003
2004 /*
2005 * prepare_write() may have instantiated a few blocks
2006 * outside i_size. Trim these off again. Don't need
2007 * i_size_read because we hold i_mutex.
2008 */
2009 if (pos + len > inode->i_size)
2010 vmtruncate(inode, inode->i_size);
2011 }
2012 } 1987 }
2013 1988
2014out: 1989 *pagep = page;
2015 return status; 1990 return status;
2016} 1991}
2017EXPORT_SYMBOL(block_write_begin); 1992EXPORT_SYMBOL(block_write_begin);
@@ -2344,7 +2319,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2344 2319
2345 err = cont_expand_zero(file, mapping, pos, bytes); 2320 err = cont_expand_zero(file, mapping, pos, bytes);
2346 if (err) 2321 if (err)
2347 goto out; 2322 return err;
2348 2323
2349 zerofrom = *bytes & ~PAGE_CACHE_MASK; 2324 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2350 if (pos+len > *bytes && zerofrom & (blocksize-1)) { 2325 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
@@ -2352,25 +2327,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2352 (*bytes)++; 2327 (*bytes)++;
2353 } 2328 }
2354 2329
2355 *pagep = NULL; 2330 return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2356 err = block_write_begin(file, mapping, pos, len,
2357 flags, pagep, fsdata, get_block);
2358out:
2359 return err;
2360} 2331}
2361EXPORT_SYMBOL(cont_write_begin); 2332EXPORT_SYMBOL(cont_write_begin);
2362 2333
2363int block_prepare_write(struct page *page, unsigned from, unsigned to,
2364 get_block_t *get_block)
2365{
2366 struct inode *inode = page->mapping->host;
2367 int err = __block_prepare_write(inode, page, from, to, get_block);
2368 if (err)
2369 ClearPageUptodate(page);
2370 return err;
2371}
2372EXPORT_SYMBOL(block_prepare_write);
2373
2374int block_commit_write(struct page *page, unsigned from, unsigned to) 2334int block_commit_write(struct page *page, unsigned from, unsigned to)
2375{ 2335{
2376 struct inode *inode = page->mapping->host; 2336 struct inode *inode = page->mapping->host;
@@ -2389,7 +2349,7 @@ EXPORT_SYMBOL(block_commit_write);
2389 * 2349 *
2390 * We are not allowed to take the i_mutex here so we have to play games to 2350 * We are not allowed to take the i_mutex here so we have to play games to
2391 * protect against truncate races as the page could now be beyond EOF. Because 2351 * protect against truncate races as the page could now be beyond EOF. Because
2392 * vmtruncate() writes the inode size before removing pages, once we have the 2352 * truncate writes the inode size before removing pages, once we have the
2393 * page lock we can determine safely if the page is beyond EOF. If it is not 2353 * page lock we can determine safely if the page is beyond EOF. If it is not
2394 * beyond EOF, then the page is guaranteed safe against truncation until we 2354 * beyond EOF, then the page is guaranteed safe against truncation until we
2395 * unlock the page. 2355 * unlock the page.
@@ -2474,8 +2434,9 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2474/* 2434/*
2475 * On entry, the page is fully not uptodate. 2435 * On entry, the page is fully not uptodate.
2476 * On exit the page is fully uptodate in the areas outside (from,to) 2436 * On exit the page is fully uptodate in the areas outside (from,to)
2437 * The filesystem needs to handle block truncation upon failure.
2477 */ 2438 */
2478int nobh_write_begin(struct file *file, struct address_space *mapping, 2439int nobh_write_begin(struct address_space *mapping,
2479 loff_t pos, unsigned len, unsigned flags, 2440 loff_t pos, unsigned len, unsigned flags,
2480 struct page **pagep, void **fsdata, 2441 struct page **pagep, void **fsdata,
2481 get_block_t *get_block) 2442 get_block_t *get_block)
@@ -2508,8 +2469,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2508 unlock_page(page); 2469 unlock_page(page);
2509 page_cache_release(page); 2470 page_cache_release(page);
2510 *pagep = NULL; 2471 *pagep = NULL;
2511 return block_write_begin(file, mapping, pos, len, flags, pagep, 2472 return block_write_begin(mapping, pos, len, flags, pagep,
2512 fsdata, get_block); 2473 get_block);
2513 } 2474 }
2514 2475
2515 if (PageMappedToDisk(page)) 2476 if (PageMappedToDisk(page))
@@ -2613,9 +2574,6 @@ out_release:
2613 page_cache_release(page); 2574 page_cache_release(page);
2614 *pagep = NULL; 2575 *pagep = NULL;
2615 2576
2616 if (pos + len > inode->i_size)
2617 vmtruncate(inode, inode->i_size);
2618
2619 return ret; 2577 return ret;
2620} 2578}
2621EXPORT_SYMBOL(nobh_write_begin); 2579EXPORT_SYMBOL(nobh_write_begin);
@@ -2955,13 +2913,6 @@ int submit_bh(int rw, struct buffer_head * bh)
2955 BUG_ON(buffer_unwritten(bh)); 2913 BUG_ON(buffer_unwritten(bh));
2956 2914
2957 /* 2915 /*
2958 * Mask in barrier bit for a write (could be either a WRITE or a
2959 * WRITE_SYNC
2960 */
2961 if (buffer_ordered(bh) && (rw & WRITE))
2962 rw |= WRITE_BARRIER;
2963
2964 /*
2965 * Only clear out a write error when rewriting 2916 * Only clear out a write error when rewriting
2966 */ 2917 */
2967 if (test_set_buffer_req(bh) && (rw & WRITE)) 2918 if (test_set_buffer_req(bh) && (rw & WRITE))
@@ -2999,22 +2950,21 @@ EXPORT_SYMBOL(submit_bh);
2999 2950
3000/** 2951/**
3001 * ll_rw_block: low-level access to block devices (DEPRECATED) 2952 * ll_rw_block: low-level access to block devices (DEPRECATED)
3002 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) 2953 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3003 * @nr: number of &struct buffer_heads in the array 2954 * @nr: number of &struct buffer_heads in the array
3004 * @bhs: array of pointers to &struct buffer_head 2955 * @bhs: array of pointers to &struct buffer_head
3005 * 2956 *
3006 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 2957 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3007 * requests an I/O operation on them, either a %READ or a %WRITE. The third 2958 * requests an I/O operation on them, either a %READ or a %WRITE. The third
3008 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers 2959 * %READA option is described in the documentation for generic_make_request()
3009 * are sent to disk. The fourth %READA option is described in the documentation 2960 * which ll_rw_block() calls.
3010 * for generic_make_request() which ll_rw_block() calls.
3011 * 2961 *
3012 * This function drops any buffer that it cannot get a lock on (with the 2962 * This function drops any buffer that it cannot get a lock on (with the
3013 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be 2963 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3014 * clean when doing a write request, and any buffer that appears to be 2964 * request, and any buffer that appears to be up-to-date when doing read
3015 * up-to-date when doing read request. Further it marks as clean buffers that 2965 * request. Further it marks as clean buffers that are processed for
3016 * are processed for writing (the buffer cache won't assume that they are 2966 * writing (the buffer cache won't assume that they are actually clean
3017 * actually clean until the buffer gets unlocked). 2967 * until the buffer gets unlocked).
3018 * 2968 *
3019 * ll_rw_block sets b_end_io to simple completion handler that marks 2969 * ll_rw_block sets b_end_io to simple completion handler that marks
3020 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes 2970 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
@@ -3030,20 +2980,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3030 for (i = 0; i < nr; i++) { 2980 for (i = 0; i < nr; i++) {
3031 struct buffer_head *bh = bhs[i]; 2981 struct buffer_head *bh = bhs[i];
3032 2982
3033 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) 2983 if (!trylock_buffer(bh))
3034 lock_buffer(bh);
3035 else if (!trylock_buffer(bh))
3036 continue; 2984 continue;
3037 2985 if (rw == WRITE) {
3038 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3039 rw == SWRITE_SYNC_PLUG) {
3040 if (test_clear_buffer_dirty(bh)) { 2986 if (test_clear_buffer_dirty(bh)) {
3041 bh->b_end_io = end_buffer_write_sync; 2987 bh->b_end_io = end_buffer_write_sync;
3042 get_bh(bh); 2988 get_bh(bh);
3043 if (rw == SWRITE_SYNC) 2989 submit_bh(WRITE, bh);
3044 submit_bh(WRITE_SYNC, bh);
3045 else
3046 submit_bh(WRITE, bh);
3047 continue; 2990 continue;
3048 } 2991 }
3049 } else { 2992 } else {
@@ -3059,12 +3002,25 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3059} 3002}
3060EXPORT_SYMBOL(ll_rw_block); 3003EXPORT_SYMBOL(ll_rw_block);
3061 3004
3005void write_dirty_buffer(struct buffer_head *bh, int rw)
3006{
3007 lock_buffer(bh);
3008 if (!test_clear_buffer_dirty(bh)) {
3009 unlock_buffer(bh);
3010 return;
3011 }
3012 bh->b_end_io = end_buffer_write_sync;
3013 get_bh(bh);
3014 submit_bh(rw, bh);
3015}
3016EXPORT_SYMBOL(write_dirty_buffer);
3017
3062/* 3018/*
3063 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3019 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3064 * and then start new I/O and then wait upon it. The caller must have a ref on 3020 * and then start new I/O and then wait upon it. The caller must have a ref on
3065 * the buffer_head. 3021 * the buffer_head.
3066 */ 3022 */
3067int sync_dirty_buffer(struct buffer_head *bh) 3023int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3068{ 3024{
3069 int ret = 0; 3025 int ret = 0;
3070 3026
@@ -3073,7 +3029,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
3073 if (test_clear_buffer_dirty(bh)) { 3029 if (test_clear_buffer_dirty(bh)) {
3074 get_bh(bh); 3030 get_bh(bh);
3075 bh->b_end_io = end_buffer_write_sync; 3031 bh->b_end_io = end_buffer_write_sync;
3076 ret = submit_bh(WRITE_SYNC, bh); 3032 ret = submit_bh(rw, bh);
3077 wait_on_buffer(bh); 3033 wait_on_buffer(bh);
3078 if (buffer_eopnotsupp(bh)) { 3034 if (buffer_eopnotsupp(bh)) {
3079 clear_buffer_eopnotsupp(bh); 3035 clear_buffer_eopnotsupp(bh);
@@ -3086,6 +3042,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
3086 } 3042 }
3087 return ret; 3043 return ret;
3088} 3044}
3045EXPORT_SYMBOL(__sync_dirty_buffer);
3046
3047int sync_dirty_buffer(struct buffer_head *bh)
3048{
3049 return __sync_dirty_buffer(bh, WRITE_SYNC);
3050}
3089EXPORT_SYMBOL(sync_dirty_buffer); 3051EXPORT_SYMBOL(sync_dirty_buffer);
3090 3052
3091/* 3053/*
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 2906077ac798..a2603e7c0bb5 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -146,7 +146,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
146 goto error_unsupported; 146 goto error_unsupported;
147 147
148 /* get the cache size and blocksize */ 148 /* get the cache size and blocksize */
149 ret = vfs_statfs(root, &stats); 149 ret = vfs_statfs(&path, &stats);
150 if (ret < 0) 150 if (ret < 0)
151 goto error_unsupported; 151 goto error_unsupported;
152 152
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index c2413561ea75..727caedcdd92 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -552,8 +552,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
552 */ 552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) 553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{ 554{
555 struct fs_struct *fs; 555 struct path path;
556 struct dentry *dir;
557 const struct cred *saved_cred; 556 const struct cred *saved_cred;
558 int ret; 557 int ret;
559 558
@@ -573,24 +572,21 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
573 } 572 }
574 573
575 /* extract the directory dentry from the cwd */ 574 /* extract the directory dentry from the cwd */
576 fs = current->fs; 575 get_fs_pwd(current->fs, &path);
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580 576
581 if (!S_ISDIR(dir->d_inode->i_mode)) 577 if (!S_ISDIR(path.dentry->d_inode->i_mode))
582 goto notdir; 578 goto notdir;
583 579
584 cachefiles_begin_secure(cache, &saved_cred); 580 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args); 581 ret = cachefiles_cull(cache, path.dentry, args);
586 cachefiles_end_secure(cache, saved_cred); 582 cachefiles_end_secure(cache, saved_cred);
587 583
588 dput(dir); 584 path_put(&path);
589 _leave(" = %d", ret); 585 _leave(" = %d", ret);
590 return ret; 586 return ret;
591 587
592notdir: 588notdir:
593 dput(dir); 589 path_put(&path);
594 kerror("cull command requires dirfd to be a directory"); 590 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR; 591 return -ENOTDIR;
596 592
@@ -628,8 +624,7 @@ inval:
628 */ 624 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) 625static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{ 626{
631 struct fs_struct *fs; 627 struct path path;
632 struct dentry *dir;
633 const struct cred *saved_cred; 628 const struct cred *saved_cred;
634 int ret; 629 int ret;
635 630
@@ -649,24 +644,21 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
649 } 644 }
650 645
651 /* extract the directory dentry from the cwd */ 646 /* extract the directory dentry from the cwd */
652 fs = current->fs; 647 get_fs_pwd(current->fs, &path);
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656 648
657 if (!S_ISDIR(dir->d_inode->i_mode)) 649 if (!S_ISDIR(path.dentry->d_inode->i_mode))
658 goto notdir; 650 goto notdir;
659 651
660 cachefiles_begin_secure(cache, &saved_cred); 652 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args); 653 ret = cachefiles_check_in_use(cache, path.dentry, args);
662 cachefiles_end_secure(cache, saved_cred); 654 cachefiles_end_secure(cache, saved_cred);
663 655
664 dput(dir); 656 path_put(&path);
665 //_leave(" = %d", ret); 657 //_leave(" = %d", ret);
666 return ret; 658 return ret;
667 659
668notdir: 660notdir:
669 dput(dir); 661 path_put(&path);
670 kerror("inuse command requires dirfd to be a directory"); 662 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR; 663 return -ENOTDIR;
672 664
@@ -683,6 +675,10 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr) 675 unsigned fnr, unsigned bnr)
684{ 676{
685 struct kstatfs stats; 677 struct kstatfs stats;
678 struct path path = {
679 .mnt = cache->mnt,
680 .dentry = cache->mnt->mnt_root,
681 };
686 int ret; 682 int ret;
687 683
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", 684 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
@@ -697,7 +693,7 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
697 /* find out how many pages of blockdev are available */ 693 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats)); 694 memset(&stats, 0, sizeof(stats));
699 695
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats); 696 ret = vfs_statfs(&path, &stats);
701 if (ret < 0) { 697 if (ret < 0) {
702 if (ret == -EIO) 698 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed"); 699 cachefiles_io_error(cache, "statfs failed");
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index a8cd821226da..bd6bc1bde2d7 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -267,13 +267,6 @@ do { \
267#define dbgprintk(FMT, ...) \ 267#define dbgprintk(FMT, ...) \
268 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) 268 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
269 269
270/* make sure we maintain the format strings, even when debugging is disabled */
271static inline void _dbprintk(const char *fmt, ...)
272 __attribute__((format(printf, 1, 2)));
273static inline void _dbprintk(const char *fmt, ...)
274{
275}
276
277#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 270#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
278#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 271#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
279#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) 272#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
@@ -304,9 +297,9 @@ do { \
304} while (0) 297} while (0)
305 298
306#else 299#else
307#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 300#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
308#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 301#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
309#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) 302#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
310#endif 303#endif
311 304
312#if 1 /* defined(__KDEBUGALL) */ 305#if 1 /* defined(__KDEBUGALL) */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a7840bf42c..42c7fafc8bfe 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
37 37
38 printk(KERN_ERR "%sobject: OBJ%x\n", 38 printk(KERN_ERR "%sobject: OBJ%x\n",
39 prefix, object->fscache.debug_id); 39 prefix, object->fscache.debug_id);
40 printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", 40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
41 prefix, fscache_object_states[object->fscache.state], 41 prefix, fscache_object_states[object->fscache.state],
42 object->fscache.flags, object->fscache.work.flags, 42 object->fscache.flags, work_busy(&object->fscache.work),
43 object->fscache.events, 43 object->fscache.events,
44 object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); 44 object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
45 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", 45 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -212,7 +212,7 @@ wait_for_old_object:
212 212
213 /* if the object we're waiting for is queued for processing, 213 /* if the object we're waiting for is queued for processing,
214 * then just put ourselves on the queue behind it */ 214 * then just put ourselves on the queue behind it */
215 if (slow_work_is_queued(&xobject->fscache.work)) { 215 if (work_pending(&xobject->fscache.work)) {
216 _debug("queue OBJ%x behind OBJ%x immediately", 216 _debug("queue OBJ%x behind OBJ%x immediately",
217 object->fscache.debug_id, 217 object->fscache.debug_id,
218 xobject->fscache.debug_id); 218 xobject->fscache.debug_id);
@@ -220,8 +220,7 @@ wait_for_old_object:
220 } 220 }
221 221
222 /* otherwise we sleep until either the object we're waiting for 222 /* otherwise we sleep until either the object we're waiting for
223 * is done, or the slow-work facility wants the thread back to 223 * is done, or the fscache_object is congested */
224 * do other work */
225 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); 224 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
226 init_wait(&wait); 225 init_wait(&wait);
227 requeue = false; 226 requeue = false;
@@ -229,8 +228,8 @@ wait_for_old_object:
229 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); 228 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
230 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) 229 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
231 break; 230 break;
232 requeue = slow_work_sleep_till_thread_needed( 231
233 &object->fscache.work, &timeout); 232 requeue = fscache_object_sleep_till_congested(&timeout);
234 } while (timeout > 0 && !requeue); 233 } while (timeout > 0 && !requeue);
235 finish_wait(wq, &wait); 234 finish_wait(wq, &wait);
236 235
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 0f0d41fbb03f..0e3c0924cc3a 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
422 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; 422 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
423 423
424 op->op.flags &= FSCACHE_OP_KEEP_FLAGS; 424 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
425 op->op.flags |= FSCACHE_OP_FAST; 425 op->op.flags |= FSCACHE_OP_ASYNC;
426 op->op.processor = cachefiles_read_copier; 426 op->op.processor = cachefiles_read_copier;
427 427
428 pagevec_init(&pagevec, 0); 428 pagevec_init(&pagevec, 0);
@@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
729 pagevec_init(&pagevec, 0); 729 pagevec_init(&pagevec, 0);
730 730
731 op->op.flags &= FSCACHE_OP_KEEP_FLAGS; 731 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
732 op->op.flags |= FSCACHE_OP_FAST; 732 op->op.flags |= FSCACHE_OP_ASYNC;
733 op->op.processor = cachefiles_read_copier; 733 op->op.processor = cachefiles_read_copier;
734 734
735 INIT_LIST_HEAD(&backpages); 735 INIT_LIST_HEAD(&backpages);
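
The cachefiles hunks swap the hand-rolled read_lock()/dget() sequence for get_fs_pwd(), which hands back a pinned struct path, and pass a struct path (mount plus dentry) to the reworked vfs_statfs(). An illustrative fragment, not a standalone module, combining the two patterns as they appear above; sketch_statfs_cwd is a hypothetical name:

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/fs_struct.h>
#include <linux/path.h>
#include <linux/statfs.h>

static int sketch_statfs_cwd(struct kstatfs *stats)
{
	struct path path;
	int ret;

	get_fs_pwd(current->fs, &path);	/* pins dentry and vfsmount */
	ret = vfs_statfs(&path, stats);	/* new struct-path calling style */
	path_put(&path);		/* drops both references */
	return ret;
}

A single path_put() replaces the bare dput() the old code used, since the path now also holds a vfsmount reference.
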
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 04b8280582a9..0fcd2640c23f 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -2,7 +2,8 @@ config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C 4 select LIBCRC32C
5 select CONFIG_CRYPTO_AES 5 select CRYPTO_AES
6 select CRYPTO
6 help 7 help
7 Choose Y or M here to include support for mounting the 8 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely 9 experimental Ceph distributed file system. Ceph is an extremely
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 6a660e610be8..278e1172600d 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@ ifneq ($(KERNELRELEASE),)
6 6
7obj-$(CONFIG_CEPH_FS) += ceph.o 7obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \ 12 mds_client.o mdsmap.o \
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..efbc604001c8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
87 87
88 /* dirty the head */ 88 /* dirty the head */
89 spin_lock(&inode->i_lock); 89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0) 90 if (ci->i_head_snapc == NULL)
91 ci->i_head_snapc = ceph_get_snap_context(snapc); 91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head; 92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0) 93 if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page)
105 spin_lock_irq(&mapping->tree_lock); 105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */ 106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page)); 107 WARN_ON_ONCE(!PageUptodate(page));
108 108 account_page_dirtied(page, page->mapping);
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree, 109 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY); 110 page_index(page), PAGECACHE_TAG_DIRTY);
117 111
@@ -274,7 +268,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 268 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0; 269 int rc = 0;
276 struct page **pages; 270 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset; 271 loff_t offset;
279 u64 len; 272 u64 len;
280 273
@@ -297,8 +290,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
297 if (rc < 0) 290 if (rc < 0)
298 goto out; 291 goto out;
299 292
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0; 293 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 294 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page = 295 struct page *page =
@@ -312,7 +303,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
312 zero_user_segment(page, s, PAGE_CACHE_SIZE); 303 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 } 304 }
314 305
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 306 if (add_to_page_cache_lru(page, mapping, page->index,
307 GFP_NOFS)) {
316 page_cache_release(page); 308 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n", 309 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page); 310 inode, page);
@@ -323,10 +315,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
323 flush_dcache_page(page); 315 flush_dcache_page(page);
324 SetPageUptodate(page); 316 SetPageUptodate(page);
325 unlock_page(page); 317 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0) 318 page_cache_release(page);
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 } 319 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0; 320 rc = 0;
331 321
332out: 322out:
@@ -356,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
356 break; 346 break;
357 } 347 }
358 } 348 }
359 if (!snapc && ci->i_head_snapc) { 349 if (!snapc && ci->i_wrbuffer_ref_head) {
360 snapc = ceph_get_snap_context(ci->i_head_snapc); 350 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n", 351 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head); 352 snapc, ci->i_wrbuffer_ref_head);
@@ -421,8 +411,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
421 if (i_size < page_off + len) 411 if (i_size < page_off + len)
422 len = i_size - page_off; 412 len = i_size - page_off;
423 413
424 dout("writepage %p page %p index %lu on %llu~%u\n", 414 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
425 inode, page, page->index, page_off, len); 415 inode, page, page->index, page_off, len, snapc);
426 416
427 writeback_stat = atomic_long_inc_return(&client->writeback_count); 417 writeback_stat = atomic_long_inc_return(&client->writeback_count);
428 if (writeback_stat > 418 if (writeback_stat >
@@ -557,7 +547,7 @@ static void writepages_finish(struct ceph_osd_request *req,
557 * page truncation thread, possibly losing some data that 547 * page truncation thread, possibly losing some data that
558 * raced its way in 548 * raced its way in
559 */ 549 */
560 if ((issued & CEPH_CAP_FILE_CACHE) == 0) 550 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
561 generic_error_remove_page(inode->i_mapping, page); 551 generic_error_remove_page(inode->i_mapping, page);
562 552
563 unlock_page(page); 553 unlock_page(page);
@@ -568,7 +558,7 @@ static void writepages_finish(struct ceph_osd_request *req,
568 ceph_release_pages(req->r_pages, req->r_num_pages); 558 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool) 559 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages, 560 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool); 561 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
572 else 562 else
573 kfree(req->r_pages); 563 kfree(req->r_pages);
574 ceph_osdc_put_request(req); 564 ceph_osdc_put_request(req);
@@ -776,7 +766,8 @@ get_more_pages:
776 /* ok */ 766 /* ok */
777 if (locked_pages == 0) { 767 if (locked_pages == 0) {
778 /* prepare async write request */ 768 /* prepare async write request */
779 offset = page->index << PAGE_CACHE_SHIFT; 769 offset = (unsigned long long)page->index
770 << PAGE_CACHE_SHIFT;
780 len = wsize; 771 len = wsize;
781 req = ceph_osdc_new_request(&client->osdc, 772 req = ceph_osdc_new_request(&client->osdc,
782 &ci->i_layout, 773 &ci->i_layout,
@@ -802,9 +793,12 @@ get_more_pages:
802 dout("%p will write page %p idx %lu\n", 793 dout("%p will write page %p idx %lu\n",
803 inode, page, page->index); 794 inode, page, page->index);
804 795
805 writeback_stat = atomic_long_inc_return(&client->writeback_count); 796 writeback_stat =
806 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { 797 atomic_long_inc_return(&client->writeback_count);
807 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 798 if (writeback_stat > CONGESTION_ON_THRESH(
799 client->mount_args->congestion_kb)) {
800 set_bdi_congested(&client->backing_dev_info,
801 BLK_RW_ASYNC);
808 } 802 }
809 803
810 set_page_writeback(page); 804 set_page_writeback(page);
@@ -1041,7 +1035,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1041 *pagep = page; 1035 *pagep = page;
1042 1036
1043 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1037 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1044 inode, page, (int)pos, (int)len); 1038 inode, page, (int)pos, (int)len);
1045 1039
1046 r = ceph_update_writeable_page(file, pos, len, page); 1040 r = ceph_update_writeable_page(file, pos, len, page);
1047 } while (r == -EAGAIN); 1041 } while (r == -EAGAIN);
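
ceph_readpages() above now uses add_to_page_cache_lru(), which inserts the page and places it on the LRU in one call, so the pagevec batching (pagevec_init/pagevec_add/pagevec_lru_add_file) drops out. An illustrative per-page helper, a fragment rather than a standalone module, in the shape of the new loop; sketch_insert_page is a hypothetical name and the data is assumed to be in the page already:

#include <linux/pagemap.h>
#include <linux/highmem.h>

static int sketch_insert_page(struct address_space *mapping,
			      struct page *page)
{
	int err;

	/* Inserts into the page cache *and* onto the LRU in one step. */
	err = add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS);
	if (err) {
		page_cache_release(page);	/* insertion failed, drop ref */
		return err;
	}
	flush_dcache_page(page);
	SetPageUptodate(page);
	unlock_page(page);
	page_cache_release(page);	/* cache and LRU keep their own refs */
	return err;
}
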
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
index 67b2c030924b..eb2a666b0be7 100644
--- a/fs/ceph/armor.c
+++ b/fs/ceph/armor.c
@@ -1,11 +1,15 @@
1 1
2#include <linux/errno.h> 2#include <linux/errno.h>
3 3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
4/* 7/*
5 * base64 encode/decode. 8 * base64 encode/decode.
6 */ 9 */
7 10
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9 13
10static int encode_bits(int c) 14static int encode_bits(int c)
11{ 15{
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..6d2e30600627 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,6 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h> 4#include <linux/err.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7 6
@@ -21,7 +20,7 @@ static u32 supported_protocols[] = {
21 CEPH_AUTH_CEPHX 20 CEPH_AUTH_CEPHX
22}; 21};
23 22
24int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) 23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
25{ 24{
26 switch (protocol) { 25 switch (protocol) {
27 case CEPH_AUTH_NONE: 26 case CEPH_AUTH_NONE:
@@ -134,8 +133,8 @@ bad:
134 return -ERANGE; 133 return -ERANGE;
135} 134}
136 135
137int ceph_build_auth_request(struct ceph_auth_client *ac, 136static int ceph_build_auth_request(struct ceph_auth_client *ac,
138 void *msg_buf, size_t msg_len) 137 void *msg_buf, size_t msg_len)
139{ 138{
140 struct ceph_mon_request_header *monhdr = msg_buf; 139 struct ceph_mon_request_header *monhdr = msg_buf;
141 void *p = monhdr + 1; 140 void *p = monhdr + 1;
@@ -150,7 +149,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
150 149
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) { 151 if (ret < 0) {
153 pr_err("error %d building request\n", ret); 152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret; 154 return ret;
155 } 155 }
156 dout(" built request %d bytes\n", ret); 156 dout(" built request %d bytes\n", ret);
@@ -229,7 +229,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
229 if (ret == -EAGAIN) { 229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 231 } else if (ret) {
232 pr_err("authentication error %d\n", ret); 232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret; 233 return ret;
234 } 234 }
235 return 0; 235 return 0;
@@ -246,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
246 if (!ac->protocol) 246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len); 247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops); 248 BUG_ON(!ac->ops);
249 if (!ac->ops->is_authenticated(ac)) 249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len); 250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0; 251 return 0;
252} 252}
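
The behavioral change in ceph_build_auth() above is the switch from `!is_authenticated` to `should_authenticate`: a client can be fully authenticated yet still want to rebuild an auth request, e.g. to renew aging tickets. A self-contained sketch of that split, with simplified stand-ins for the kernel structs (the ticket_ttl field and the 30-second threshold are invented):

#include <stdio.h>

struct auth_client;

struct auth_ops {
	const char *name;
	int (*is_authenticated)(struct auth_client *ac);
	int (*should_authenticate)(struct auth_client *ac);
};

struct auth_client {
	const struct auth_ops *ops;
	int ticket_ttl;		/* seconds of ticket validity left */
};

static int x_is_authenticated(struct auth_client *ac)
{
	return ac->ticket_ttl > 0;
}

/* Renew early: still authenticated, but the ticket is getting old. */
static int x_should_authenticate(struct auth_client *ac)
{
	return ac->ticket_ttl < 30;
}

static const struct auth_ops x_ops = {
	.name = "x",
	.is_authenticated = x_is_authenticated,
	.should_authenticate = x_should_authenticate,
};

int main(void)
{
	struct auth_client ac = { .ops = &x_ops, .ticket_ttl = 10 };

	/* Authenticated, yet a new auth request would still be built. */
	printf("authenticated=%d should=%d\n",
	       ac.ops->is_authenticated(&ac),
	       ac.ops->should_authenticate(&ac));
	return 0;
}
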
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
@@ -22,6 +24,12 @@ struct ceph_auth_client_ops {
22 int (*is_authenticated)(struct ceph_auth_client *ac); 24 int (*is_authenticated)(struct ceph_auth_client *ac);
23 25
24 /* 26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
25 * build requests and process replies during monitor 33 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build 34 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request. 35 * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
31 return !xi->starting; 31 return !xi->starting;
32} 32}
33 33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
34/* 41/*
35 * the generic auth code decode the global_id, and we carry no actual 42 * the generic auth code decode the global_id, and we carry no actual
36 * authenticate state, so nothing happens here. 43 * authenticate state, so nothing happens here.
@@ -94,9 +101,11 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
94} 101}
95 102
96static const struct ceph_auth_client_ops ceph_auth_none_ops = { 103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
97 .reset = reset, 105 .reset = reset,
98 .destroy = destroy, 106 .destroy = destroy,
99 .is_authenticated = is_authenticated, 107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
100 .handle_reply = handle_reply, 109 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer, 110 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer, 111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..a2d002cbdec2 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
27 return (ac->want_keys & xi->have_keys) == ac->want_keys; 27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28} 28}
29 29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
30static int ceph_x_encrypt_buflen(int ilen) 41static int ceph_x_encrypt_buflen(int ilen)
31{ 42{
32 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + 43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
@@ -76,8 +87,8 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret,
76/* 87/*
77 * get existing (or insert new) ticket handler 88 * get existing (or insert new) ticket handler
78 */ 89 */
79struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac, 90static struct ceph_x_ticket_handler *
80 int service) 91get_ticket_handler(struct ceph_auth_client *ac, int service)
81{ 92{
82 struct ceph_x_ticket_handler *th; 93 struct ceph_x_ticket_handler *th;
83 struct ceph_x_info *xi = ac->private; 94 struct ceph_x_info *xi = ac->private;
@@ -127,7 +138,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
127 int ret; 138 int ret;
128 char *dbuf; 139 char *dbuf;
129 char *ticket_buf; 140 char *ticket_buf;
130 u8 struct_v; 141 u8 reply_struct_v;
131 142
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); 143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf) 144 if (!dbuf)
@@ -139,14 +150,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
139 goto out_dbuf; 150 goto out_dbuf;
140 151
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad); 152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p); 153 reply_struct_v = ceph_decode_8(&p);
143 if (struct_v != 1) 154 if (reply_struct_v != 1)
144 goto bad; 155 goto bad;
145 num = ceph_decode_32(&p); 156 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num); 157 dout("%d tickets\n", num);
147 while (num--) { 158 while (num--) {
148 int type; 159 int type;
149 u8 struct_v; 160 u8 tkt_struct_v, blob_struct_v;
150 struct ceph_x_ticket_handler *th; 161 struct ceph_x_ticket_handler *th;
151 void *dp, *dend; 162 void *dp, *dend;
152 int dlen; 163 int dlen;
@@ -165,8 +176,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
165 type = ceph_decode_32(&p); 176 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); 177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167 178
168 struct_v = ceph_decode_8(&p); 179 tkt_struct_v = ceph_decode_8(&p);
169 if (struct_v != 1) 180 if (tkt_struct_v != 1)
170 goto bad; 181 goto bad;
171 182
172 th = get_ticket_handler(ac, type); 183 th = get_ticket_handler(ac, type);
@@ -186,8 +197,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
186 dend = dbuf + dlen; 197 dend = dbuf + dlen;
187 dp = dbuf; 198 dp = dbuf;
188 199
189 struct_v = ceph_decode_8(&dp); 200 tkt_struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1) 201 if (tkt_struct_v != 1)
191 goto bad; 202 goto bad;
192 203
193 memcpy(&old_key, &th->session_key, sizeof(old_key)); 204 memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +235,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
224 tpend = tp + dlen; 235 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen); 236 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); 237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp); 238 blob_struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp); 239 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); 240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret) 241 if (ret)
@@ -365,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
365 376
366 th = get_ticket_handler(ac, service); 377 th = get_ticket_handler(ac, service);
367 378
368 if (!th) { 379 if (IS_ERR(th)) {
369 *pneed |= service; 380 *pneed |= service;
370 continue; 381 continue;
371 } 382 }
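
The `!th` to `IS_ERR(th)` conversions in these hunks follow from get_ticket_handler() now reporting failure as an ERR_PTR-encoded errno rather than NULL. A freestanding sketch of that idiom, reimplemented for userspace (the kernel's versions live in <linux/err.h>; get_handler() and its -ENOMEM failure are invented for the demo):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static void *ERR_PTR(long error)
{
	return (void *)error;
}

static long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct ticket_handler { int service; };

static struct ticket_handler handlers[4];

/* Invented lookup: reports failure as -ENOMEM instead of NULL. */
static struct ticket_handler *get_handler(int service)
{
	if (service < 0 || service >= 4)
		return ERR_PTR(-ENOMEM);
	handlers[service].service = service;
	return &handlers[service];
}

int main(void)
{
	struct ticket_handler *th = get_handler(-1);

	/* one pointer carries either a valid object or an errno */
	if (IS_ERR(th))
		printf("error %ld\n", PTR_ERR(th));	/* prints error -12 */
	return 0;
}
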
@@ -388,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
388 struct ceph_x_ticket_handler *th = 399 struct ceph_x_ticket_handler *th =
389 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
390 401
402 if (IS_ERR(th))
403 return PTR_ERR(th);
404
391 ceph_x_validate_tickets(ac, &need); 405 ceph_x_validate_tickets(ac, &need);
392 406
393 dout("build_request want %x have %x need %x\n", 407 dout("build_request want %x have %x need %x\n",
@@ -418,7 +432,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
418 auth->struct_v = 1; 432 auth->struct_v = 1;
419 auth->key = 0; 433 auth->key = 0;
420 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) 434 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
421 auth->key ^= *u; 435 auth->key ^= *(__le64 *)u;
422 dout(" server_challenge %llx client_challenge %llx key %llx\n", 436 dout(" server_challenge %llx client_challenge %llx key %llx\n",
423 xi->server_challenge, le64_to_cpu(auth->client_challenge), 437 xi->server_challenge, le64_to_cpu(auth->client_challenge),
424 le64_to_cpu(auth->key)); 438 le64_to_cpu(auth->key));
@@ -439,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
439 return -ERANGE; 453 return -ERANGE;
440 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); 454 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
441 455
442 BUG_ON(!th);
443 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); 456 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
444 if (ret) 457 if (ret)
445 return ret; 458 return ret;
@@ -482,7 +495,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
482 return -EAGAIN; 495 return -EAGAIN;
483 } 496 }
484 497
485 op = le32_to_cpu(head->op); 498 op = le16_to_cpu(head->op);
486 result = le32_to_cpu(head->result); 499 result = le32_to_cpu(head->result);
487 dout("handle_reply op %d result %d\n", op, result); 500 dout("handle_reply op %d result %d\n", op, result);
488 switch (op) { 501 switch (op) {
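
The `le16_to_cpu(head->op)` fix above matters on big-endian hosts: head->op is a 16-bit little-endian field (it is written with cpu_to_le16() elsewhere in this diff), so decoding it with le32_to_cpu() pulls in two extra bytes and corrupts the opcode. A portable sketch of the difference; unlike the kernel macros, these invented helpers take a pointer into the wire buffer:

#include <stdint.h>
#include <stdio.h>

/* Read a 16-bit little-endian field from a wire buffer. */
static uint16_t get_le16(const void *p)
{
	const uint8_t *b = p;
	return (uint16_t)(b[0] | (b[1] << 8));
}

/* Read a 32-bit little-endian field from a wire buffer. */
static uint32_t get_le32(const void *p)
{
	const uint8_t *b = p;
	return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
	       ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

int main(void)
{
	/* a 16-bit op (0x0102) followed by unrelated header bytes */
	const uint8_t wire[4] = { 0x02, 0x01, 0xaa, 0xbb };

	printf("16-bit read: 0x%x\n", get_le16(wire));	/* 0x102 */
	printf("32-bit read: 0x%x\n", get_le32(wire));	/* 0xbbaa0102: wrong */
	return 0;
}
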
@@ -494,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
494 507
495 case CEPHX_GET_PRINCIPAL_SESSION_KEY: 508 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
496 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 509 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
497 BUG_ON(!th); 510 if (IS_ERR(th))
511 return PTR_ERR(th);
498 ret = ceph_x_proc_ticket_reply(ac, &th->session_key, 512 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
499 buf + sizeof(*head), end); 513 buf + sizeof(*head), end);
500 break; 514 break;
@@ -552,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
552 void *end = p + sizeof(au->reply_buf); 566 void *end = p + sizeof(au->reply_buf);
553 567
554 th = get_ticket_handler(ac, au->service); 568 th = get_ticket_handler(ac, au->service);
555 if (!th) 569 if (IS_ERR(th))
556 return -EIO; /* hrm! */ 570 return PTR_ERR(th);
557 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); 571 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
558 if (ret < 0) 572 if (ret < 0)
559 return ret; 573 return ret;
@@ -602,6 +616,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
602 remove_ticket_handler(ac, th); 616 remove_ticket_handler(ac, th);
603 } 617 }
604 618
619 if (xi->auth_authorizer.buf)
620 ceph_buffer_put(xi->auth_authorizer.buf);
621
605 kfree(ac->private); 622 kfree(ac->private);
606 ac->private = NULL; 623 ac->private = NULL;
607} 624}
@@ -612,13 +629,15 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
612 struct ceph_x_ticket_handler *th; 629 struct ceph_x_ticket_handler *th;
613 630
614 th = get_ticket_handler(ac, peer_type); 631 th = get_ticket_handler(ac, peer_type);
615 if (th && !IS_ERR(th)) 632 if (!IS_ERR(th))
616 remove_ticket_handler(ac, th); 633 remove_ticket_handler(ac, th);
617} 634}
618 635
619 636
620static const struct ceph_auth_client_ops ceph_x_ops = { 637static const struct ceph_auth_client_ops ceph_x_ops = {
638 .name = "x",
621 .is_authenticated = ceph_x_is_authenticated, 639 .is_authenticated = ceph_x_is_authenticated,
640 .should_authenticate = ceph_x_should_authenticate,
622 .build_request = ceph_x_build_request, 641 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply, 642 .handle_reply = ceph_x_handle_reply,
624 .create_authorizer = ceph_x_create_authorizer, 643 .create_authorizer = ceph_x_create_authorizer,
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index c67535d70aa6..cd39f17021de 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -47,22 +47,6 @@ void ceph_buffer_release(struct kref *kref)
47 kfree(b); 47 kfree(b);
48} 48}
49 49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) 50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{ 51{
68 size_t len; 52 size_t len;
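
The ceph_buffer_alloc() removed above (it moved elsewhere in the tree) illustrates a common kernel pattern: quietly try the cheap physically-contiguous allocator first, fall back to the virtually-contiguous one for large buffers, and remember which path succeeded so the matching free can be used. A rough userspace analogue with malloc() and mmap() standing in for kmalloc() and __vmalloc(); the struct and function names are invented:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

struct buffer {
	void *base;
	size_t len;
	int is_mmap;	/* stands in for ceph_buffer's is_vmalloc flag */
};

static int buffer_alloc(struct buffer *b, size_t len)
{
	b->base = malloc(len);	/* cheap contiguous path first */
	b->is_mmap = 0;
	if (!b->base) {
		b->base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (b->base == MAP_FAILED) {
			b->base = NULL;
			return -1;	/* -ENOMEM in the original */
		}
		b->is_mmap = 1;
	}
	b->len = len;
	return 0;
}

static void buffer_free(struct buffer *b)
{
	if (b->is_mmap)		/* release with the allocator that succeeded */
		munmap(b->base, b->len);
	else
		free(b->base);
}

int main(void)
{
	struct buffer b;

	if (buffer_alloc(&b, 4096) == 0) {
		memset(b.base, 0, b.len);
		printf("allocated %zu bytes (mmap=%d)\n", b.len, b.is_mmap);
		buffer_free(&b);
	}
	return 0;
}
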
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..5e9da996a151 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -113,58 +113,41 @@ const char *ceph_cap_string(int caps)
113 return cap_str[i]; 113 return cap_str[i];
114} 114}
115 115
116/* 116void ceph_caps_init(struct ceph_mds_client *mdsc)
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_caps_reservations. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{ 117{
137 INIT_LIST_HEAD(&caps_list); 118 INIT_LIST_HEAD(&mdsc->caps_list);
138 spin_lock_init(&caps_list_lock); 119 spin_lock_init(&mdsc->caps_list_lock);
139} 120}
140 121
141void ceph_caps_finalize(void) 122void ceph_caps_finalize(struct ceph_mds_client *mdsc)
142{ 123{
143 struct ceph_cap *cap; 124 struct ceph_cap *cap;
144 125
145 spin_lock(&caps_list_lock); 126 spin_lock(&mdsc->caps_list_lock);
146 while (!list_empty(&caps_list)) { 127 while (!list_empty(&mdsc->caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 128 cap = list_first_entry(&mdsc->caps_list,
129 struct ceph_cap, caps_item);
148 list_del(&cap->caps_item); 130 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap); 131 kmem_cache_free(ceph_cap_cachep, cap);
150 } 132 }
151 caps_total_count = 0; 133 mdsc->caps_total_count = 0;
152 caps_avail_count = 0; 134 mdsc->caps_avail_count = 0;
153 caps_use_count = 0; 135 mdsc->caps_use_count = 0;
154 caps_reserve_count = 0; 136 mdsc->caps_reserve_count = 0;
155 caps_min_count = 0; 137 mdsc->caps_min_count = 0;
156 spin_unlock(&caps_list_lock); 138 spin_unlock(&mdsc->caps_list_lock);
157} 139}
158 140
159void ceph_adjust_min_caps(int delta) 141void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
160{ 142{
161 spin_lock(&caps_list_lock); 143 spin_lock(&mdsc->caps_list_lock);
162 caps_min_count += delta; 144 mdsc->caps_min_count += delta;
163 BUG_ON(caps_min_count < 0); 145 BUG_ON(mdsc->caps_min_count < 0);
164 spin_unlock(&caps_list_lock); 146 spin_unlock(&mdsc->caps_list_lock);
165} 147}
166 148
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) 149int ceph_reserve_caps(struct ceph_mds_client *mdsc,
150 struct ceph_cap_reservation *ctx, int need)
168{ 151{
169 int i; 152 int i;
170 struct ceph_cap *cap; 153 struct ceph_cap *cap;
@@ -176,16 +159,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
176 dout("reserve caps ctx=%p need=%d\n", ctx, need); 159 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177 160
178 /* first reserve any caps that are already allocated */ 161 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock); 162 spin_lock(&mdsc->caps_list_lock);
180 if (caps_avail_count >= need) 163 if (mdsc->caps_avail_count >= need)
181 have = need; 164 have = need;
182 else 165 else
183 have = caps_avail_count; 166 have = mdsc->caps_avail_count;
184 caps_avail_count -= have; 167 mdsc->caps_avail_count -= have;
185 caps_reserve_count += have; 168 mdsc->caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 169 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
187 caps_avail_count); 170 mdsc->caps_reserve_count +
188 spin_unlock(&caps_list_lock); 171 mdsc->caps_avail_count);
172 spin_unlock(&mdsc->caps_list_lock);
189 173
190 for (i = have; i < need; i++) { 174 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 175 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
@@ -198,19 +182,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
198 } 182 }
199 BUG_ON(have + alloc != need); 183 BUG_ON(have + alloc != need);
200 184
201 spin_lock(&caps_list_lock); 185 spin_lock(&mdsc->caps_list_lock);
202 caps_total_count += alloc; 186 mdsc->caps_total_count += alloc;
203 caps_reserve_count += alloc; 187 mdsc->caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list); 188 list_splice(&newcaps, &mdsc->caps_list);
205 189
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 190 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
207 caps_avail_count); 191 mdsc->caps_reserve_count +
208 spin_unlock(&caps_list_lock); 192 mdsc->caps_avail_count);
193 spin_unlock(&mdsc->caps_list_lock);
209 194
210 ctx->count = need; 195 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 196 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count, 197 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
213 caps_avail_count); 198 mdsc->caps_reserve_count, mdsc->caps_avail_count);
214 return 0; 199 return 0;
215 200
216out_alloc_count: 201out_alloc_count:
@@ -220,92 +205,104 @@ out_alloc_count:
220 return ret; 205 return ret;
221} 206}
222 207
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) 208int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
209 struct ceph_cap_reservation *ctx)
224{ 210{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 211 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) { 212 if (ctx->count) {
227 spin_lock(&caps_list_lock); 213 spin_lock(&mdsc->caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count); 214 BUG_ON(mdsc->caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count; 215 mdsc->caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count; 216 mdsc->caps_avail_count += ctx->count;
231 ctx->count = 0; 217 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 218 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count, 219 mdsc->caps_total_count, mdsc->caps_use_count,
234 caps_avail_count); 220 mdsc->caps_reserve_count, mdsc->caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 221 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
236 caps_avail_count); 222 mdsc->caps_reserve_count +
237 spin_unlock(&caps_list_lock); 223 mdsc->caps_avail_count);
224 spin_unlock(&mdsc->caps_list_lock);
238 } 225 }
239 return 0; 226 return 0;
240} 227}
241 228
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) 229static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
230 struct ceph_cap_reservation *ctx)
243{ 231{
244 struct ceph_cap *cap = NULL; 232 struct ceph_cap *cap = NULL;
245 233
246 /* temporary, until we do something about cap import/export */ 234 /* temporary, until we do something about cap import/export */
247 if (!ctx) 235 if (!ctx) {
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 236 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
237 if (cap) {
238 mdsc->caps_use_count++;
239 mdsc->caps_total_count++;
240 }
241 return cap;
242 }
249 243
250 spin_lock(&caps_list_lock); 244 spin_lock(&mdsc->caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 245 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count, 246 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
253 caps_reserve_count, caps_avail_count); 247 mdsc->caps_reserve_count, mdsc->caps_avail_count);
254 BUG_ON(!ctx->count); 248 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count); 249 BUG_ON(ctx->count > mdsc->caps_reserve_count);
256 BUG_ON(list_empty(&caps_list)); 250 BUG_ON(list_empty(&mdsc->caps_list));
257 251
258 ctx->count--; 252 ctx->count--;
259 caps_reserve_count--; 253 mdsc->caps_reserve_count--;
260 caps_use_count++; 254 mdsc->caps_use_count++;
261 255
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 256 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item); 257 list_del(&cap->caps_item);
264 258
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 259 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
266 caps_avail_count); 260 mdsc->caps_reserve_count + mdsc->caps_avail_count);
267 spin_unlock(&caps_list_lock); 261 spin_unlock(&mdsc->caps_list_lock);
268 return cap; 262 return cap;
269} 263}
270 264
271void ceph_put_cap(struct ceph_cap *cap) 265void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
272{ 266{
273 spin_lock(&caps_list_lock); 267 spin_lock(&mdsc->caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n", 268 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count, 269 cap, mdsc->caps_total_count, mdsc->caps_use_count,
276 caps_reserve_count, caps_avail_count); 270 mdsc->caps_reserve_count, mdsc->caps_avail_count);
277 caps_use_count--; 271 mdsc->caps_use_count--;
278 /* 272 /*
279 * Keep some preallocated caps around (ceph_min_count), to 273 * Keep some preallocated caps around (ceph_min_count), to
280 * avoid lots of free/alloc churn. 274 * avoid lots of free/alloc churn.
281 */ 275 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) { 276 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
283 caps_total_count--; 277 mdsc->caps_min_count) {
278 mdsc->caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap); 279 kmem_cache_free(ceph_cap_cachep, cap);
285 } else { 280 } else {
286 caps_avail_count++; 281 mdsc->caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list); 282 list_add(&cap->caps_item, &mdsc->caps_list);
288 } 283 }
289 284
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 285 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
291 caps_avail_count); 286 mdsc->caps_reserve_count + mdsc->caps_avail_count);
292 spin_unlock(&caps_list_lock); 287 spin_unlock(&mdsc->caps_list_lock);
293} 288}
294 289
295void ceph_reservation_status(struct ceph_client *client, 290void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved, 291 int *total, int *avail, int *used, int *reserved,
297 int *min) 292 int *min)
298{ 293{
294 struct ceph_mds_client *mdsc = &client->mdsc;
295
299 if (total) 296 if (total)
300 *total = caps_total_count; 297 *total = mdsc->caps_total_count;
301 if (avail) 298 if (avail)
302 *avail = caps_avail_count; 299 *avail = mdsc->caps_avail_count;
303 if (used) 300 if (used)
304 *used = caps_use_count; 301 *used = mdsc->caps_use_count;
305 if (reserved) 302 if (reserved)
306 *reserved = caps_reserve_count; 303 *reserved = mdsc->caps_reserve_count;
307 if (min) 304 if (min)
308 *min = caps_min_count; 305 *min = mdsc->caps_min_count;
309} 306}
310 307
311/* 308/*
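
The caps.c hunks above all serve one refactor: the preallocated-cap pool and its lock stop being file-scope globals and become fields of the owning ceph_mds_client, so independent mounts no longer share (or contend on) a single pool. A compact pthread sketch of the shape of that change; the field names loosely mirror the diff, the types are simplified:

#include <pthread.h>
#include <stdio.h>

struct cap_pool {
	pthread_mutex_t lock;		/* was the global caps_list_lock */
	int total, in_use, reserved, avail, min;
};

/* Every former global access now takes the owning pool explicitly. */
static void pool_init(struct cap_pool *p)
{
	pthread_mutex_init(&p->lock, NULL);
	p->total = p->in_use = p->reserved = p->avail = p->min = 0;
}

static void pool_adjust_min(struct cap_pool *p, int delta)
{
	pthread_mutex_lock(&p->lock);
	p->min += delta;		/* mirrors ceph_adjust_min_caps() */
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct cap_pool a, b;		/* two clients, two independent pools */

	pool_init(&a);
	pool_init(&b);
	pool_adjust_min(&a, 100);
	printf("a.min=%d b.min=%d\n", a.min, b.min);	/* 100 0 */
	return 0;
}
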
@@ -330,22 +327,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
330 return NULL; 327 return NULL;
331} 328}
332 329
330struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
331{
332 struct ceph_cap *cap;
333
334 spin_lock(&ci->vfs_inode.i_lock);
335 cap = __get_cap_for_mds(ci, mds);
336 spin_unlock(&ci->vfs_inode.i_lock);
337 return cap;
338}
339
333/* 340/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else 341 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
335 * -1.
336 */ 342 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) 343static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
338{ 344{
339 struct ceph_cap *cap; 345 struct ceph_cap *cap;
340 int mds = -1; 346 int mds = -1;
341 struct rb_node *p; 347 struct rb_node *p;
342 348
343 /* prefer mds with WR|WRBUFFER|EXCL caps */ 349 /* prefer mds with WR|BUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 350 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node); 351 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds; 352 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR | 353 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER | 354 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL)) 355 CEPH_CAP_FILE_EXCL))
@@ -358,7 +362,7 @@ int ceph_get_cap_mds(struct inode *inode)
358{ 362{
359 int mds; 363 int mds;
360 spin_lock(&inode->i_lock); 364 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); 365 mds = __ceph_get_cap_mds(ceph_inode(inode));
362 spin_unlock(&inode->i_lock); 366 spin_unlock(&inode->i_lock);
363 return mds; 367 return mds;
364} 368}
@@ -477,8 +481,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
477 * Each time we receive FILE_CACHE anew, we increment 481 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen. 482 * i_rdcache_gen.
479 */ 483 */
480 if ((issued & CEPH_CAP_FILE_CACHE) && 484 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0) 485 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
482 ci->i_rdcache_gen++; 486 ci->i_rdcache_gen++;
483 487
484 /* 488 /*
@@ -537,7 +541,7 @@ retry:
537 new_cap = NULL; 541 new_cap = NULL;
538 } else { 542 } else {
539 spin_unlock(&inode->i_lock); 543 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation); 544 new_cap = get_cap(mdsc, caps_reservation);
541 if (new_cap == NULL) 545 if (new_cap == NULL)
542 return -ENOMEM; 546 return -ENOMEM;
543 goto retry; 547 goto retry;
@@ -582,6 +586,7 @@ retry:
582 } else { 586 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n", 587 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino); 588 realmino);
589 WARN_ON(!realm);
585 } 590 }
586 } 591 }
587 592
@@ -621,7 +626,7 @@ retry:
621 if (fmode >= 0) 626 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode); 627 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock); 628 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq); 629 wake_up_all(&ci->i_cap_wq);
625 return 0; 630 return 0;
626} 631}
627 632
@@ -809,7 +814,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
809 used |= CEPH_CAP_PIN; 814 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref) 815 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD; 816 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen) 817 if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
813 used |= CEPH_CAP_FILE_CACHE; 818 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref) 819 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR; 820 used |= CEPH_CAP_FILE_WR;
@@ -825,7 +830,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{ 830{
826 int want = 0; 831 int want = 0;
827 int mode; 832 int mode;
828 for (mode = 0; mode < 4; mode++) 833 for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
829 if (ci->i_nr_by_mode[mode]) 834 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode); 835 want |= ceph_caps_for_mode(mode);
831 return want; 836 return want;
@@ -867,7 +872,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
867{ 872{
868 struct ceph_mds_session *session = cap->session; 873 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci; 874 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 875 struct ceph_mds_client *mdsc =
876 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0; 877 int removed = 0;
872 878
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 879 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -894,7 +900,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
894 ci->i_auth_cap = NULL; 900 ci->i_auth_cap = NULL;
895 901
896 if (removed) 902 if (removed)
897 ceph_put_cap(cap); 903 ceph_put_cap(mdsc, cap);
898 904
899 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 905 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
900 struct ceph_snap_realm *realm = ci->i_snap_realm; 906 struct ceph_snap_realm *realm = ci->i_snap_realm;
@@ -937,9 +943,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
937 seq, issue_seq, mseq, follows, size, max_size, 943 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 944 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939 945
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); 946 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
941 if (IS_ERR(msg)) 947 if (!msg)
942 return PTR_ERR(msg); 948 return -ENOMEM;
943 949
944 msg->hdr.tid = cpu_to_le64(flush_tid); 950 msg->hdr.tid = cpu_to_le64(flush_tid);
945 951
@@ -980,6 +986,46 @@ static int send_cap_msg(struct ceph_mds_session *session,
980 return 0; 986 return 0;
981} 987}
982 988
989static void __queue_cap_release(struct ceph_mds_session *session,
990 u64 ino, u64 cap_id, u32 migrate_seq,
991 u32 issue_seq)
992{
993 struct ceph_msg *msg;
994 struct ceph_mds_cap_release *head;
995 struct ceph_mds_cap_item *item;
996
997 spin_lock(&session->s_cap_lock);
998 BUG_ON(!session->s_num_cap_releases);
999 msg = list_first_entry(&session->s_cap_releases,
1000 struct ceph_msg, list_head);
1001
1002 dout(" adding %llx release to mds%d msg %p (%d left)\n",
1003 ino, session->s_mds, msg, session->s_num_cap_releases);
1004
1005 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1006 head = msg->front.iov_base;
1007 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1008 item = msg->front.iov_base + msg->front.iov_len;
1009 item->ino = cpu_to_le64(ino);
1010 item->cap_id = cpu_to_le64(cap_id);
1011 item->migrate_seq = cpu_to_le32(migrate_seq);
1012 item->seq = cpu_to_le32(issue_seq);
1013
1014 session->s_num_cap_releases--;
1015
1016 msg->front.iov_len += sizeof(*item);
1017 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1018 dout(" release msg %p full\n", msg);
1019 list_move_tail(&msg->list_head, &session->s_cap_releases_done);
1020 } else {
1021 dout(" release msg %p at %d/%d (%d)\n", msg,
1022 (int)le32_to_cpu(head->num),
1023 (int)CEPH_CAPS_PER_RELEASE,
1024 (int)msg->front.iov_len);
1025 }
1026 spin_unlock(&session->s_cap_lock);
1027}
1028
983/* 1029/*
984 * Queue cap releases when an inode is dropped from our cache. Since 1030 * Queue cap releases when an inode is dropped from our cache. Since
985 * inode is about to be destroyed, there is no need for i_lock. 1031 * inode is about to be destroyed, there is no need for i_lock.
@@ -993,41 +1039,9 @@ void ceph_queue_caps_release(struct inode *inode)
993 while (p) { 1039 while (p) {
994 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1040 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
995 struct ceph_mds_session *session = cap->session; 1041 struct ceph_mds_session *session = cap->session;
996 struct ceph_msg *msg;
997 struct ceph_mds_cap_release *head;
998 struct ceph_mds_cap_item *item;
999 1042
1000 spin_lock(&session->s_cap_lock); 1043 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1001 BUG_ON(!session->s_num_cap_releases); 1044 cap->mseq, cap->issue_seq);
1002 msg = list_first_entry(&session->s_cap_releases,
1003 struct ceph_msg, list_head);
1004
1005 dout(" adding %p release to mds%d msg %p (%d left)\n",
1006 inode, session->s_mds, msg, session->s_num_cap_releases);
1007
1008 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1009 head = msg->front.iov_base;
1010 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1011 item = msg->front.iov_base + msg->front.iov_len;
1012 item->ino = cpu_to_le64(ceph_ino(inode));
1013 item->cap_id = cpu_to_le64(cap->cap_id);
1014 item->migrate_seq = cpu_to_le32(cap->mseq);
1015 item->seq = cpu_to_le32(cap->issue_seq);
1016
1017 session->s_num_cap_releases--;
1018
1019 msg->front.iov_len += sizeof(*item);
1020 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1021 dout(" release msg %p full\n", msg);
1022 list_move_tail(&msg->list_head,
1023 &session->s_cap_releases_done);
1024 } else {
1025 dout(" release msg %p at %d/%d (%d)\n", msg,
1026 (int)le32_to_cpu(head->num),
1027 (int)CEPH_CAPS_PER_RELEASE,
1028 (int)msg->front.iov_len);
1029 }
1030 spin_unlock(&session->s_cap_lock);
1031 p = rb_next(p); 1045 p = rb_next(p);
1032 __ceph_remove_cap(cap); 1046 __ceph_remove_cap(cap);
1033 } 1047 }
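
Taken together, the two hunks above factor the message-fill logic out of ceph_queue_caps_release() into __queue_cap_release(), which appends one release record to the session's current message and rotates the message onto the done list once it holds CEPH_CAPS_PER_RELEASE entries. A minimal sketch of that fill-then-rotate batching, with a plain array standing in for the msg front buffer and ITEMS_PER_MSG as an invented stand-in constant:

#include <stdio.h>

#define ITEMS_PER_MSG	4	/* stand-in for CEPH_CAPS_PER_RELEASE */

struct msg {
	int num;
	unsigned long items[ITEMS_PER_MSG];
};

/* Returns 1 when the message filled up and should rotate to "done". */
static int queue_release(struct msg *m, unsigned long ino)
{
	m->items[m->num++] = ino;
	return m->num == ITEMS_PER_MSG;
}

int main(void)
{
	struct msg m = { 0 };

	for (unsigned long ino = 1; ino <= 5; ino++) {
		if (queue_release(&m, ino)) {
			printf("msg full at %d items, moving to done list\n",
			       m.num);
			m.num = 0;	/* start filling a fresh message */
		}
	}
	printf("%d item(s) still pending\n", m.num);	/* 1 */
	return 0;
}
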
@@ -1068,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1068 gid_t gid; 1082 gid_t gid;
1069 struct ceph_mds_session *session; 1083 struct ceph_mds_session *session;
1070 u64 xattr_version = 0; 1084 u64 xattr_version = 0;
1085 struct ceph_buffer *xattr_blob = NULL;
1071 int delayed = 0; 1086 int delayed = 0;
1072 u64 flush_tid = 0; 1087 u64 flush_tid = 0;
1073 int i; 1088 int i;
@@ -1128,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1128 for (i = 0; i < CEPH_CAP_BITS; i++) 1143 for (i = 0; i < CEPH_CAP_BITS; i++)
1129 if (flushing & (1 << i)) 1144 if (flushing & (1 << i))
1130 ci->i_cap_flush_tid[i] = flush_tid; 1145 ci->i_cap_flush_tid[i] = flush_tid;
1146
1147 follows = ci->i_head_snapc->seq;
1148 } else {
1149 follows = 0;
1131 } 1150 }
1132 1151
1133 keep = cap->implemented; 1152 keep = cap->implemented;
@@ -1141,14 +1160,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1141 mtime = inode->i_mtime; 1160 mtime = inode->i_mtime;
1142 atime = inode->i_atime; 1161 atime = inode->i_atime;
1143 time_warp_seq = ci->i_time_warp_seq; 1162 time_warp_seq = ci->i_time_warp_seq;
1144 follows = ci->i_snap_realm->cached_context->seq;
1145 uid = inode->i_uid; 1163 uid = inode->i_uid;
1146 gid = inode->i_gid; 1164 gid = inode->i_gid;
1147 mode = inode->i_mode; 1165 mode = inode->i_mode;
1148 1166
1149 if (dropping & CEPH_CAP_XATTR_EXCL) { 1167 if (flushing & CEPH_CAP_XATTR_EXCL) {
1150 __ceph_build_xattrs_blob(ci); 1168 __ceph_build_xattrs_blob(ci);
1151 xattr_version = ci->i_xattrs.version + 1; 1169 xattr_blob = ci->i_xattrs.blob;
1170 xattr_version = ci->i_xattrs.version;
1152 } 1171 }
1153 1172
1154 spin_unlock(&inode->i_lock); 1173 spin_unlock(&inode->i_lock);
@@ -1156,9 +1175,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1156 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1175 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1157 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1176 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1158 size, max_size, &mtime, &atime, time_warp_seq, 1177 size, max_size, &mtime, &atime, time_warp_seq,
1159 uid, gid, mode, 1178 uid, gid, mode, xattr_version, xattr_blob,
1160 xattr_version,
1161 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1162 follows); 1179 follows);
1163 if (ret < 0) { 1180 if (ret < 0) {
1164 dout("error sending cap msg, must requeue %p\n", inode); 1181 dout("error sending cap msg, must requeue %p\n", inode);
@@ -1166,7 +1183,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1166 } 1183 }
1167 1184
1168 if (wake) 1185 if (wake)
1169 wake_up(&ci->i_cap_wq); 1186 wake_up_all(&ci->i_cap_wq);
1170 1187
1171 return delayed; 1188 return delayed;
1172} 1189}
@@ -1178,10 +1195,16 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1178 * asynchronously back to the MDS once sync writes complete and dirty 1195 * asynchronously back to the MDS once sync writes complete and dirty
1179 * data is written out. 1196 * data is written out.
1180 * 1197 *
1198 * Unless @again is true, skip cap_snaps that were already sent to
1199 * the MDS (i.e., during this session).
1200 *
1181 * Called under i_lock. Takes s_mutex as needed. 1201 * Called under i_lock. Takes s_mutex as needed.
1182 */ 1202 */
1183void __ceph_flush_snaps(struct ceph_inode_info *ci, 1203void __ceph_flush_snaps(struct ceph_inode_info *ci,
1184 struct ceph_mds_session **psession) 1204 struct ceph_mds_session **psession,
1205 int again)
1206 __releases(ci->vfs_inode->i_lock)
1207 __acquires(ci->vfs_inode->i_lock)
1185{ 1208{
1186 struct inode *inode = &ci->vfs_inode; 1209 struct inode *inode = &ci->vfs_inode;
1187 int mds; 1210 int mds;
@@ -1208,7 +1231,7 @@ retry:
1208 * pages to be written out. 1231 * pages to be written out.
1209 */ 1232 */
1210 if (capsnap->dirty_pages || capsnap->writing) 1233 if (capsnap->dirty_pages || capsnap->writing)
1211 continue; 1234 break;
1212 1235
1213 /* 1236 /*
1214 * if cap writeback already occurred, we should have dropped 1237 * if cap writeback already occurred, we should have dropped
@@ -1217,7 +1240,20 @@ retry:
1217 BUG_ON(capsnap->dirty == 0); 1240 BUG_ON(capsnap->dirty == 0);
1218 1241
1219 /* pick mds, take s_mutex */ 1242 /* pick mds, take s_mutex */
1220 mds = __ceph_get_cap_mds(ci, &mseq); 1243 if (ci->i_auth_cap == NULL) {
1244 dout("no auth cap (migrating?), doing nothing\n");
1245 goto out;
1246 }
1247
1248 /* only flush each capsnap once */
1249 if (!again && !list_empty(&capsnap->flushing_item)) {
1250 dout("already flushed %p, skipping\n", capsnap);
1251 continue;
1252 }
1253
1254 mds = ci->i_auth_cap->session->s_mds;
1255 mseq = ci->i_auth_cap->mseq;
1256
1221 if (session && session->s_mds != mds) { 1257 if (session && session->s_mds != mds) {
1222 dout("oops, wrong session %p mutex\n", session); 1258 dout("oops, wrong session %p mutex\n", session);
1223 mutex_unlock(&session->s_mutex); 1259 mutex_unlock(&session->s_mutex);
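
The "only flush each capsnap once" test above relies on a standard kernel list idiom: a node initialized with INIT_LIST_HEAD() points at itself, so !list_empty(&capsnap->flushing_item) doubles as "already queued on some list". A freestanding sketch with a minimal doubly-linked list (the real helpers are in <linux/list.h>):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h;
	h->prev = h;
}

static void list_add(struct list_head *n, struct list_head *head)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

static int list_empty(const struct list_head *h)
{
	return h->next == h;
}

int main(void)
{
	struct list_head queue, item;

	INIT_LIST_HEAD(&queue);
	INIT_LIST_HEAD(&item);
	printf("queued=%d\n", !list_empty(&item));	/* 0: not yet */
	list_add(&item, &queue);
	printf("queued=%d\n", !list_empty(&item));	/* 1: on a list */
	return 0;
}
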
@@ -1236,8 +1272,8 @@ retry:
1236 } 1272 }
1237 /* 1273 /*
1238 * if session == NULL, we raced against a cap 1274 * if session == NULL, we raced against a cap
1239 * deletion. retry, and we'll get a better 1275 * deletion or migration. retry, and we'll
1240 * @mds value next time. 1276 * get a better @mds value next time.
1241 */ 1277 */
1242 spin_lock(&inode->i_lock); 1278 spin_lock(&inode->i_lock);
1243 goto retry; 1279 goto retry;
@@ -1251,8 +1287,8 @@ retry:
1251 &session->s_cap_snaps_flushing); 1287 &session->s_cap_snaps_flushing);
1252 spin_unlock(&inode->i_lock); 1288 spin_unlock(&inode->i_lock);
1253 1289
1254 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n", 1290 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
1255 inode, capsnap, next_follows, capsnap->size); 1291 inode, capsnap, capsnap->follows, capsnap->flush_tid);
1256 send_cap_msg(session, ceph_vino(inode).ino, 0, 1292 send_cap_msg(session, ceph_vino(inode).ino, 0,
1257 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, 1293 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1258 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, 1294 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
@@ -1260,7 +1296,7 @@ retry:
1260 &capsnap->mtime, &capsnap->atime, 1296 &capsnap->mtime, &capsnap->atime,
1261 capsnap->time_warp_seq, 1297 capsnap->time_warp_seq,
1262 capsnap->uid, capsnap->gid, capsnap->mode, 1298 capsnap->uid, capsnap->gid, capsnap->mode,
1263 0, NULL, 1299 capsnap->xattr_version, capsnap->xattr_blob,
1264 capsnap->follows); 1300 capsnap->follows);
1265 1301
1266 next_follows = capsnap->follows + 1; 1302 next_follows = capsnap->follows + 1;
@@ -1275,6 +1311,7 @@ retry:
1275 list_del_init(&ci->i_snap_flush_item); 1311 list_del_init(&ci->i_snap_flush_item);
1276 spin_unlock(&mdsc->snap_flush_lock); 1312 spin_unlock(&mdsc->snap_flush_lock);
1277 1313
1314out:
1278 if (psession) 1315 if (psession)
1279 *psession = session; 1316 *psession = session;
1280 else if (session) { 1317 else if (session) {
@@ -1288,7 +1325,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1288 struct inode *inode = &ci->vfs_inode; 1325 struct inode *inode = &ci->vfs_inode;
1289 1326
1290 spin_lock(&inode->i_lock); 1327 spin_lock(&inode->i_lock);
1291 __ceph_flush_snaps(ci, NULL); 1328 __ceph_flush_snaps(ci, NULL, 0);
1292 spin_unlock(&inode->i_lock); 1329 spin_unlock(&inode->i_lock);
1293} 1330}
1294 1331
@@ -1298,7 +1335,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1298 */ 1335 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1336void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{ 1337{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 1338 struct ceph_mds_client *mdsc =
1339 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode; 1340 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps; 1341 int was = ci->i_dirty_caps;
1304 int dirty = 0; 1342 int dirty = 0;
@@ -1308,7 +1346,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1308 ceph_cap_string(was | mask)); 1346 ceph_cap_string(was | mask));
1309 ci->i_dirty_caps |= mask; 1347 ci->i_dirty_caps |= mask;
1310 if (was == 0) { 1348 if (was == 0) {
1311 dout(" inode %p now dirty\n", &ci->vfs_inode); 1349 if (!ci->i_head_snapc)
1350 ci->i_head_snapc = ceph_get_snap_context(
1351 ci->i_snap_realm->cached_context);
1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
1353 ci->i_head_snapc);
1312 BUG_ON(!list_empty(&ci->i_dirty_item)); 1354 BUG_ON(!list_empty(&ci->i_dirty_item));
1313 spin_lock(&mdsc->cap_dirty_lock); 1355 spin_lock(&mdsc->cap_dirty_lock);
1314 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1336,7 +1378,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1336static int __mark_caps_flushing(struct inode *inode, 1378static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session) 1379 struct ceph_mds_session *session)
1338{ 1380{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1381 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode); 1382 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing; 1383 int flushing;
1342 1384
@@ -1419,7 +1461,6 @@ static int try_nonblocking_invalidate(struct inode *inode)
1419 */ 1461 */
1420void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1462void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1421 struct ceph_mds_session *session) 1463 struct ceph_mds_session *session)
1422 __releases(session->s_mutex)
1423{ 1464{
1424 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); 1465 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1425 struct ceph_mds_client *mdsc = &client->mdsc; 1466 struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1447,7 +1488,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1447 1488
1448 /* flush snaps first time around only */ 1489 /* flush snaps first time around only */
1449 if (!list_empty(&ci->i_cap_snaps)) 1490 if (!list_empty(&ci->i_cap_snaps))
1450 __ceph_flush_snaps(ci, &session); 1491 __ceph_flush_snaps(ci, &session, 0);
1451 goto retry_locked; 1492 goto retry_locked;
1452retry: 1493retry:
1453 spin_lock(&inode->i_lock); 1494 spin_lock(&inode->i_lock);
@@ -1494,11 +1535,13 @@ retry_locked:
1494 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1535 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1495 ci->i_rdcache_gen && /* may have cached pages */ 1536 ci->i_rdcache_gen && /* may have cached pages */
1496 (file_wanted == 0 || /* no open files */ 1537 (file_wanted == 0 || /* no open files */
1497 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */ 1538 (revoking & (CEPH_CAP_FILE_CACHE|
1539 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
1498 !tried_invalidate) { 1540 !tried_invalidate) {
1499 dout("check_caps trying to invalidate on %p\n", inode); 1541 dout("check_caps trying to invalidate on %p\n", inode);
1500 if (try_nonblocking_invalidate(inode) < 0) { 1542 if (try_nonblocking_invalidate(inode) < 0) {
1501 if (revoking & CEPH_CAP_FILE_CACHE) { 1543 if (revoking & (CEPH_CAP_FILE_CACHE|
1544 CEPH_CAP_FILE_LAZYIO)) {
1502 dout("check_caps queuing invalidate\n"); 1545 dout("check_caps queuing invalidate\n");
1503 queue_invalidate = 1; 1546 queue_invalidate = 1;
1504 ci->i_rdcache_revoking = ci->i_rdcache_gen; 1547 ci->i_rdcache_revoking = ci->i_rdcache_gen;
@@ -1663,7 +1706,7 @@ ack:
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1706static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid) 1707 unsigned *flush_tid)
1665{ 1708{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1709 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode); 1710 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1; 1711 int unlock_session = session ? 0 : 1;
1669 int flushing = 0; 1712 int flushing = 0;
@@ -1716,10 +1759,9 @@ out_unlocked:
1716static int caps_are_flushed(struct inode *inode, unsigned tid) 1759static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{ 1760{
1718 struct ceph_inode_info *ci = ceph_inode(inode); 1761 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1; 1762 int i, ret = 1;
1720 1763
1721 spin_lock(&inode->i_lock); 1764 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++) 1765 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) && 1766 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) { 1767 ci->i_cap_flush_tid[i] <= tid) {
@@ -1775,9 +1817,9 @@ out:
1775 spin_unlock(&ci->i_unsafe_lock); 1817 spin_unlock(&ci->i_unsafe_lock);
1776} 1818}
1777 1819
1778int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) 1820int ceph_fsync(struct file *file, int datasync)
1779{ 1821{
1780 struct inode *inode = dentry->d_inode; 1822 struct inode *inode = file->f_mapping->host;
1781 struct ceph_inode_info *ci = ceph_inode(inode); 1823 struct ceph_inode_info *ci = ceph_inode(inode);
1782 unsigned flush_tid; 1824 unsigned flush_tid;
1783 int ret; 1825 int ret;
@@ -1829,7 +1871,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1829 err = wait_event_interruptible(ci->i_cap_wq, 1871 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid)); 1872 caps_are_flushed(inode, flush_tid));
1831 } else { 1873 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1874 struct ceph_mds_client *mdsc =
1875 &ceph_sb_to_client(inode->i_sb)->mdsc;
1833 1876
1834 spin_lock(&inode->i_lock); 1877 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci)) 1878 if (__ceph_caps_dirty(ci))
@@ -1862,7 +1905,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1862 if (cap && cap->session == session) { 1905 if (cap && cap->session == session) {
1863 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, 1906 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1864 cap, capsnap); 1907 cap, capsnap);
1865 __ceph_flush_snaps(ci, &session); 1908 __ceph_flush_snaps(ci, &session, 1);
1866 } else { 1909 } else {
1867 pr_err("%p auth cap %p not mds%d ???\n", inode, 1910 pr_err("%p auth cap %p not mds%d ???\n", inode,
1868 cap, session->s_mds); 1911 cap, session->s_mds);
@@ -2137,7 +2180,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2137 else if (flushsnaps) 2180 else if (flushsnaps)
2138 ceph_flush_snaps(ci); 2181 ceph_flush_snaps(ci);
2139 if (wake) 2182 if (wake)
2140 wake_up(&ci->i_cap_wq); 2183 wake_up_all(&ci->i_cap_wq);
2141 if (put) 2184 if (put)
2142 iput(inode); 2185 iput(inode);
2143} 2186}
@@ -2165,7 +2208,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2165 2208
2166 if (ci->i_head_snapc == snapc) { 2209 if (ci->i_head_snapc == snapc) {
2167 ci->i_wrbuffer_ref_head -= nr; 2210 ci->i_wrbuffer_ref_head -= nr;
2168 if (!ci->i_wrbuffer_ref_head) { 2211 if (ci->i_wrbuffer_ref_head == 0 &&
2212 ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
2213 BUG_ON(!ci->i_head_snapc);
2169 ceph_put_snap_context(ci->i_head_snapc); 2214 ceph_put_snap_context(ci->i_head_snapc);
2170 ci->i_head_snapc = NULL; 2215 ci->i_head_snapc = NULL;
2171 } 2216 }
@@ -2213,7 +2258,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2213 iput(inode); 2258 iput(inode);
2214 } else if (complete_capsnap) { 2259 } else if (complete_capsnap) {
2215 ceph_flush_snaps(ci); 2260 ceph_flush_snaps(ci);
2216 wake_up(&ci->i_cap_wq); 2261 wake_up_all(&ci->i_cap_wq);
2217 } 2262 }
2218 if (drop_capsnap) 2263 if (drop_capsnap)
2219 iput(inode); 2264 iput(inode);
@@ -2234,12 +2279,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2234 struct ceph_mds_session *session, 2279 struct ceph_mds_session *session,
2235 struct ceph_cap *cap, 2280 struct ceph_cap *cap,
2236 struct ceph_buffer *xattr_buf) 2281 struct ceph_buffer *xattr_buf)
2237 __releases(inode->i_lock) 2282 __releases(inode->i_lock)
2238 __releases(session->s_mutex)
2239{ 2283{
2240 struct ceph_inode_info *ci = ceph_inode(inode); 2284 struct ceph_inode_info *ci = ceph_inode(inode);
2241 int mds = session->s_mds; 2285 int mds = session->s_mds;
2242 int seq = le32_to_cpu(grant->seq); 2286 unsigned seq = le32_to_cpu(grant->seq);
2287 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2243 int newcaps = le32_to_cpu(grant->caps); 2288 int newcaps = le32_to_cpu(grant->caps);
2244 int issued, implemented, used, wanted, dirty; 2289 int issued, implemented, used, wanted, dirty;
2245 u64 size = le64_to_cpu(grant->size); 2290 u64 size = le64_to_cpu(grant->size);
@@ -2251,8 +2296,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2251 int revoked_rdcache = 0; 2296 int revoked_rdcache = 0;
2252 int queue_invalidate = 0; 2297 int queue_invalidate = 0;
2253 2298
2254 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2299 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
2255 inode, cap, mds, seq, ceph_cap_string(newcaps)); 2300 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
2256 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2301 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2257 inode->i_size); 2302 inode->i_size);
2258 2303
@@ -2262,6 +2307,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2262 * will invalidate _after_ writeback.) 2307 * will invalidate _after_ writeback.)
2263 */ 2308 */
2264 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && 2309 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2310 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2265 !ci->i_wrbuffer_ref) { 2311 !ci->i_wrbuffer_ref) {
2266 if (try_nonblocking_invalidate(inode) == 0) { 2312 if (try_nonblocking_invalidate(inode) == 0) {
2267 revoked_rdcache = 1; 2313 revoked_rdcache = 1;
@@ -2347,21 +2393,29 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2347 } 2393 }
2348 2394
2349 cap->seq = seq; 2395 cap->seq = seq;
2396 cap->issue_seq = issue_seq;
2350 2397
2351 /* file layout may have changed */ 2398 /* file layout may have changed */
2352 ci->i_layout = grant->layout; 2399 ci->i_layout = grant->layout;
2353 2400
2354 /* revocation, grant, or no-op? */ 2401 /* revocation, grant, or no-op? */
2355 if (cap->issued & ~newcaps) { 2402 if (cap->issued & ~newcaps) {
2356 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued), 2403 int revoking = cap->issued & ~newcaps;
2357 ceph_cap_string(newcaps)); 2404
2358 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) 2405 dout("revocation: %s -> %s (revoking %s)\n",
2359 writeback = 1; /* will delay ack */ 2406 ceph_cap_string(cap->issued),
2360 else if (dirty & ~newcaps) 2407 ceph_cap_string(newcaps),
2361 check_caps = 1; /* initiate writeback in check_caps */ 2408 ceph_cap_string(revoking));
2362 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || 2409 if (revoking & used & CEPH_CAP_FILE_BUFFER)
2363 revoked_rdcache) 2410 writeback = 1; /* initiate writeback; will delay ack */
2364 check_caps = 2; /* send revoke ack in check_caps */ 2411 else if (revoking == CEPH_CAP_FILE_CACHE &&
2412 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2413 queue_invalidate)
2414 ; /* do nothing yet, invalidation will be queued */
2415 else if (cap == ci->i_auth_cap)
2416 check_caps = 1; /* check auth cap only */
2417 else
2418 check_caps = 2; /* check all caps */
2365 cap->issued = newcaps; 2419 cap->issued = newcaps;
2366 cap->implemented |= newcaps; 2420 cap->implemented |= newcaps;
2367 } else if (cap->issued == newcaps) { 2421 } else if (cap->issued == newcaps) {
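
The rewritten revocation branch above resolves to one of four reactions: start writeback when dirty buffers are held under a revoked cap, do nothing when an invalidate is already queued, or acknowledge via a cap check (auth cap only, or all caps). A compressed sketch of that decision as a pure function; the CAP_* constants and the enum are invented stand-ins for the CEPH_CAP_* bits:

#include <stdio.h>

#define CAP_CACHE	0x1
#define CAP_BUFFER	0x2
#define CAP_LAZYIO	0x4

enum action { WRITEBACK, WAIT_INVALIDATE, CHECK_AUTH, CHECK_ALL };

static enum action on_revoke(int revoking, int used, int newcaps,
			     int queued_invalidate, int is_auth_cap)
{
	if (revoking & used & CAP_BUFFER)
		return WRITEBACK;	/* flush dirty buffers, delay the ack */
	if (revoking == CAP_CACHE && !(newcaps & CAP_LAZYIO) &&
	    queued_invalidate)
		return WAIT_INVALIDATE;	/* invalidation already queued */
	return is_auth_cap ? CHECK_AUTH : CHECK_ALL;
}

int main(void)
{
	static const char * const names[] = {
		"writeback", "wait-invalidate", "check-auth", "check-all"
	};

	/* dirty buffers under a revoked BUFFER cap -> writeback first */
	printf("%s\n", names[on_revoke(CAP_BUFFER, CAP_BUFFER, 0, 0, 1)]);
	return 0;
}
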
@@ -2389,7 +2443,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2389 if (queue_invalidate) 2443 if (queue_invalidate)
2390 ceph_queue_invalidate(inode); 2444 ceph_queue_invalidate(inode);
2391 if (wake) 2445 if (wake)
2392 wake_up(&ci->i_cap_wq); 2446 wake_up_all(&ci->i_cap_wq);
2393 2447
2394 if (check_caps == 1) 2448 if (check_caps == 1)
2395 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, 2449 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
@@ -2411,7 +2465,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2411 __releases(inode->i_lock) 2465 __releases(inode->i_lock)
2412{ 2466{
2413 struct ceph_inode_info *ci = ceph_inode(inode); 2467 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 2468 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq); 2469 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty); 2470 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0; 2471 int cleaned = 0;
@@ -2444,19 +2498,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2444 struct ceph_inode_info, 2498 struct ceph_inode_info,
2445 i_flushing_item)->vfs_inode); 2499 i_flushing_item)->vfs_inode);
2446 mdsc->num_cap_flushing--; 2500 mdsc->num_cap_flushing--;
2447 wake_up(&mdsc->cap_flushing_wq); 2501 wake_up_all(&mdsc->cap_flushing_wq);
2448 dout(" inode %p now !flushing\n", inode); 2502 dout(" inode %p now !flushing\n", inode);
2449 2503
2450 if (ci->i_dirty_caps == 0) { 2504 if (ci->i_dirty_caps == 0) {
2451 dout(" inode %p now clean\n", inode); 2505 dout(" inode %p now clean\n", inode);
2452 BUG_ON(!list_empty(&ci->i_dirty_item)); 2506 BUG_ON(!list_empty(&ci->i_dirty_item));
2453 drop = 1; 2507 drop = 1;
2508 if (ci->i_wrbuffer_ref_head == 0) {
2509 BUG_ON(!ci->i_head_snapc);
2510 ceph_put_snap_context(ci->i_head_snapc);
2511 ci->i_head_snapc = NULL;
2512 }
2454 } else { 2513 } else {
2455 BUG_ON(list_empty(&ci->i_dirty_item)); 2514 BUG_ON(list_empty(&ci->i_dirty_item));
2456 } 2515 }
2457 } 2516 }
2458 spin_unlock(&mdsc->cap_dirty_lock); 2517 spin_unlock(&mdsc->cap_dirty_lock);
2459 wake_up(&ci->i_cap_wq); 2518 wake_up_all(&ci->i_cap_wq);
2460 2519
2461out: 2520out:
2462 spin_unlock(&inode->i_lock); 2521 spin_unlock(&inode->i_lock);
@@ -2552,7 +2611,8 @@ static void handle_cap_trunc(struct inode *inode,
2552 * caller holds s_mutex 2611 * caller holds s_mutex
2553 */ 2612 */
2554static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2613static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2555 struct ceph_mds_session *session) 2614 struct ceph_mds_session *session,
2615 int *open_target_sessions)
2556{ 2616{
2557 struct ceph_inode_info *ci = ceph_inode(inode); 2617 struct ceph_inode_info *ci = ceph_inode(inode);
2558 int mds = session->s_mds; 2618 int mds = session->s_mds;
@@ -2584,6 +2644,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2584 ci->i_cap_exporting_mds = mds; 2644 ci->i_cap_exporting_mds = mds;
2585 ci->i_cap_exporting_mseq = mseq; 2645 ci->i_cap_exporting_mseq = mseq;
2586 ci->i_cap_exporting_issued = cap->issued; 2646 ci->i_cap_exporting_issued = cap->issued;
2647
2648 /*
2649 * make sure we have open sessions with all possible
2650 * export targets, so that we get the matching IMPORT
2651 */
2652 *open_target_sessions = 1;
2587 } 2653 }
2588 __ceph_remove_cap(cap); 2654 __ceph_remove_cap(cap);
2589 } 2655 }
@@ -2653,12 +2719,16 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2653 struct ceph_mds_caps *h; 2719 struct ceph_mds_caps *h;
2654 int mds = session->s_mds; 2720 int mds = session->s_mds;
2655 int op; 2721 int op;
2656 u32 seq; 2722 u32 seq, mseq;
2657 struct ceph_vino vino; 2723 struct ceph_vino vino;
2658 u64 cap_id; 2724 u64 cap_id;
2659 u64 size, max_size; 2725 u64 size, max_size;
2660 u64 tid; 2726 u64 tid;
2661 void *snaptrace; 2727 void *snaptrace;
2728 size_t snaptrace_len;
2729 void *flock;
2730 u32 flock_len;
2731 int open_target_sessions = 0;
2662 2732
2663 dout("handle_caps from mds%d\n", mds); 2733 dout("handle_caps from mds%d\n", mds);
2664 2734
@@ -2667,15 +2737,30 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2667 if (msg->front.iov_len < sizeof(*h)) 2737 if (msg->front.iov_len < sizeof(*h))
2668 goto bad; 2738 goto bad;
2669 h = msg->front.iov_base; 2739 h = msg->front.iov_base;
2670 snaptrace = h + 1;
2671 op = le32_to_cpu(h->op); 2740 op = le32_to_cpu(h->op);
2672 vino.ino = le64_to_cpu(h->ino); 2741 vino.ino = le64_to_cpu(h->ino);
2673 vino.snap = CEPH_NOSNAP; 2742 vino.snap = CEPH_NOSNAP;
2674 cap_id = le64_to_cpu(h->cap_id); 2743 cap_id = le64_to_cpu(h->cap_id);
2675 seq = le32_to_cpu(h->seq); 2744 seq = le32_to_cpu(h->seq);
2745 mseq = le32_to_cpu(h->migrate_seq);
2676 size = le64_to_cpu(h->size); 2746 size = le64_to_cpu(h->size);
2677 max_size = le64_to_cpu(h->max_size); 2747 max_size = le64_to_cpu(h->max_size);
2678 2748
2749 snaptrace = h + 1;
2750 snaptrace_len = le32_to_cpu(h->snap_trace_len);
2751
2752 if (le16_to_cpu(msg->hdr.version) >= 2) {
2753 void *p, *end;
2754
2755 p = snaptrace + snaptrace_len;
2756 end = msg->front.iov_base + msg->front.iov_len;
2757 ceph_decode_32_safe(&p, end, flock_len, bad);
2758 flock = p;
2759 } else {
2760 flock = NULL;
2761 flock_len = 0;
2762 }
2763
2679 mutex_lock(&session->s_mutex); 2764 mutex_lock(&session->s_mutex);
2680 session->s_seq++; 2765 session->s_seq++;
2681 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 2766 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
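Note on the decode hunk above: version 2 of the cap message appends an optional flock blob after the snap trace, so the handler must bounds-check the new length field before trusting it (ceph_decode_32_safe bails to the "bad" label when the message is short). A minimal user-space sketch of that decode pattern, with hypothetical helper names and a little-endian host assumed:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Refuse to read a 32-bit length unless the buffer still has room
     * for it, then bound the payload that length describes. */
    static int decode_32_safe(const uint8_t **p, const uint8_t *end,
                              uint32_t *v)
    {
            if (end - *p < 4)
                    return -1;              /* would run off the message */
            memcpy(v, *p, sizeof(*v));      /* models le32_to_cpu on LE hosts */
            *p += sizeof(*v);
            return 0;
    }

    int main(void)
    {
            uint8_t msg[16] = { 0 };
            const uint8_t *p = msg, *end = msg + sizeof(msg);
            uint32_t flock_len;

            p += 8;                         /* skip a pretend snap trace */
            if (decode_32_safe(&p, end, &flock_len) < 0 ||
                (size_t)(end - p) < flock_len) {
                    fprintf(stderr, "bad cap message\n");
                    return 1;
            }
            printf("flock blob: %u bytes\n", flock_len);
            return 0;
    }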
@@ -2687,7 +2772,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2687 vino.snap, inode); 2772 vino.snap, inode);
2688 if (!inode) { 2773 if (!inode) {
2689 dout(" i don't have ino %llx\n", vino.ino); 2774 dout(" i don't have ino %llx\n", vino.ino);
2690 goto done; 2775
2776 if (op == CEPH_CAP_OP_IMPORT)
2777 __queue_cap_release(session, vino.ino, cap_id,
2778 mseq, seq);
2779 goto flush_cap_releases;
2691 } 2780 }
2692 2781
2693 /* these will work even if we don't have a cap yet */ 2782 /* these will work even if we don't have a cap yet */
@@ -2697,12 +2786,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2697 goto done; 2786 goto done;
2698 2787
2699 case CEPH_CAP_OP_EXPORT: 2788 case CEPH_CAP_OP_EXPORT:
2700 handle_cap_export(inode, h, session); 2789 handle_cap_export(inode, h, session, &open_target_sessions);
2701 goto done; 2790 goto done;
2702 2791
2703 case CEPH_CAP_OP_IMPORT: 2792 case CEPH_CAP_OP_IMPORT:
2704 handle_cap_import(mdsc, inode, h, session, 2793 handle_cap_import(mdsc, inode, h, session,
2705 snaptrace, le32_to_cpu(h->snap_trace_len)); 2794 snaptrace, snaptrace_len);
2706 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2795 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2707 session); 2796 session);
2708 goto done_unlocked; 2797 goto done_unlocked;
@@ -2712,10 +2801,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2712 spin_lock(&inode->i_lock); 2801 spin_lock(&inode->i_lock);
2713 cap = __get_cap_for_mds(ceph_inode(inode), mds); 2802 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2714 if (!cap) { 2803 if (!cap) {
2715 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n", 2804 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2716 inode, ceph_ino(inode), ceph_snap(inode), mds); 2805 inode, ceph_ino(inode), ceph_snap(inode), mds);
2717 spin_unlock(&inode->i_lock); 2806 spin_unlock(&inode->i_lock);
2718 goto done; 2807 goto flush_cap_releases;
2719 } 2808 }
2720 2809
2721 /* note that each of these drops i_lock for us */ 2810 /* note that each of these drops i_lock for us */
@@ -2739,11 +2828,24 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2739 ceph_cap_op_name(op)); 2828 ceph_cap_op_name(op));
2740 } 2829 }
2741 2830
2831 goto done;
2832
2833flush_cap_releases:
2834 /*
2835 * send any full release message to try to move things
2836 * along for the mds (who clearly thinks we still have this
2837 * cap).
2838 */
2839 ceph_add_cap_releases(mdsc, session);
2840 ceph_send_cap_releases(mdsc, session);
2841
2742done: 2842done:
2743 mutex_unlock(&session->s_mutex); 2843 mutex_unlock(&session->s_mutex);
2744done_unlocked: 2844done_unlocked:
2745 if (inode) 2845 if (inode)
2746 iput(inode); 2846 iput(inode);
2847 if (open_target_sessions)
2848 ceph_mdsc_open_export_target_sessions(mdsc, session);
2747 return; 2849 return;
2748 2850
2749bad: 2851bad:
@@ -2863,18 +2965,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
2863 struct ceph_inode_info *ci = ceph_inode(inode); 2965 struct ceph_inode_info *ci = ceph_inode(inode);
2864 struct ceph_cap *cap; 2966 struct ceph_cap *cap;
2865 struct ceph_mds_request_release *rel = *p; 2967 struct ceph_mds_request_release *rel = *p;
2968 int used, dirty;
2866 int ret = 0; 2969 int ret = 0;
2867 int used = 0;
2868 2970
2869 spin_lock(&inode->i_lock); 2971 spin_lock(&inode->i_lock);
2870 used = __ceph_caps_used(ci); 2972 used = __ceph_caps_used(ci);
2973 dirty = __ceph_caps_dirty(ci);
2871 2974
2872 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode, 2975 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
2873 mds, ceph_cap_string(used), ceph_cap_string(drop), 2976 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
2874 ceph_cap_string(unless)); 2977 ceph_cap_string(unless));
2875 2978
2876 /* only drop unused caps */ 2979 /* only drop unused, clean caps */
2877 drop &= ~used; 2980 drop &= ~(used | dirty);
2878 2981
2879 cap = __get_cap_for_mds(ci, mds); 2982 cap = __get_cap_for_mds(ci, mds);
2880 if (cap && __cap_is_valid(cap)) { 2983 if (cap && __cap_is_valid(cap)) {
@@ -2954,6 +3057,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2954 memcpy(*p, dentry->d_name.name, dentry->d_name.len); 3057 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2955 *p += dentry->d_name.len; 3058 *p += dentry->d_name.len;
2956 rel->dname_seq = cpu_to_le32(di->lease_seq); 3059 rel->dname_seq = cpu_to_le32(di->lease_seq);
3060 __ceph_mdsc_drop_dentry_lease(dentry);
2957 } 3061 }
2958 spin_unlock(&dentry->d_lock); 3062 spin_unlock(&dentry->d_lock);
2959 return ret; 3063 return ret;
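Note on the ceph_encode_inode_release() hunk above: a cap may now be dropped only if it is neither in active use nor dirty, so the drop mask is filtered by both sets instead of just "used". A tiny illustrative calculation (the cap bit values are invented for the example):

    #include <stdio.h>

    int main(void)
    {
            unsigned drop  = 0x30;  /* caps the MDS asked us to drop */
            unsigned used  = 0x10;  /* caps backing open files */
            unsigned dirty = 0x20;  /* caps with unflushed metadata */

            unsigned before = drop & ~used;            /* 0x20: drops a dirty cap */
            unsigned after  = drop & ~(used | dirty);  /* 0x00: keeps it */

            printf("old mask 0x%x, new mask 0x%x\n", before, after);
            return 0;
    }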
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
index 793f50cb7c22..5babb8e95352 100644
--- a/fs/ceph/ceph_frag.h
+++ b/fs/ceph/ceph_frag.h
@@ -1,5 +1,5 @@
1#ifndef _FS_CEPH_FRAG_H 1#ifndef FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H 2#define FS_CEPH_FRAG_H
3 3
4/* 4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space, 5 * "Frags" are a way to describe a subset of a 32-bit number space,
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
index 79d76bc4303f..3ac6cc7c1156 100644
--- a/fs/ceph/ceph_fs.c
+++ b/fs/ceph/ceph_fs.c
@@ -29,46 +29,44 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
29 29
30int ceph_flags_to_mode(int flags) 30int ceph_flags_to_mode(int flags)
31{ 31{
32 int mode;
33
32#ifdef O_DIRECTORY /* fixme */ 34#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY) 35 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN; 36 return CEPH_FILE_MODE_PIN;
35#endif 37#endif
38 if ((flags & O_APPEND) == O_APPEND)
39 flags |= O_WRONLY;
40
41 if ((flags & O_ACCMODE) == O_RDWR)
42 mode = CEPH_FILE_MODE_RDWR;
43 else if ((flags & O_ACCMODE) == O_WRONLY)
44 mode = CEPH_FILE_MODE_WR;
45 else
46 mode = CEPH_FILE_MODE_RD;
47
36#ifdef O_LAZY 48#ifdef O_LAZY
37 if (flags & O_LAZY) 49 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY; 50 mode |= CEPH_FILE_MODE_LAZY;
39#endif 51#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42 52
43 flags &= O_ACCMODE; 53 return mode;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49} 54}
50 55
51int ceph_caps_for_mode(int mode) 56int ceph_caps_for_mode(int mode)
52{ 57{
53 switch (mode) { 58 int caps = CEPH_CAP_PIN;
54 case CEPH_FILE_MODE_PIN: 59
55 return CEPH_CAP_PIN; 60 if (mode & CEPH_FILE_MODE_RD)
56 case CEPH_FILE_MODE_RD: 61 caps |= CEPH_CAP_FILE_SHARED |
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; 62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR: 63 if (mode & CEPH_FILE_MODE_WR)
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | 64 caps |= CEPH_CAP_FILE_EXCL |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | 65 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | 66 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; 67 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 } 68 if (mode & CEPH_FILE_MODE_LAZY)
73 return 0; 69 caps |= CEPH_CAP_FILE_LAZYIO;
70
71 return caps;
74} 72}
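The rewrite of ceph_flags_to_mode()/ceph_caps_for_mode() above turns the file modes into composable bit flags: RDWR is literally RD|WR, and LAZY stacks on top instead of being its own switch case, which is what lets ceph_caps_for_mode() OR in capability groups per bit. A user-space sketch of the open-flag mapping; the WR and LAZY values are assumptions, since only PIN=0 and RD=1 are visible in this patch:

    #include <fcntl.h>
    #include <stdio.h>

    #define MODE_PIN  0
    #define MODE_RD   1
    #define MODE_WR   2     /* assumed value */
    #define MODE_LAZY 4     /* assumed value */

    static int flags_to_mode(int flags)
    {
            int mode;

            if ((flags & O_APPEND) == O_APPEND)
                    flags |= O_WRONLY;      /* append implies write intent */

            if ((flags & O_ACCMODE) == O_RDWR)
                    mode = MODE_RD | MODE_WR;
            else if ((flags & O_ACCMODE) == O_WRONLY)
                    mode = MODE_WR;
            else
                    mode = MODE_RD;

            return mode;
    }

    int main(void)
    {
            printf("O_RDWR   -> mode %d\n", flags_to_mode(O_RDWR));   /* 3 */
            printf("O_APPEND -> mode %d\n", flags_to_mode(O_APPEND)); /* 2 */
            return 0;
    }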
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..d5619ac86711 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -9,34 +9,20 @@
9 * LGPL2 9 * LGPL2
10 */ 10 */
11 11
12#ifndef _FS_CEPH_CEPH_FS_H 12#ifndef CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H 13#define CEPH_FS_H
14 14
15#include "msgr.h" 15#include "msgr.h"
16#include "rados.h" 16#include "rados.h"
17 17
18/* 18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific messages types or high-level 19 * subprotocol versions. when specific messages types or high-level
34 * protocols change, bump the affected components. we keep rev 20 * protocols change, bump the affected components. we keep rev
35 * internal cluster protocols separately from the public, 21 * internal cluster protocols separately from the public,
36 * client-facing protocol. 22 * client-facing protocol.
37 */ 23 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ 24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ 25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */ 26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 27#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 28#define CEPH_MDSC_PROTOCOL 32 /* server/client */
@@ -53,8 +39,10 @@
53/* 39/*
54 * feature bits 40 * feature bits
55 */ 41 */
56#define CEPH_FEATURE_SUPPORTED 0 42#define CEPH_FEATURE_UID (1<<0)
57#define CEPH_FEATURE_REQUIRED 0 43#define CEPH_FEATURE_NOSRCADDR (1<<1)
44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
45#define CEPH_FEATURE_FLOCK (1<<3)
58 46
59 47
60/* 48/*
@@ -86,11 +74,15 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
86#define CEPH_CRYPTO_NONE 0x0 74#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1 75#define CEPH_CRYPTO_AES 0x1
88 76
77#define CEPH_AES_IV "cephsageyudagreg"
78
89/* security/authentication protocols */ 79/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0 80#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1 81#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2 82#define CEPH_AUTH_CEPHX 0x2
93 83
84#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
85
94 86
95/********************************************* 87/*********************************************
96 * message layer 88 * message layer
@@ -128,11 +120,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
128#define CEPH_MSG_CLIENT_SNAP 0x312 120#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 121#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130 122
123/* pool ops */
124#define CEPH_MSG_POOLOP_REPLY 48
125#define CEPH_MSG_POOLOP 49
126
127
131/* osd */ 128/* osd */
132#define CEPH_MSG_OSD_MAP 41 129#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42 130#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43 131#define CEPH_MSG_OSD_OPREPLY 43
135 132
133/* pool operations */
134enum {
135 POOL_OP_CREATE = 0x01,
136 POOL_OP_DELETE = 0x02,
137 POOL_OP_AUID_CHANGE = 0x03,
138 POOL_OP_CREATE_SNAP = 0x11,
139 POOL_OP_DELETE_SNAP = 0x12,
140 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
141 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
142};
143
136struct ceph_mon_request_header { 144struct ceph_mon_request_header {
137 __le64 have_version; 145 __le64 have_version;
138 __le16 session_mon; 146 __le16 session_mon;
@@ -155,6 +163,31 @@ struct ceph_mon_statfs_reply {
155 struct ceph_statfs st; 163 struct ceph_statfs st;
156} __attribute__ ((packed)); 164} __attribute__ ((packed));
157 165
166const char *ceph_pool_op_name(int op);
167
168struct ceph_mon_poolop {
169 struct ceph_mon_request_header monhdr;
170 struct ceph_fsid fsid;
171 __le32 pool;
172 __le32 op;
173 __le64 auid;
174 __le64 snapid;
175 __le32 name_len;
176} __attribute__ ((packed));
177
178struct ceph_mon_poolop_reply {
179 struct ceph_mon_request_header monhdr;
180 struct ceph_fsid fsid;
181 __le32 reply_code;
182 __le32 epoch;
183 char has_data;
184 char data[0];
185} __attribute__ ((packed));
186
187struct ceph_mon_unmanaged_snap {
188 __le64 snapid;
189} __attribute__ ((packed));
190
158struct ceph_osd_getmap { 191struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr; 192 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid; 193 struct ceph_fsid fsid;
@@ -212,16 +245,18 @@ extern const char *ceph_mds_state_name(int s);
212 * - they also define the lock ordering by the MDS 245 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds 246 * - a few of these are internal to the mds
214 */ 247 */
215#define CEPH_LOCK_DN 1 248#define CEPH_LOCK_DVERSION 1
216#define CEPH_LOCK_ISNAP 2 249#define CEPH_LOCK_DN 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */ 250#define CEPH_LOCK_ISNAP 16
218#define CEPH_LOCK_IFILE 8 /* mds internal */ 251#define CEPH_LOCK_IVERSION 32 /* mds internal */
219#define CEPH_LOCK_IAUTH 32 252#define CEPH_LOCK_IFILE 64
220#define CEPH_LOCK_ILINK 64 253#define CEPH_LOCK_IAUTH 128
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */ 254#define CEPH_LOCK_ILINK 256
222#define CEPH_LOCK_INEST 256 /* mds internal */ 255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
223#define CEPH_LOCK_IXATTR 512 256#define CEPH_LOCK_INEST 1024 /* mds internal */
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ 257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
225 260
226/* client_session ops */ 261/* client_session ops */
227enum { 262enum {
@@ -262,6 +297,8 @@ enum {
262 CEPH_MDS_OP_RMXATTR = 0x01106, 297 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107, 298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108, 299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
265 302
266 CEPH_MDS_OP_MKNOD = 0x01201, 303 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202, 304 CEPH_MDS_OP_LINK = 0x01202,
@@ -308,6 +345,7 @@ union ceph_mds_request_args {
308 struct { 345 struct {
309 __le32 frag; /* which dir fragment */ 346 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */ 347 __le32 max_entries; /* how many dentries to grab */
348 __le32 max_bytes;
311 } __attribute__ ((packed)) readdir; 349 } __attribute__ ((packed)) readdir;
312 struct { 350 struct {
313 __le32 mode; 351 __le32 mode;
@@ -331,6 +369,15 @@ union ceph_mds_request_args {
331 struct { 369 struct {
332 struct ceph_file_layout layout; 370 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout; 371 } __attribute__ ((packed)) setlayout;
372 struct {
373 __u8 rule; /* currently fcntl or flock */
 374 __u8 type; /* shared, exclusive, remove */
375 __le64 pid; /* process id requesting the lock */
376 __le64 pid_namespace;
377 __le64 start; /* initial location to lock */
378 __le64 length; /* num bytes to lock from start */
379 __u8 wait; /* will caller wait for lock to become available? */
380 } __attribute__ ((packed)) filelock_change;
334} __attribute__ ((packed)); 381} __attribute__ ((packed));
335 382
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ 383#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
@@ -425,6 +472,23 @@ struct ceph_mds_reply_dirfrag {
425 __le32 dist[]; 472 __le32 dist[];
426} __attribute__ ((packed)); 473} __attribute__ ((packed));
427 474
475#define CEPH_LOCK_FCNTL 1
476#define CEPH_LOCK_FLOCK 2
477
478#define CEPH_LOCK_SHARED 1
479#define CEPH_LOCK_EXCL 2
480#define CEPH_LOCK_UNLOCK 4
481
482struct ceph_filelock {
 483 __le64 start; /* file offset to start lock at */
484 __le64 length; /* num bytes to lock; 0 for all following start */
485 __le64 client; /* which client holds the lock */
486 __le64 pid; /* process id holding the lock on the client */
487 __le64 pid_namespace;
488 __u8 type; /* shared lock, exclusive lock, or unlock */
489} __attribute__ ((packed));
490
491
428/* file access modes */ 492/* file access modes */
429#define CEPH_FILE_MODE_PIN 0 493#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1 494#define CEPH_FILE_MODE_RD 1
@@ -453,9 +517,10 @@ int ceph_flags_to_mode(int flags);
453#define CEPH_CAP_SAUTH 2 517#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4 518#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6 519#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */ 520#define CEPH_CAP_SFILE 8
521#define CEPH_CAP_SFLOCK 20
457 522
458#define CEPH_CAP_BITS 16 523#define CEPH_CAP_BITS 22
459 524
460/* composed values */ 525/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) 526#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
@@ -473,6 +538,9 @@ int ceph_flags_to_mode(int flags);
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) 538#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) 539#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) 540#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
541#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
542#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
543
476 544
477/* cap masks (for getattr) */ 545/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN 546#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
@@ -508,7 +576,8 @@ int ceph_flags_to_mode(int flags);
508 CEPH_CAP_FILE_EXCL) 576 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) 577#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ 578#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN) 579 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
580 CEPH_CAP_PIN)
512 581
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ 582#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR) 583 CEPH_LOCK_IXATTR)
@@ -598,12 +667,21 @@ struct ceph_mds_cap_reconnect {
598 __le64 cap_id; 667 __le64 cap_id;
599 __le32 wanted; 668 __le32 wanted;
600 __le32 issued; 669 __le32 issued;
670 __le64 snaprealm;
671 __le64 pathbase; /* base ino for our path to this ino */
672 __le32 flock_len; /* size of flock state blob, if any */
673} __attribute__ ((packed));
674/* followed by flock blob */
675
676struct ceph_mds_cap_reconnect_v1 {
677 __le64 cap_id;
678 __le32 wanted;
679 __le32 issued;
601 __le64 size; 680 __le64 size;
602 struct ceph_timespec mtime, atime; 681 struct ceph_timespec mtime, atime;
603 __le64 snaprealm; 682 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */ 683 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed)); 684} __attribute__ ((packed));
606/* followed by encoded string */
607 685
608struct ceph_mds_snaprealm_reconnect { 686struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */ 687 __le64 ino; /* snap realm base */
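The ceph_fs.h changes above renumber the MDS lock and cap spaces to make room for advisory file locks: CEPH_CAP_SFLOCK sits at bit 20, so the two flock generic bits push the cap field from 16 to 22 bits. The arithmetic, with the generic shared/excl bit values taken as assumptions (they are not shown in this hunk):

    #include <stdio.h>

    #define CEPH_CAP_GSHARED 1      /* assumed */
    #define CEPH_CAP_GEXCL   2      /* assumed */
    #define CEPH_CAP_SFLOCK  20     /* from the patch */

    int main(void)
    {
            unsigned flock_shared = CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK;
            unsigned flock_excl   = CEPH_CAP_GEXCL   << CEPH_CAP_SFLOCK;

            /* 0x100000 and 0x200000: bit 21 is now the highest cap bit,
             * hence CEPH_CAP_BITS grows from 16 to 22. */
            printf("FLOCK_SHARED=0x%x FLOCK_EXCL=0x%x\n",
                   flock_shared, flock_excl);
            return 0;
    }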
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
index 5ac470c433c9..d099c3f90236 100644
--- a/fs/ceph/ceph_hash.h
+++ b/fs/ceph/ceph_hash.h
@@ -1,5 +1,5 @@
1#ifndef _FS_CEPH_HASH_H 1#ifndef FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H 2#define FS_CEPH_HASH_H
3 3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ 4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ 5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..c6179d3a26a2 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
10 case CEPH_ENTITY_TYPE_OSD: return "osd"; 10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon"; 11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client"; 12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth"; 13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown"; 14 default: return "unknown";
16 } 15 }
@@ -29,6 +28,7 @@ const char *ceph_osd_op_name(int op)
29 case CEPH_OSD_OP_TRUNCATE: return "truncate"; 28 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero"; 29 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull"; 30 case CEPH_OSD_OP_WRITEFULL: return "writefull";
31 case CEPH_OSD_OP_ROLLBACK: return "rollback";
32 32
33 case CEPH_OSD_OP_APPEND: return "append"; 33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync"; 34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
@@ -45,6 +45,7 @@ const char *ceph_osd_op_name(int op)
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; 45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; 46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr"; 47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
48 49
49 case CEPH_OSD_OP_PULL: return "pull"; 50 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push"; 51 case CEPH_OSD_OP_PUSH: return "push";
@@ -129,6 +130,8 @@ const char *ceph_mds_op_name(int op)
129 case CEPH_MDS_OP_LSSNAP: return "lssnap"; 130 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap"; 131 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap"; 132 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
133 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
134 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
132 } 135 }
133 return "???"; 136 return "???";
134} 137}
@@ -174,3 +177,17 @@ const char *ceph_snap_op_name(int o)
174 } 177 }
175 return "???"; 178 return "???";
176} 179}
180
181const char *ceph_pool_op_name(int op)
182{
183 switch (op) {
184 case POOL_OP_CREATE: return "create";
185 case POOL_OP_DELETE: return "delete";
186 case POOL_OP_AUID_CHANGE: return "auid change";
187 case POOL_OP_CREATE_SNAP: return "create snap";
188 case POOL_OP_DELETE_SNAP: return "delete snap";
189 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
190 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
191 }
192 return "???";
193}
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
index dcd7e7523700..97e435b191f4 100644
--- a/fs/ceph/crush/crush.h
+++ b/fs/ceph/crush/crush.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_CRUSH_H 1#ifndef CEPH_CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H 2#define CEPH_CRUSH_CRUSH_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5 5
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
index ff48e110e4bb..91e884230d5d 100644
--- a/fs/ceph/crush/hash.h
+++ b/fs/ceph/crush/hash.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_HASH_H 1#ifndef CEPH_CRUSH_HASH_H
2#define _CRUSH_HASH_H 2#define CEPH_CRUSH_HASH_H
3 3
4#define CRUSH_HASH_RJENKINS1 0 4#define CRUSH_HASH_RJENKINS1 0
5 5
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 9ba54efb6543..a4eec133258e 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
238 238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{ 240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r); 241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) { 242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM: 243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
264 */ 264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x) 265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{ 266{
267 if (weight[item] >= 0x1000) 267 if (weight[item] >= 0x10000)
268 return 0; 268 return 0;
269 if (weight[item] == 0) 269 if (weight[item] == 0)
270 return 1; 270 return 1;
@@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map,
305 int itemtype; 305 int itemtype;
306 int collide, reject; 306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */ 307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos); 308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
309 311
310 for (rep = outpos; rep < numrep; rep++) { 312 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */ 313 /* keep trying until we get a non-out, non-colliding item */
@@ -366,6 +368,7 @@ static int crush_choose(struct crush_map *map,
366 BUG_ON(item >= 0 || 368 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets); 369 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item]; 370 in = map->buckets[-1-item];
371 retry_bucket = 1;
369 continue; 372 continue;
370 } 373 }
371 374
@@ -377,15 +380,25 @@ static int crush_choose(struct crush_map *map,
377 } 380 }
378 } 381 }
379 382
380 if (recurse_to_leaf && 383 reject = 0;
381 item < 0 && 384 if (recurse_to_leaf) {
382 crush_choose(map, map->buckets[-1-item], 385 if (item < 0) {
383 weight, 386 if (crush_choose(map,
384 x, outpos+1, 0, 387 map->buckets[-1-item],
385 out2, outpos, 388 weight,
386 firstn, 0, NULL) <= outpos) { 389 x, outpos+1, 0,
387 reject = 1; 390 out2, outpos,
388 } else { 391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
389 /* out? */ 402 /* out? */
390 if (itemtype == 0) 403 if (itemtype == 0)
391 reject = is_out(map, weight, 404 reject = is_out(map, weight,
@@ -424,12 +437,12 @@ reject:
424 continue; 437 continue;
425 } 438 }
426 439
427 dprintk("choose got %d\n", item); 440 dprintk("CHOOSE got %d\n", item);
428 out[outpos] = item; 441 out[outpos] = item;
429 outpos++; 442 outpos++;
430 } 443 }
431 444
432 dprintk("choose returns %d\n", outpos); 445 dprintk("CHOOSE returns %d\n", outpos);
433 return outpos; 446 return outpos;
434} 447}
435 448
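The is_out() fix above matters because CRUSH weights are 16.16 fixed-point: full weight is 0x10000, so the old 0x1000 threshold treated any item above 1/16 weight as always-in and skipped the probabilistic rejection entirely. A user-space model of the corrected check, where the hash is an arbitrary stand-in for crush_hash32_2():

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t hash32_2(uint32_t a, uint32_t b)
    {
            uint32_t h = a * 2654435761u ^ b * 2246822519u;
            return h ^ (h >> 15);   /* arbitrary mixer, not the real hash */
    }

    static int is_out(const uint32_t *weight, int item, int x)
    {
            if (weight[item] >= 0x10000)
                    return 0;               /* full weight: never out */
            if (weight[item] == 0)
                    return 1;               /* zero weight: always out */
            /* keep the item with probability weight/0x10000 */
            if ((hash32_2(x, item) & 0xffff) < weight[item])
                    return 0;
            return 1;
    }

    int main(void)
    {
            uint32_t w[] = { 0x10000, 0x8000, 0 };  /* 1.0, 0.5, 0.0 */
            for (int i = 0; i < 3; i++)
                    printf("item %d: out=%d\n", i, is_out(w, i, 42));
            return 0;
    }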
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
index 98e90046fd9f..c46b99c18bb0 100644
--- a/fs/ceph/crush/mapper.h
+++ b/fs/ceph/crush/mapper.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_MAPPER_H 1#ifndef CEPH_CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H 2#define CEPH_CRUSH_MAPPER_H
3 3
4/* 4/*
5 * CRUSH functions for finding rules and then mapping an input to an 5 * CRUSH functions for finding rules and then mapping an input to an
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index f704b3b62424..a3e627f63293 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -75,10 +75,11 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); 75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76} 76}
77 77
78const u8 *aes_iv = "cephsageyudagreg"; 78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79 79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, 80static int ceph_aes_encrypt(const void *key, int key_len,
81 const void *src, size_t src_len) 81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
82{ 83{
83 struct scatterlist sg_in[2], sg_out[1]; 84 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -126,9 +127,10 @@ int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
126 return 0; 127 return 0;
127} 128}
128 129
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, 130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
130 const void *src1, size_t src1_len, 131 size_t *dst_len,
131 const void *src2, size_t src2_len) 132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
132{ 134{
133 struct scatterlist sg_in[3], sg_out[1]; 135 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -179,8 +181,9 @@ int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
179 return 0; 181 return 0;
180} 182}
181 183
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, 184static int ceph_aes_decrypt(const void *key, int key_len,
183 const void *src, size_t src_len) 185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
184{ 187{
185 struct scatterlist sg_in[1], sg_out[2]; 188 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -238,10 +241,10 @@ int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
238 return 0; 241 return 0;
239} 242}
240 243
241int ceph_aes_decrypt2(const void *key, int key_len, 244static int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len, 245 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len, 246 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len) 247 const void *src, size_t src_len)
245{ 248{
246 struct scatterlist sg_in[1], sg_out[3]; 249 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
index 40b502e6bd89..bdf38607323c 100644
--- a/fs/ceph/crypto.h
+++ b/fs/ceph/crypto.h
@@ -42,7 +42,7 @@ extern int ceph_encrypt2(struct ceph_crypto_key *secret,
42 const void *src2, size_t src2_len); 42 const void *src2, size_t src2_len);
43 43
44/* armor.c */ 44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end); 45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end); 46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47 47
48#endif 48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..6fd8b20a8611 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
113static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
114{ 114{
115 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp; 118 struct rb_node *rp;
119 119
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
126 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
128 128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
131 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
132 } 137 }
133 138
134 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
@@ -166,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p)
166 } else if (req->r_dentry) { 171 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 172 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0); 173 &pathbase, 0);
174 if (IS_ERR(path))
175 path = NULL;
169 spin_lock(&req->r_dentry->d_lock); 176 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)", 177 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode), 178 ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -182,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p)
182 if (req->r_old_dentry) { 189 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, 190 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0); 191 &pathbase, 0);
192 if (IS_ERR(path))
193 path = NULL;
185 spin_lock(&req->r_old_dentry->d_lock); 194 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)", 195 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode), 196 ceph_ino(req->r_old_dentry->d_parent->d_inode),
@@ -256,7 +265,7 @@ static int osdc_show(struct seq_file *s, void *pp)
256 265
257static int caps_show(struct seq_file *s, void *p) 266static int caps_show(struct seq_file *s, void *p)
258{ 267{
259 struct ceph_client *client = p; 268 struct ceph_client *client = s->private;
260 int total, avail, used, reserved, min; 269 int total, avail, used, reserved, min;
261 270
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 271 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
@@ -286,7 +295,7 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
286 return 0; 295 return 0;
287} 296}
288 297
289#define DEFINE_SHOW_FUNC(name) \ 298#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \ 299static int name##_open(struct inode *inode, struct file *file) \
291{ \ 300{ \
292 struct seq_file *sf; \ 301 struct seq_file *sf; \
@@ -356,8 +365,8 @@ int ceph_debugfs_client_init(struct ceph_client *client)
356 int ret = 0; 365 int ret = 0;
357 char name[80]; 366 char name[80];
358 367
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", 368 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
360 PR_FSID(&client->fsid), client->monc.auth->global_id); 369 client->monc.auth->global_id);
361 370
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 371 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir) 372 if (!client->debugfs_dir)
@@ -427,11 +436,12 @@ int ceph_debugfs_client_init(struct ceph_client *client)
427 if (!client->debugfs_caps) 436 if (!client->debugfs_caps)
428 goto out; 437 goto out;
429 438
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 439 client->debugfs_congestion_kb =
431 0600, 440 debugfs_create_file("writeback_congestion_kb",
432 client->debugfs_dir, 441 0600,
433 client, 442 client->debugfs_dir,
434 &congestion_kb_fops); 443 client,
444 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb) 445 if (!client->debugfs_congestion_kb)
436 goto out; 446 goto out;
437 447
@@ -461,7 +471,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
461 debugfs_remove(client->debugfs_dir); 471 debugfs_remove(client->debugfs_dir);
462} 472}
463 473
464#else // CONFIG_DEBUG_FS 474#else /* CONFIG_DEBUG_FS */
465 475
466int __init ceph_debugfs_init(void) 476int __init ceph_debugfs_init(void)
467{ 477{
@@ -481,4 +491,4 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{ 491{
482} 492}
483 493
484#endif // CONFIG_DEBUG_FS 494#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index 65b3e022eaf5..3d25415afe63 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -99,11 +99,13 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv,
99 */ 99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a) 100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{ 101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family); 102 __be16 ss_family = htons(a->in_addr.ss_family);
103 a->in_addr.ss_family = *(__u16 *)&ss_family;
103} 104}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a) 105static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{ 106{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family); 107 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
108 a->in_addr.ss_family = ntohs(ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); 109 WARN_ON(a->in_addr.ss_family == 512);
108} 110}
109 111
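The decode.h change above keeps the socket address family big-endian on the wire while ss_family itself stays a host-order integer; routing the conversion through a typed temporary avoids assigning a big-endian value straight into a host-order field and quiets endianness checkers such as sparse. A portable user-space equivalent of the round trip:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void encode_family(uint16_t *ss_family)
    {
            uint16_t be = htons(*ss_family);        /* to wire order */
            memcpy(ss_family, &be, sizeof(be));
    }

    static void decode_family(uint16_t *ss_family)
    {
            uint16_t be;
            memcpy(&be, ss_family, sizeof(be));
            *ss_family = ntohs(be);                 /* back to host order */
    }

    int main(void)
    {
            uint16_t fam = 2;                       /* AF_INET on Linux */
            encode_family(&fam);
            decode_family(&fam);
            printf("round-trip family = %u\n", fam); /* 2 */
            return 0;
    }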
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..a1986eb52045 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -27,7 +27,7 @@
27 27
28const struct inode_operations ceph_dir_iops; 28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops; 29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops; 30const struct dentry_operations ceph_dentry_ops;
31 31
32/* 32/*
33 * Initialize ceph dentry state. 33 * Initialize ceph dentry state.
@@ -46,13 +46,16 @@ int ceph_init_dentry(struct dentry *dentry)
46 else 46 else
47 dentry->d_op = &ceph_snap_dentry_ops; 47 dentry->d_op = &ceph_snap_dentry_ops;
48 48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); 49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
50 if (!di) 50 if (!di)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
53 spin_lock(&dentry->d_lock); 53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */ 54 if (dentry->d_fsdata) {
55 /* lost a race */
56 kmem_cache_free(ceph_dentry_cachep, di);
55 goto out_unlock; 57 goto out_unlock;
58 }
56 di->dentry = dentry; 59 di->dentry = dentry;
57 di->lease_session = NULL; 60 di->lease_session = NULL;
58 dentry->d_fsdata = di; 61 dentry->d_fsdata = di;
@@ -91,6 +94,8 @@ static unsigned fpos_off(loff_t p)
91 */ 94 */
92static int __dcache_readdir(struct file *filp, 95static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir) 96 void *dirent, filldir_t filldir)
97 __releases(inode->i_lock)
98 __acquires(inode->i_lock)
94{ 99{
95 struct inode *inode = filp->f_dentry->d_inode; 100 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data; 101 struct ceph_file_info *fi = filp->private_data;
@@ -125,7 +130,8 @@ more:
125 dentry = list_entry(p, struct dentry, d_u.d_child); 130 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry); 131 di = ceph_dentry(dentry);
127 while (1) { 132 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, 133 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
134 d_unhashed(dentry) ? "!hashed" : "hashed",
129 parent->d_subdirs.prev, parent->d_subdirs.next); 135 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) { 136 if (p == &parent->d_subdirs) {
131 fi->at_end = 1; 137 fi->at_end = 1;
@@ -229,6 +235,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 u32 ftype; 235 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo; 236 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir; 237 const int max_entries = client->mount_args->max_readdir;
238 const int max_bytes = client->mount_args->max_readdir_bytes;
232 239
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 240 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end) 241 if (fi->at_end)
@@ -261,6 +268,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
261 spin_lock(&inode->i_lock); 268 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) && 269 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) && 270 !ceph_test_opt(client, NOASYNCREADDIR) &&
271 ceph_snap(inode) != CEPH_SNAPDIR &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 272 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 273 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir); 274 err = __dcache_readdir(filp, dirent, filldir);
@@ -312,6 +320,7 @@ more:
312 req->r_readdir_offset = fi->next_offset; 320 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag); 321 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 322 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
323 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
315 req->r_num_caps = max_entries + 1; 324 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req); 325 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) { 326 if (err < 0) {
@@ -335,7 +344,7 @@ more:
335 if (req->r_reply_info.dir_end) { 344 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name); 345 kfree(fi->last_name);
337 fi->last_name = NULL; 346 fi->last_name = NULL;
338 fi->next_offset = 0; 347 fi->next_offset = 2;
339 } else { 348 } else {
340 rinfo = &req->r_reply_info; 349 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi, 350 err = note_last_dentry(fi,
@@ -478,7 +487,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 487struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err) 488 struct dentry *dentry, int err)
480{ 489{
481 struct ceph_client *client = ceph_client(dentry->d_sb); 490 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode; 491 struct inode *parent = dentry->d_parent->d_inode;
483 492
484 /* .snap dir? */ 493 /* .snap dir? */
@@ -568,7 +577,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
568 !is_root_ceph_dentry(dir, dentry) && 577 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 578 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 579 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock); 580 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir); 581 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL); 582 d_add(dentry, NULL);
@@ -582,7 +590,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 590 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 591 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req)) 592 if (IS_ERR(req))
585 return ERR_PTR(PTR_ERR(req)); 593 return ERR_CAST(req);
586 req->r_dentry = dget(dentry); 594 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2; 595 req->r_num_caps = 2;
588 /* we only need inode linkage */ 596 /* we only need inode linkage */
@@ -888,13 +896,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
888 896
889 /* ensure target dentry is invalidated, despite 897 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */ 898 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies; 899 ceph_invalidate_dentry_lease(new_dentry);
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 } 900 }
894 ceph_mdsc_put_request(req); 901 ceph_mdsc_put_request(req);
895 return err; 902 return err;
896} 903}
897 904
905/*
906 * Ensure a dentry lease will no longer revalidate.
907 */
908void ceph_invalidate_dentry_lease(struct dentry *dentry)
909{
910 spin_lock(&dentry->d_lock);
911 dentry->d_time = jiffies;
912 ceph_dentry(dentry)->lease_shared_gen = 0;
913 spin_unlock(&dentry->d_lock);
914}
898 915
899/* 916/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to 917 * Check if dentry lease is valid. If not, delete the lease. Try to
@@ -972,8 +989,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{ 989{
973 struct inode *dir = dentry->d_parent->d_inode; 990 struct inode *dir = dentry->d_parent->d_inode;
974 991
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry, 992 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 993 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
994 ceph_dentry(dentry)->offset);
977 995
978 /* always trust cached snapped dentries, snapdir dentry */ 996 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) { 997 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -998,18 +1016,26 @@ out_touch:
998 1016
999/* 1017/*
1000 * When a dentry is released, clear the dir I_COMPLETE if it was part 1018 * When a dentry is released, clear the dir I_COMPLETE if it was part
1001 * of the current dir gen. 1019 * of the current dir gen or if this is in the snapshot namespace.
1002 */ 1020 */
1003static void ceph_dentry_release(struct dentry *dentry) 1021static void ceph_dentry_release(struct dentry *dentry)
1004{ 1022{
1005 struct ceph_dentry_info *di = ceph_dentry(dentry); 1023 struct ceph_dentry_info *di = ceph_dentry(dentry);
1006 struct inode *parent_inode = dentry->d_parent->d_inode; 1024 struct inode *parent_inode = NULL;
1025 u64 snapid = CEPH_NOSNAP;
1007 1026
1008 if (parent_inode) { 1027 if (!IS_ROOT(dentry)) {
1028 parent_inode = dentry->d_parent->d_inode;
1029 if (parent_inode)
1030 snapid = ceph_snap(parent_inode);
1031 }
1032 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1033 if (parent_inode && snapid != CEPH_SNAPDIR) {
1009 struct ceph_inode_info *ci = ceph_inode(parent_inode); 1034 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1010 1035
1011 spin_lock(&parent_inode->i_lock); 1036 spin_lock(&parent_inode->i_lock);
1012 if (ci->i_shared_gen == di->lease_shared_gen) { 1037 if (ci->i_shared_gen == di->lease_shared_gen ||
1038 snapid <= CEPH_MAXSNAP) {
1013 dout(" clearing %p complete (d_release)\n", 1039 dout(" clearing %p complete (d_release)\n",
1014 parent_inode); 1040 parent_inode);
1015 ci->i_ceph_flags &= ~CEPH_I_COMPLETE; 1041 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
@@ -1050,7 +1076,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1050 struct ceph_inode_info *ci = ceph_inode(inode); 1076 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left; 1077 int left;
1052 1078
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) 1079 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR; 1080 return -EISDIR;
1055 1081
1056 if (!cf->dir_info) { 1082 if (!cf->dir_info) {
@@ -1092,10 +1118,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1092 * an fsync() on a dir will wait for any uncommitted directory 1118 * an fsync() on a dir will wait for any uncommitted directory
1093 * operations to commit. 1119 * operations to commit.
1094 */ 1120 */
1095static int ceph_dir_fsync(struct file *file, struct dentry *dentry, 1121static int ceph_dir_fsync(struct file *file, int datasync)
1096 int datasync)
1097{ 1122{
1098 struct inode *inode = dentry->d_inode; 1123 struct inode *inode = file->f_path.dentry->d_inode;
1099 struct ceph_inode_info *ci = ceph_inode(inode); 1124 struct ceph_inode_info *ci = ceph_inode(inode);
1100 struct list_head *head = &ci->i_unsafe_dirops; 1125 struct list_head *head = &ci->i_unsafe_dirops;
1101 struct ceph_mds_request *req; 1126 struct ceph_mds_request *req;
@@ -1152,7 +1177,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1177 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name); 1178 dn->d_name.len, dn->d_name.name);
1154 if (di) { 1179 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc; 1180 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock); 1181 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru); 1182 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++; 1183 mdsc->num_dentry++;
@@ -1165,10 +1190,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1165 struct ceph_dentry_info *di = ceph_dentry(dn); 1190 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc; 1191 struct ceph_mds_client *mdsc;
1167 1192
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, 1193 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1169 dn->d_name.len, dn->d_name.name); 1194 dn->d_name.len, dn->d_name.name, di->offset);
1170 if (di) { 1195 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc; 1196 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock); 1197 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru); 1198 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock); 1199 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1208,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1208 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name); 1209 dn->d_name.len, dn->d_name.name);
1185 if (di) { 1210 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc; 1211 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock); 1212 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru); 1213 list_del_init(&di->lru);
1189 mdsc->num_dentry--; 1214 mdsc->num_dentry--;
@@ -1220,14 +1245,16 @@ const struct inode_operations ceph_dir_iops = {
1220 .create = ceph_create, 1245 .create = ceph_create,
1221}; 1246};
1222 1247
1223struct dentry_operations ceph_dentry_ops = { 1248const struct dentry_operations ceph_dentry_ops = {
1224 .d_revalidate = ceph_d_revalidate, 1249 .d_revalidate = ceph_d_revalidate,
1225 .d_release = ceph_dentry_release, 1250 .d_release = ceph_dentry_release,
1226}; 1251};
1227 1252
1228struct dentry_operations ceph_snapdir_dentry_ops = { 1253const struct dentry_operations ceph_snapdir_dentry_ops = {
1229 .d_revalidate = ceph_snapdir_d_revalidate, 1254 .d_revalidate = ceph_snapdir_d_revalidate,
1255 .d_release = ceph_dentry_release,
1230}; 1256};
1231 1257
1232struct dentry_operations ceph_snap_dentry_ops = { 1258const struct dentry_operations ceph_snap_dentry_ops = {
1259 .d_release = ceph_dentry_release,
1233}; 1260};
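The ceph_init_dentry() hunk above fixes a leak in the lost-race path: the info struct is allocated before d_lock is taken, so if another thread attached its own d_fsdata first, ours must be freed rather than dropped on the floor. A sketch of that allocate-then-recheck pattern, with user-space types standing in for the kernel ones:

    #include <pthread.h>
    #include <stdlib.h>

    struct dentry {
            pthread_mutex_t d_lock;
            void *d_fsdata;
    };

    static int init_dentry(struct dentry *d)
    {
            void *di;

            if (d->d_fsdata)
                    return 0;       /* fast path: already initialized */

            di = calloc(1, 64);     /* stands in for kmem_cache_alloc */
            if (!di)
                    return -1;

            pthread_mutex_lock(&d->d_lock);
            if (d->d_fsdata)
                    free(di);       /* lost the race: do not leak */
            else
                    d->d_fsdata = di;
            pthread_mutex_unlock(&d->d_lock);
            return 0;
    }

    int main(void)
    {
            struct dentry d = { PTHREAD_MUTEX_INITIALIZER, NULL };
            init_dentry(&d);
            free(d.d_fsdata);
            return 0;
    }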
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..e38423e82f2e 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -42,32 +42,37 @@ struct ceph_nfs_confh {
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, 42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable) 43 int connectable)
44{ 44{
45 int type;
45 struct ceph_nfs_fh *fh = (void *)rawfh; 46 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh; 47 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent; 48 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode; 49 struct inode *inode = dentry->d_inode;
49 int type; 50 int connected_handle_length = sizeof(*cfh)/4;
51 int handle_length = sizeof(*fh)/4;
50 52
51 /* don't re-export snaps */ 53 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP) 54 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL; 55 return -EINVAL;
54 56
55 if (*max_len >= sizeof(*cfh)) { 57 if (*max_len >= connected_handle_length) {
56 dout("encode_fh %p connectable\n", dentry); 58 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode); 59 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode); 60 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash; 61 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh); 62 *max_len = connected_handle_length;
61 type = 2; 63 type = 2;
62 } else if (*max_len > sizeof(*fh)) { 64 } else if (*max_len >= handle_length) {
63 if (connectable) 65 if (connectable) {
64 return -ENOSPC; 66 *max_len = connected_handle_length;
67 return 255;
68 }
65 dout("encode_fh %p\n", dentry); 69 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode); 70 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh); 71 *max_len = handle_length;
68 type = 1; 72 type = 1;
69 } else { 73 } else {
70 return -ENOSPC; 74 *max_len = handle_length;
75 return 255;
71 } 76 }
72 return type; 77 return type;
73} 78}
@@ -93,11 +98,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
93 return ERR_PTR(-ESTALE); 98 return ERR_PTR(-ESTALE);
94 99
95 dentry = d_obtain_alias(inode); 100 dentry = d_obtain_alias(inode);
96 if (!dentry) { 101 if (IS_ERR(dentry)) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 102 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode); 103 fh->ino, inode);
99 iput(inode); 104 iput(inode);
100 return ERR_PTR(-ENOMEM); 105 return dentry;
101 } 106 }
102 err = ceph_init_dentry(dentry); 107 err = ceph_init_dentry(dentry);
103 108
@@ -115,7 +120,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 120static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 121 struct ceph_nfs_confh *cfh)
117{ 122{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 123 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 124 struct inode *inode;
120 struct dentry *dentry; 125 struct dentry *dentry;
121 struct ceph_vino vino; 126 struct ceph_vino vino;
@@ -133,7 +138,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 138 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS); 139 USE_ANY_MDS);
135 if (IS_ERR(req)) 140 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req)); 141 return ERR_CAST(req);
137 142
138 req->r_ino1 = vino; 143 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino; 144 req->r_ino2.ino = cfh->parent_ino;
@@ -149,11 +154,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
149 } 154 }
150 155
151 dentry = d_obtain_alias(inode); 156 dentry = d_obtain_alias(inode);
152 if (!dentry) { 157 if (IS_ERR(dentry)) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 158 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode); 159 cfh->ino, inode);
155 iput(inode); 160 iput(inode);
156 return ERR_PTR(-ENOMEM); 161 return dentry;
157 } 162 }
158 err = ceph_init_dentry(dentry); 163 err = ceph_init_dentry(dentry);
159 if (err < 0) { 164 if (err < 0) {
@@ -202,11 +207,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
202 return ERR_PTR(-ESTALE); 207 return ERR_PTR(-ESTALE);
203 208
204 dentry = d_obtain_alias(inode); 209 dentry = d_obtain_alias(inode);
205 if (!dentry) { 210 if (IS_ERR(dentry)) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 211 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode); 212 cfh->ino, inode);
208 iput(inode); 213 iput(inode);
209 return ERR_PTR(-ENOMEM); 214 return dentry;
210 } 215 }
211 err = ceph_init_dentry(dentry); 216 err = ceph_init_dentry(dentry);
212 if (err < 0) { 217 if (err < 0) {
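
The three hunks above fix the same misconception: d_obtain_alias() reports failure as an encoded errno pointer (ERR_PTR(-ENOMEM) and friends), never NULL, so the old !dentry tests could not fire and an error pointer would later have been used as a dentry. The companion ERR_PTR(PTR_ERR(req)) -> ERR_CAST(req) cleanups in this series avoid a pointless decode/re-encode round trip. For reference, a user-space model of the encoding (the kernel's real definitions live in include/linux/err.h):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline long IS_ERR(const void *ptr)
{
        /* errnos occupy the top 4095 addresses, which are never mapped */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* re-type an error pointer without decoding and re-encoding it */
static inline void *ERR_CAST(const void *ptr) { return (void *)ptr; }

int main(void)
{
        void *dentry = ERR_PTR(-ENOMEM);

        if (IS_ERR(dentry))
                printf("err = %ld\n", PTR_ERR(dentry));    /* -12 */
        return 0;
}
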
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed6f19721d6e..66e4da6dba22 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
230 /* do the open */ 230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode); 231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req)) 232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req)); 233 return ERR_CAST(req);
234 req->r_dentry = dget(dentry); 234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2; 235 req->r_num_caps = 2;
236 if (flags & O_CREAT) { 236 if (flags & O_CREAT) {
@@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file)
265 kmem_cache_free(ceph_file_cachep, cf); 265 kmem_cache_free(ceph_file_cachep, cf);
266 266
267 /* wake up anyone waiting for caps on this inode */ 267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq); 268 wake_up_all(&ci->i_cap_wq);
269 return 0; 269 return 0;
270} 270}
271 271
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector of new pages 318 * allocate a vector of new pages
319 */ 319 */
320static struct page **alloc_page_vector(int num_pages) 320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
324 324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages) 326 if (!pages)
327 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
540 * in sequence. 540 * in sequence.
541 */ 541 */
542 } else { 542 } else {
543 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
544 } 544 }
545 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
546 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
649 do_sync, 649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2); 651 &mtime, false, 2);
652 if (IS_ERR(req)) 652 if (!req)
653 return PTR_ERR(req); 653 return -ENOMEM;
654 654
655 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
656 656
@@ -665,10 +665,10 @@ more:
665 * throw out any page cache pages in this range. this 665 * throw out any page cache pages in this range. this
666 * may block. 666 * may block.
667 */ 667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
672 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
674 goto out; 674 goto out;
@@ -697,7 +697,7 @@ more:
697 * start_request so that a tid has been assigned. 697 * start_request so that a tid has been assigned.
698 */ 698 */
699 spin_lock(&ci->i_unsafe_lock); 699 spin_lock(&ci->i_unsafe_lock);
700 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); 700 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
701 spin_unlock(&ci->i_unsafe_lock); 701 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 703 }
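
The one-line fix above transposes the arguments to list_add(): the signature is list_add(new, head), and because both parameters are struct list_head * the compiler cannot flag a swap — the old call spliced the per-inode list head onto the request instead of queueing the request. (The list_move() call introduced in ceph_set_dentry_offset() in fs/ceph/inode.c below repairs the same class of mistake.) The core of the kernel's insertion, as in include/linux/list.h, plus a tiny check:

#include <stdio.h>

struct list_head {
        struct list_head *next, *prev;
};

static void __list_add(struct list_head *new, struct list_head *prev,
                       struct list_head *next)
{
        next->prev = new;
        new->next = next;
        new->prev = prev;
        prev->next = new;
}

/* insert 'new' right after 'head'; swapping the arguments compiles
 * cleanly but splices the head into the entry's list instead */
static void list_add(struct list_head *new, struct list_head *head)
{
        __list_add(new, head, head->next);
}

int main(void)
{
        struct list_head head = { &head, &head };    /* empty list */
        struct list_head item;

        list_add(&item, &head);
        printf("%s\n", head.next == &item ? "queued" : "lost");
        return 0;
}
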
@@ -740,28 +740,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
740 unsigned long nr_segs, loff_t pos) 740 unsigned long nr_segs, loff_t pos)
741{ 741{
742 struct file *filp = iocb->ki_filp; 742 struct file *filp = iocb->ki_filp;
743 struct ceph_file_info *fi = filp->private_data;
743 loff_t *ppos = &iocb->ki_pos; 744 loff_t *ppos = &iocb->ki_pos;
744 size_t len = iov->iov_len; 745 size_t len = iov->iov_len;
745 struct inode *inode = filp->f_dentry->d_inode; 746 struct inode *inode = filp->f_dentry->d_inode;
746 struct ceph_inode_info *ci = ceph_inode(inode); 747 struct ceph_inode_info *ci = ceph_inode(inode);
747 void *base = iov->iov_base; 748 void __user *base = iov->iov_base;
748 ssize_t ret; 749 ssize_t ret;
749 int got = 0; 750 int want, got = 0;
750 int checkeof = 0, read = 0; 751 int checkeof = 0, read = 0;
751 752
752 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 753 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
753 inode, ceph_vinop(inode), pos, (unsigned)len, inode); 754 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
754again: 755again:
755 __ceph_do_pending_vmtruncate(inode); 756 __ceph_do_pending_vmtruncate(inode);
756 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, 757 if (fi->fmode & CEPH_FILE_MODE_LAZY)
757 &got, -1); 758 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
759 else
760 want = CEPH_CAP_FILE_CACHE;
761 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
758 if (ret < 0) 762 if (ret < 0)
759 goto out; 763 goto out;
760 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 764 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
761 inode, ceph_vinop(inode), pos, (unsigned)len, 765 inode, ceph_vinop(inode), pos, (unsigned)len,
762 ceph_cap_string(got)); 766 ceph_cap_string(got));
763 767
764 if ((got & CEPH_CAP_FILE_CACHE) == 0 || 768 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
765 (iocb->ki_filp->f_flags & O_DIRECT) || 769 (iocb->ki_filp->f_flags & O_DIRECT) ||
766 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) 770 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
767 /* hmm, this isn't really async... */ 771 /* hmm, this isn't really async... */
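
The pattern introduced here, and mirrored in ceph_aio_write() below, widens both the wanted-caps request and the cache-bypass test by one bit once a file has been switched to LAZY mode, so a LAZYIO grant from the MDS is as good as a CACHE grant. A toy restatement of the mask logic (flag values are made up for the sketch):

#include <stdio.h>

#define CAP_FILE_CACHE   0x01    /* hypothetical values for this sketch */
#define CAP_FILE_LAZYIO  0x02
#define FILE_MODE_LAZY   0x04

static int wanted_caps(int fmode)
{
        if (fmode & FILE_MODE_LAZY)
                return CAP_FILE_CACHE | CAP_FILE_LAZYIO;
        return CAP_FILE_CACHE;
}

int main(void)
{
        int got = CAP_FILE_LAZYIO;        /* MDS granted only LAZYIO */

        /* old test: (got & CAP_FILE_CACHE) == 0 would force the sync path */
        printf("sync path? %s\n",
               (got & (CAP_FILE_CACHE | CAP_FILE_LAZYIO)) == 0 ? "yes" : "no");
        printf("want = %#x\n", wanted_caps(FILE_MODE_LAZY));
        return 0;
}
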
@@ -807,11 +811,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 unsigned long nr_segs, loff_t pos) 811 unsigned long nr_segs, loff_t pos)
808{ 812{
809 struct file *file = iocb->ki_filp; 813 struct file *file = iocb->ki_filp;
814 struct ceph_file_info *fi = file->private_data;
810 struct inode *inode = file->f_dentry->d_inode; 815 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 816 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 818 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 819 int want, got = 0;
815 int ret, err; 820 int ret, err;
816 821
817 if (ceph_snap(inode) != CEPH_NOSNAP) 822 if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -824,8 +829,11 @@ retry_snap:
824 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 829 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
825 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 830 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
826 inode->i_size); 831 inode->i_size);
827 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 832 if (fi->fmode & CEPH_FILE_MODE_LAZY)
828 &got, endoff); 833 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
834 else
835 want = CEPH_CAP_FILE_BUFFER;
836 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
829 if (ret < 0) 837 if (ret < 0)
830 goto out; 838 goto out;
831 839
@@ -833,7 +841,7 @@ retry_snap:
833 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 841 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
834 ceph_cap_string(got)); 842 ceph_cap_string(got));
835 843
836 if ((got & CEPH_CAP_FILE_BUFFER) == 0 || 844 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
837 (iocb->ki_filp->f_flags & O_DIRECT) || 845 (iocb->ki_filp->f_flags & O_DIRECT) ||
838 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { 846 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
839 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, 847 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
@@ -844,8 +852,7 @@ retry_snap:
844 if ((ret >= 0 || ret == -EIOCBQUEUED) && 852 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 853 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 854 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
847 err = vfs_fsync_range(file, file->f_path.dentry, 855 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
848 pos, pos + ret - 1, 1);
849 if (err < 0) 856 if (err < 0)
850 ret = err; 857 ret = err;
851 } 858 }
@@ -931,6 +938,8 @@ const struct file_operations ceph_file_fops = {
931 .aio_write = ceph_aio_write, 938 .aio_write = ceph_aio_write,
932 .mmap = ceph_mmap, 939 .mmap = ceph_mmap,
933 .fsync = ceph_fsync, 940 .fsync = ceph_fsync,
941 .lock = ceph_lock,
942 .flock = ceph_flock,
934 .splice_read = generic_file_splice_read, 943 .splice_read = generic_file_splice_read,
935 .splice_write = generic_file_splice_write, 944 .splice_write = generic_file_splice_write,
936 .unlocked_ioctl = ceph_ioctl, 945 .unlocked_ioctl = ceph_ioctl,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..62377ec37edf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
69 69
70 BUG_ON(!S_ISDIR(parent->i_mode)); 70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode)) 71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode)); 72 return inode;
73 inode->i_mode = parent->i_mode; 73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid; 74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid; 75 inode->i_gid = parent->i_gid;
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 384 */
385 if (ci->i_snap_realm) { 385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc; 387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 389
390 dout(" dropping residual ref to snap realm %p\n", realm); 390 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -442,8 +442,9 @@ int ceph_fill_file_size(struct inode *inode, int issued,
442 * the file is either opened or mmaped 442 * the file is either opened or mmaped
443 */ 443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| 444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| 445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) || 446 CEPH_CAP_FILE_EXCL|
447 CEPH_CAP_FILE_LAZYIO)) ||
447 mapping_mapped(inode->i_mapping) || 448 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) { 449 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++; 450 ci->i_truncate_pending++;
@@ -619,11 +620,12 @@ static int fill_inode(struct inode *inode,
619 memcpy(ci->i_xattrs.blob->vec.iov_base, 620 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len); 621 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 622 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
623 xattr_blob = NULL;
622 } 624 }
623 625
624 inode->i_mapping->a_ops = &ceph_aops; 626 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info = 627 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info; 628 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
627 629
628 switch (inode->i_mode & S_IFMT) { 630 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO: 631 case S_IFIFO:
@@ -674,14 +676,16 @@ static int fill_inode(struct inode *inode,
674 /* set dir completion flag? */ 676 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 && 677 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP && 678 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 679 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
680 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
681 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
678 dout(" marking %p complete (empty)\n", inode); 682 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE; 683 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2; 684 ci->i_max_offset = 2;
681 } 685 }
682 686
683 /* it may be better to set st_size in getattr instead? */ 687 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 688 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes; 689 inode->i_size = ci->i_rbytes;
686 break; 690 break;
687 default: 691 default:
@@ -802,6 +806,37 @@ out_unlock:
802} 806}
803 807
804/* 808/*
809 * Set dentry's directory position based on the current dir's max, and
810 * order it in d_subdirs, so that dcache_readdir behaves.
811 */
812static void ceph_set_dentry_offset(struct dentry *dn)
813{
814 struct dentry *dir = dn->d_parent;
815 struct inode *inode = dn->d_parent->d_inode;
816 struct ceph_dentry_info *di;
817
818 BUG_ON(!inode);
819
820 di = ceph_dentry(dn);
821
822 spin_lock(&inode->i_lock);
823 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
824 spin_unlock(&inode->i_lock);
825 return;
826 }
827 di->offset = ceph_inode(inode)->i_max_offset++;
828 spin_unlock(&inode->i_lock);
829
830 spin_lock(&dcache_lock);
831 spin_lock(&dn->d_lock);
832 list_move(&dn->d_u.d_child, &dir->d_subdirs);
833 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
834 dn->d_u.d_child.prev, dn->d_u.d_child.next);
835 spin_unlock(&dn->d_lock);
836 spin_unlock(&dcache_lock);
837}
838
839/*
805 * splice a dentry to an inode. 840 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe. 841 * caller must hold directory i_mutex for this to be safe.
807 * 842 *
@@ -810,17 +845,19 @@ out_unlock:
810 * the caller) if we fail. 845 * the caller) if we fail.
811 */ 846 */
812static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 847static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
813 bool *prehash) 848 bool *prehash, bool set_offset)
814{ 849{
815 struct dentry *realdn; 850 struct dentry *realdn;
816 851
852 BUG_ON(dn->d_inode);
853
817 /* dn must be unhashed */ 854 /* dn must be unhashed */
818 if (!d_unhashed(dn)) 855 if (!d_unhashed(dn))
819 d_drop(dn); 856 d_drop(dn);
820 realdn = d_materialise_unique(dn, in); 857 realdn = d_materialise_unique(dn, in);
821 if (IS_ERR(realdn)) { 858 if (IS_ERR(realdn)) {
822 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n", 859 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
823 dn, in, ceph_vinop(in)); 860 PTR_ERR(realdn), dn, in, ceph_vinop(in));
824 if (prehash) 861 if (prehash)
825 *prehash = false; /* don't rehash on error */ 862 *prehash = false; /* don't rehash on error */
826 dn = realdn; /* note realdn contains the error */ 863 dn = realdn; /* note realdn contains the error */
@@ -835,44 +872,18 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
835 dn = realdn; 872 dn = realdn;
836 } else { 873 } else {
837 BUG_ON(!ceph_dentry(dn)); 874 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n", 875 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 876 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 } 877 }
842 if ((!prehash || *prehash) && d_unhashed(dn)) 878 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn); 879 d_rehash(dn);
880 if (set_offset)
881 ceph_set_dentry_offset(dn);
844out: 882out:
845 return dn; 883 return dn;
846} 884}
847 885
848/* 886/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just 887 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 888 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup). 889 * after a lookup).
@@ -933,14 +944,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 944
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 945 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n"); 946 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) { 947 if (rinfo->head->result == 0 && req->r_locked_dir)
937 struct ceph_inode_info *ci = 948 ceph_invalidate_dir_request(req);
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0; 949 return 0;
945 } 950 }
946 951
@@ -1011,13 +1016,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1011 req->r_old_dentry->d_name.len, 1016 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name, 1017 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name); 1018 dn, dn->d_name.len, dn->d_name.name);
1019
1014 /* ensure target dentry is invalidated, despite 1020 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */ 1021 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies; 1022 ceph_invalidate_dentry_lease(dn);
1017 ceph_dentry(dn)->lease_shared_gen = 0; 1023
1018 /* take overwritten dentry's readdir offset */ 1024 /* take overwritten dentry's readdir offset */
1025 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1026 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1027 ceph_dentry(req->r_old_dentry)->offset);
1019 ceph_dentry(req->r_old_dentry)->offset = 1028 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset; 1029 ceph_dentry(dn)->offset;
1030
1021 dn = req->r_old_dentry; /* use old_dentry */ 1031 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode; 1032 in = dn->d_inode;
1023 } 1033 }
@@ -1053,13 +1063,12 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1053 d_delete(dn); 1063 d_delete(dn);
1054 goto done; 1064 goto done;
1055 } 1065 }
1056 dn = splice_dentry(dn, in, &have_lease); 1066 dn = splice_dentry(dn, in, &have_lease, true);
1057 if (IS_ERR(dn)) { 1067 if (IS_ERR(dn)) {
1058 err = PTR_ERR(dn); 1068 err = PTR_ERR(dn);
1059 goto done; 1069 goto done;
1060 } 1070 }
1061 req->r_dentry = dn; /* may have spliced */ 1071 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in); 1072 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino && 1073 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) { 1074 ceph_snap(in) == vino.snap) {
@@ -1097,12 +1106,11 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1097 goto done; 1106 goto done;
1098 } 1107 }
1099 dout(" linking snapped dir %p to dn %p\n", in, dn); 1108 dout(" linking snapped dir %p to dn %p\n", in, dn);
1100 dn = splice_dentry(dn, in, NULL); 1109 dn = splice_dentry(dn, in, NULL, true);
1101 if (IS_ERR(dn)) { 1110 if (IS_ERR(dn)) {
1102 err = PTR_ERR(dn); 1111 err = PTR_ERR(dn);
1103 goto done; 1112 goto done;
1104 } 1113 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */ 1114 req->r_dentry = dn; /* may have spliced */
1107 igrab(in); 1115 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1116 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1194,8 +1202,10 @@ retry_lookup:
1194 goto out; 1202 goto out;
1195 } 1203 }
1196 err = ceph_init_dentry(dn); 1204 err = ceph_init_dentry(dn);
1197 if (err < 0) 1205 if (err < 0) {
1206 dput(dn);
1198 goto out; 1207 goto out;
1208 }
1199 } else if (dn->d_inode && 1209 } else if (dn->d_inode &&
1200 (ceph_ino(dn->d_inode) != vino.ino || 1210 (ceph_ino(dn->d_inode) != vino.ino ||
1201 ceph_snap(dn->d_inode) != vino.snap)) { 1211 ceph_snap(dn->d_inode) != vino.snap)) {
@@ -1221,26 +1231,31 @@ retry_lookup:
1221 in = dn->d_inode; 1231 in = dn->d_inode;
1222 } else { 1232 } else {
1223 in = ceph_get_inode(parent->d_sb, vino); 1233 in = ceph_get_inode(parent->d_sb, vino);
1224 if (in == NULL) { 1234 if (IS_ERR(in)) {
1225 dout("new_inode badness\n"); 1235 dout("new_inode badness\n");
1226 d_delete(dn); 1236 d_delete(dn);
1227 dput(dn); 1237 dput(dn);
1228 err = -ENOMEM; 1238 err = PTR_ERR(in);
1229 goto out; 1239 goto out;
1230 } 1240 }
1231 dn = splice_dentry(dn, in, NULL); 1241 dn = splice_dentry(dn, in, NULL, false);
1242 if (IS_ERR(dn))
1243 dn = NULL;
1232 } 1244 }
1233 1245
1234 if (fill_inode(in, &rinfo->dir_in[i], NULL, session, 1246 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1235 req->r_request_started, -1, 1247 req->r_request_started, -1,
1236 &req->r_caps_reservation) < 0) { 1248 &req->r_caps_reservation) < 0) {
1237 pr_err("fill_inode badness on %p\n", in); 1249 pr_err("fill_inode badness on %p\n", in);
1238 dput(dn); 1250 goto next_item;
1239 continue;
1240 } 1251 }
1241 update_dentry_lease(dn, rinfo->dir_dlease[i], 1252 if (dn)
1242 req->r_session, req->r_request_started); 1253 update_dentry_lease(dn, rinfo->dir_dlease[i],
1243 dput(dn); 1254 req->r_session,
1255 req->r_request_started);
1256next_item:
1257 if (dn)
1258 dput(dn);
1244 } 1259 }
1245 req->r_did_prepopulate = true; 1260 req->r_did_prepopulate = true;
1246 1261
@@ -1429,7 +1444,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1429{ 1444{
1430 struct ceph_inode_info *ci = ceph_inode(inode); 1445 struct ceph_inode_info *ci = ceph_inode(inode);
1431 1446
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1447 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) { 1448 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode); 1449 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode); 1450 igrab(inode);
@@ -1489,7 +1504,7 @@ retry:
1489 if (wrbuffer_refs == 0) 1504 if (wrbuffer_refs == 0)
1490 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 1505 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1491 if (wake) 1506 if (wake)
1492 wake_up(&ci->i_cap_wq); 1507 wake_up_all(&ci->i_cap_wq);
1493} 1508}
1494 1509
1495 1510
@@ -1518,7 +1533,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1518 struct inode *parent_inode = dentry->d_parent->d_inode; 1533 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid; 1534 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req; 1535 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1536 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1522 int issued; 1537 int issued;
1523 int release = 0, dirtied = 0; 1538 int release = 0, dirtied = 0;
1524 int mask = 0; 1539 int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..76e307d2aba1 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
@@ -143,6 +143,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
143 return 0; 143 return 0;
144} 144}
145 145
146static long ceph_ioctl_lazyio(struct file *file)
147{
148 struct ceph_file_info *fi = file->private_data;
149 struct inode *inode = file->f_dentry->d_inode;
150 struct ceph_inode_info *ci = ceph_inode(inode);
151
152 if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
153 spin_lock(&inode->i_lock);
154 ci->i_nr_by_mode[fi->fmode]--;
155 fi->fmode |= CEPH_FILE_MODE_LAZY;
156 ci->i_nr_by_mode[fi->fmode]++;
157 spin_unlock(&inode->i_lock);
158 dout("ioctl_layzio: file %p marked lazy\n", file);
159
160 ceph_check_caps(ci, 0, NULL);
161 } else {
162 dout("ioctl_layzio: file %p already lazy\n", file);
163 }
164 return 0;
165}
166
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 167long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{ 168{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); 169 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -155,6 +176,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
155 176
156 case CEPH_IOC_GET_DATALOC: 177 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 178 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179
180 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file);
158 } 182 }
159 return -ENOTTY; 183 return -ENOTTY;
160} 184}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 25e4f1a9d059..88451a3b6857 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -37,4 +37,6 @@ struct ceph_ioctl_dataloc {
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ 37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc) 38 struct ceph_ioctl_dataloc)
39 39
40#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
41
40#endif 42#endif
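
CEPH_IOC_LAZYIO is built with _IO(), the ioctl-number macro for commands that carry no argument payload: a direction of "none" and a size of zero are packed into the request word together with the magic byte and command number. A small user-space probe of the encoding — the magic value here is an assumption standing in for the CEPH_IOCTL_MAGIC defined at the top of this header:

#include <stdio.h>
#include <sys/ioctl.h>            /* glibc mirrors the kernel _IO* macros */

#define EXAMPLE_IOCTL_MAGIC 0x97  /* assumed; see CEPH_IOCTL_MAGIC above */
#define EXAMPLE_IOC_LAZYIO _IO(EXAMPLE_IOCTL_MAGIC, 4)

int main(void)
{
        /* dir=none, size=0, type=0x97, nr=4 packed into one request word */
        printf("EXAMPLE_IOC_LAZYIO = %#lx\n",
               (unsigned long)EXAMPLE_IOC_LAZYIO);
        return 0;
}

An application opts in by issuing ioctl(fd, CEPH_IOC_LAZYIO) on an open ceph file, trading strict cache coherence for better shared-writer performance.
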
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
new file mode 100644
index 000000000000..ff4e753aae92
--- /dev/null
+++ b/fs/ceph/locks.c
@@ -0,0 +1,260 @@
1#include "ceph_debug.h"
2
3#include <linux/file.h>
4#include <linux/namei.h>
5
6#include "super.h"
7#include "mds_client.h"
8#include "pagelist.h"
9
10/**
11 * Implement fcntl and flock locking functions.
12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns,
15 int cmd, u64 start, u64 length, u8 wait)
16{
17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req;
21 int err;
22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req))
25 return PTR_ERR(req);
26 req->r_inode = igrab(inode);
27
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd);
31
32 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid);
35 /* This should be adjusted, but I'm not sure if
36 namespaces actually get id numbers */
37 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns);
39 req->r_args.filelock_change.start = cpu_to_le64(start);
40 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait;
42
43 err = ceph_mdsc_do_request(mdsc, inode, req);
44 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err);
48 return err;
49}
50
51/**
52 * Attempt to set an fcntl lock.
53 * For now, this just goes away to the server. Later it may be more awesome.
54 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{
57 u64 length;
58 u8 lock_cmd;
59 int err;
60 u8 wait = 0;
61 u16 op = CEPH_MDS_OP_SETFILELOCK;
62
63 fl->fl_nspid = get_pid(task_tgid(current));
64 dout("ceph_lock, fl_pid:%d", fl->fl_pid);
65
66 /* set wait bit as appropriate, then make command as Ceph expects it */
67 if (F_SETLKW == cmd)
68 wait = 1;
69 if (F_GETLK == cmd)
70 op = CEPH_MDS_OP_GETFILELOCK;
71
72 if (F_RDLCK == fl->fl_type)
73 lock_cmd = CEPH_LOCK_SHARED;
74 else if (F_WRLCK == fl->fl_type)
75 lock_cmd = CEPH_LOCK_EXCL;
76 else
77 lock_cmd = CEPH_LOCK_UNLOCK;
78
79 if (LLONG_MAX == fl->fl_end)
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
87 lock_cmd, fl->fl_start,
88 length, wait);
89 if (!err) {
90 dout("mds locked, locking locally");
91 err = posix_lock_file(file, fl, NULL);
92 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
93 /* undo! This should only happen if the kernel detects
94 * local deadlock. */
95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
96 (u64)fl->fl_pid,
97 (u64)(unsigned long)fl->fl_nspid,
98 CEPH_LOCK_UNLOCK, fl->fl_start,
99 length, 0);
100 dout("got %d on posix_lock_file, undid lock", err);
101 }
102 } else {
103 dout("mds returned error code %d", err);
104 }
105 return err;
106}
107
108int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
109{
110 u64 length;
111 u8 lock_cmd;
112 int err;
113 u8 wait = 1;
114
115 fl->fl_nspid = get_pid(task_tgid(current));
116 dout("ceph_flock, fl_pid:%d", fl->fl_pid);
117
118 /* set wait bit, then clear it out of cmd */
119 if (cmd & LOCK_NB)
120 wait = 0;
121 cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
122 /* set command sequence that Ceph wants to see:
123 shared lock, exclusive lock, or unlock */
124 if (LOCK_SH == cmd)
125 lock_cmd = CEPH_LOCK_SHARED;
126 else if (LOCK_EX == cmd)
127 lock_cmd = CEPH_LOCK_EXCL;
128 else
129 lock_cmd = CEPH_LOCK_UNLOCK;
130 /* mds requires start and length rather than start and end */
131 if (LLONG_MAX == fl->fl_end)
132 length = 0;
133 else
134 length = fl->fl_end - fl->fl_start + 1;
135
136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
137 file, (u64)fl->fl_pid,
138 (u64)(unsigned long)fl->fl_nspid,
139 lock_cmd, fl->fl_start,
140 length, wait);
141 if (!err) {
142 err = flock_lock_file_wait(file, fl);
143 if (err) {
144 ceph_lock_message(CEPH_LOCK_FLOCK,
145 CEPH_MDS_OP_SETFILELOCK,
146 file, (u64)fl->fl_pid,
147 (u64)(unsigned long)fl->fl_nspid,
148 CEPH_LOCK_UNLOCK, fl->fl_start,
149 length, 0);
150 dout("got %d on flock_lock_file_wait, undid lock", err);
151 }
152 } else {
153 dout("mds error code %d", err);
154 }
155 return err;
156}
157
158/**
159 * Must be called with BKL already held. Fills in the passed
160 * counter variables, so you can prepare pagelist metadata before calling
161 * ceph_encode_locks.
162 */
163void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
164{
165 struct file_lock *lock;
166
167 *fcntl_count = 0;
168 *flock_count = 0;
169
170 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
171 if (lock->fl_flags & FL_POSIX)
172 ++(*fcntl_count);
173 else if (lock->fl_flags & FL_FLOCK)
174 ++(*flock_count);
175 }
176 dout("counted %d flock locks and %d fcntl locks",
177 *flock_count, *fcntl_count);
178}
179
180/**
181 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks.
184 * Must be called with BKL already held, and the lock numbers should have
185 * been gathered under the same lock holding window.
186 */
187int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
188 int num_fcntl_locks, int num_flock_locks)
189{
190 struct file_lock *lock;
191 struct ceph_filelock cephlock;
192 int err = 0;
193
194 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
195 num_fcntl_locks);
196 err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
197 if (err)
198 goto fail;
199 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
200 if (lock->fl_flags & FL_POSIX) {
201 err = lock_to_ceph_filelock(lock, &cephlock);
202 if (err)
203 goto fail;
204 err = ceph_pagelist_append(pagelist, &cephlock,
205 sizeof(struct ceph_filelock));
206 }
207 if (err)
208 goto fail;
209 }
210
211 err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
212 if (err)
213 goto fail;
214 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
215 if (lock->fl_flags & FL_FLOCK) {
216 err = lock_to_ceph_filelock(lock, &cephlock);
217 if (err)
218 goto fail;
219 err = ceph_pagelist_append(pagelist, &cephlock,
220 sizeof(struct ceph_filelock));
221 }
222 if (err)
223 goto fail;
224 }
225fail:
226 return err;
227}
228
229/*
230 * Given a pointer to a lock, convert it to a ceph filelock
231 */
232int lock_to_ceph_filelock(struct file_lock *lock,
233 struct ceph_filelock *cephlock)
234{
235 int err = 0;
236
237 cephlock->start = cpu_to_le64(lock->fl_start);
238 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
239 cephlock->client = cpu_to_le64(0);
240 cephlock->pid = cpu_to_le64(lock->fl_pid);
241 cephlock->pid_namespace =
242 cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
243
244 switch (lock->fl_type) {
245 case F_RDLCK:
246 cephlock->type = CEPH_LOCK_SHARED;
247 break;
248 case F_WRLCK:
249 cephlock->type = CEPH_LOCK_EXCL;
250 break;
251 case F_UNLCK:
252 cephlock->type = CEPH_LOCK_UNLOCK;
253 break;
254 default:
255 dout("Have unknown lock type %d", lock->fl_type);
256 err = -EINVAL;
257 }
258
259 return err;
260}
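
ceph_lock() and ceph_flock() share one subtle translation: the VFS describes a lock as an inclusive byte range [fl_start, fl_end], with fl_end == LLONG_MAX meaning "through end of file", while the MDS wire format carries start plus length and reserves length == 0 for the to-EOF case. Restated as a standalone helper:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* convert an inclusive VFS byte range to the MDS start+length form */
static uint64_t range_to_length(long long fl_start, long long fl_end)
{
        if (fl_end == LLONG_MAX)
                return 0;                         /* to-EOF lock */
        return (uint64_t)(fl_end - fl_start + 1); /* inclusive range */
}

int main(void)
{
        printf("%llu\n", (unsigned long long)range_to_length(0, 4095));
        printf("%llu\n", (unsigned long long)range_to_length(10, LLONG_MAX));
        return 0;
}
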
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..fad95f8f2608 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
3#include <linux/wait.h> 3#include <linux/wait.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/smp_lock.h>
6 7
7#include "mds_client.h" 8#include "mds_client.h"
8#include "mon_client.h" 9#include "mon_client.h"
@@ -37,10 +38,15 @@
37 * are no longer valid. 38 * are no longer valid.
38 */ 39 */
39 40
41struct ceph_reconnect_state {
42 struct ceph_pagelist *pagelist;
43 bool flock;
44};
45
40static void __wake_requests(struct ceph_mds_client *mdsc, 46static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 47 struct list_head *head);
42 48
43const static struct ceph_connection_operations mds_con_ops; 49static const struct ceph_connection_operations mds_con_ops;
44 50
45 51
46/* 52/*
@@ -449,7 +455,7 @@ void ceph_mdsc_release_request(struct kref *kref)
449 kfree(req->r_path1); 455 kfree(req->r_path1);
450 kfree(req->r_path2); 456 kfree(req->r_path2);
451 put_request_session(req); 457 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation); 458 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
453 kfree(req); 459 kfree(req);
454} 460}
455 461
@@ -512,7 +518,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
512{ 518{
513 req->r_tid = ++mdsc->last_tid; 519 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps) 520 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); 521 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
522 req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid); 523 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req); 524 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req); 525 __insert_request(mdsc, req);
@@ -553,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
553 * 560 *
554 * Called under mdsc->mutex. 561 * Called under mdsc->mutex.
555 */ 562 */
563struct dentry *get_nonsnap_parent(struct dentry *dentry)
564{
565 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
566 dentry = dentry->d_parent;
567 return dentry;
568}
569
556static int __choose_mds(struct ceph_mds_client *mdsc, 570static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req) 571 struct ceph_mds_request *req)
558{ 572{
@@ -583,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
583 if (req->r_inode) { 597 if (req->r_inode) {
584 inode = req->r_inode; 598 inode = req->r_inode;
585 } else if (req->r_dentry) { 599 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) { 600 struct inode *dir = req->r_dentry->d_parent->d_inode;
601
602 if (dir->i_sb != mdsc->client->sb) {
603 /* not this fs! */
604 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
606 /* direct snapped/virtual snapdir requests
607 * based on parent dir inode */
608 struct dentry *dn =
609 get_nonsnap_parent(req->r_dentry->d_parent);
610 inode = dn->d_inode;
611 dout("__choose_mds using nonsnap parent %p\n", inode);
612 } else if (req->r_dentry->d_inode) {
613 /* dentry target */
587 inode = req->r_dentry->d_inode; 614 inode = req->r_dentry->d_inode;
588 } else { 615 } else {
589 inode = req->r_dentry->d_parent->d_inode; 616 /* dir + name */
617 inode = dir;
590 hash = req->r_dentry->d_name.hash; 618 hash = req->r_dentry->d_name.hash;
591 is_hash = true; 619 is_hash = true;
592 } 620 }
593 } 621 }
622
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, 623 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode); 624 (int)hash, mode);
596 if (!inode) 625 if (!inode)
@@ -665,10 +694,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
665 struct ceph_msg *msg; 694 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h; 695 struct ceph_mds_session_head *h;
667 696
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 697 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
669 if (IS_ERR(msg)) { 698 if (!msg) {
670 pr_err("create_session_msg ENOMEM creating msg\n"); 699 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg)); 700 return NULL;
672 } 701 }
673 h = msg->front.iov_base; 702 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op); 703 h->op = cpu_to_le32(op);
@@ -687,7 +716,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
687 struct ceph_msg *msg; 716 struct ceph_msg *msg;
688 int mstate; 717 int mstate;
689 int mds = session->s_mds; 718 int mds = session->s_mds;
690 int err = 0;
691 719
692 /* wait for mds to go active? */ 720 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 721 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,17 +726,58 @@ static int __open_session(struct ceph_mds_client *mdsc,
698 726
699 /* send connect message */ 727 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 728 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) { 729 if (!msg)
702 err = PTR_ERR(msg); 730 return -ENOMEM;
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg); 731 ceph_con_send(&session->s_con, msg);
706
707out:
708 return 0; 732 return 0;
709} 733}
710 734
711/* 735/*
736 * open sessions for any export targets for the given mds
737 *
738 * called under mdsc->mutex
739 */
740static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
741 struct ceph_mds_session *session)
742{
743 struct ceph_mds_info *mi;
744 struct ceph_mds_session *ts;
745 int i, mds = session->s_mds;
746 int target;
747
748 if (mds >= mdsc->mdsmap->m_max_mds)
749 return;
750 mi = &mdsc->mdsmap->m_info[mds];
751 dout("open_export_target_sessions for mds%d (%d targets)\n",
752 session->s_mds, mi->num_export_targets);
753
754 for (i = 0; i < mi->num_export_targets; i++) {
755 target = mi->export_targets[i];
756 ts = __ceph_lookup_mds_session(mdsc, target);
757 if (!ts) {
758 ts = register_session(mdsc, target);
759 if (IS_ERR(ts))
760 return;
761 }
762 if (session->s_state == CEPH_MDS_SESSION_NEW ||
763 session->s_state == CEPH_MDS_SESSION_CLOSING)
764 __open_session(mdsc, session);
765 else
766 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
767 i, ts, session_state_name(ts->s_state));
768 ceph_put_mds_session(ts);
769 }
770}
771
772void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
773 struct ceph_mds_session *session)
774{
775 mutex_lock(&mdsc->mutex);
776 __open_export_target_sessions(mdsc, session);
777 mutex_unlock(&mdsc->mutex);
778}
779
780/*
712 * session caps 781 * session caps
713 */ 782 */
714 783
@@ -769,7 +838,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
769 last_inode = NULL; 838 last_inode = NULL;
770 } 839 }
771 if (old_cap) { 840 if (old_cap) {
772 ceph_put_cap(old_cap); 841 ceph_put_cap(session->s_mdsc, old_cap);
773 old_cap = NULL; 842 old_cap = NULL;
774 } 843 }
775 844
@@ -798,18 +867,55 @@ out:
798 if (last_inode) 867 if (last_inode)
799 iput(last_inode); 868 iput(last_inode);
800 if (old_cap) 869 if (old_cap)
801 ceph_put_cap(old_cap); 870 ceph_put_cap(session->s_mdsc, old_cap);
802 871
803 return ret; 872 return ret;
804} 873}
805 874
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 875static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg) 876 void *arg)
808{ 877{
809 struct ceph_inode_info *ci = ceph_inode(inode); 878 struct ceph_inode_info *ci = ceph_inode(inode);
879 int drop = 0;
880
810 dout("removing cap %p, ci is %p, inode is %p\n", 881 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode); 882 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap); 883 spin_lock(&inode->i_lock);
884 __ceph_remove_cap(cap);
885 if (!__ceph_is_any_real_caps(ci)) {
886 struct ceph_mds_client *mdsc =
887 &ceph_sb_to_client(inode->i_sb)->mdsc;
888
889 spin_lock(&mdsc->cap_dirty_lock);
890 if (!list_empty(&ci->i_dirty_item)) {
891 pr_info(" dropping dirty %s state for %p %lld\n",
892 ceph_cap_string(ci->i_dirty_caps),
893 inode, ceph_ino(inode));
894 ci->i_dirty_caps = 0;
895 list_del_init(&ci->i_dirty_item);
896 drop = 1;
897 }
898 if (!list_empty(&ci->i_flushing_item)) {
899 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
900 ceph_cap_string(ci->i_flushing_caps),
901 inode, ceph_ino(inode));
902 ci->i_flushing_caps = 0;
903 list_del_init(&ci->i_flushing_item);
904 mdsc->num_cap_flushing--;
905 drop = 1;
906 }
907 if (drop && ci->i_wrbuffer_ref) {
908 pr_info(" dropping dirty data for %p %lld\n",
909 inode, ceph_ino(inode));
910 ci->i_wrbuffer_ref = 0;
911 ci->i_wrbuffer_ref_head = 0;
912 drop++;
913 }
914 spin_unlock(&mdsc->cap_dirty_lock);
915 }
916 spin_unlock(&inode->i_lock);
917 while (drop--)
918 iput(inode);
813 return 0; 919 return 0;
814} 920}
815 921
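
The rewritten remove_session_caps_cb() leans on a common teardown idiom: each piece of per-inode state it clears (the dirty item, the flushing item, the dirty-page accounting) pinned one inode reference, so the callback tallies the debts in drop while holding the spinlocks and pays them with iput() only after unlocking, because iput() may sleep. The shape of it, reduced to a sketch:

#include <stdio.h>

struct obj { int refs; };

static void put_ref(struct obj *o) { o->refs--; }

int main(void)
{
        struct obj inode = { .refs = 3 };  /* caller + two list memberships */
        int drop = 0;

        /* under the lock: unlink from two lists, remember what is owed */
        drop++;
        drop++;

        /* after unlock: release where sleeping is allowed */
        while (drop--)
                put_ref(&inode);

        printf("refs left = %d\n", inode.refs);    /* 1 */
        return 0;
}
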
@@ -821,6 +927,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
821 dout("remove_session_caps on %p\n", session); 927 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL); 928 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0); 929 BUG_ON(session->s_nr_caps > 0);
930 BUG_ON(!list_empty(&session->s_cap_flushing));
824 cleanup_cap_releases(session); 931 cleanup_cap_releases(session);
825} 932}
826 933
@@ -835,7 +942,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
835{ 942{
836 struct ceph_inode_info *ci = ceph_inode(inode); 943 struct ceph_inode_info *ci = ceph_inode(inode);
837 944
838 wake_up(&ci->i_cap_wq); 945 wake_up_all(&ci->i_cap_wq);
839 if (arg) { 946 if (arg) {
840 spin_lock(&inode->i_lock); 947 spin_lock(&inode->i_lock);
841 ci->i_wanted_max_size = 0; 948 ci->i_wanted_max_size = 0;
@@ -883,8 +990,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
883 ceph_mds_state_name(state)); 990 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 991 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq); 992 ++session->s_renew_seq);
886 if (IS_ERR(msg)) 993 if (!msg)
887 return PTR_ERR(msg); 994 return -ENOMEM;
888 ceph_con_send(&session->s_con, msg); 995 ceph_con_send(&session->s_con, msg);
889 return 0; 996 return 0;
890} 997}
@@ -931,17 +1038,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session) 1038 struct ceph_mds_session *session)
932{ 1039{
933 struct ceph_msg *msg; 1040 struct ceph_msg *msg;
934 int err = 0;
935 1041
936 dout("request_close_session mds%d state %s seq %lld\n", 1042 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state), 1043 session->s_mds, session_state_name(session->s_state),
938 session->s_seq); 1044 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 1045 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg)) 1046 if (!msg)
941 err = PTR_ERR(msg); 1047 return -ENOMEM;
942 else 1048 ceph_con_send(&session->s_con, msg);
943 ceph_con_send(&session->s_con, msg); 1049 return 0;
944 return err;
945} 1050}
946 1051
947/* 1052/*
@@ -1035,16 +1140,17 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1035 * 1140 *
1036 * Called under s_mutex. 1141 * Called under s_mutex.
1037 */ 1142 */
1038static int add_cap_releases(struct ceph_mds_client *mdsc, 1143int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1039 struct ceph_mds_session *session, 1144 struct ceph_mds_session *session)
1040 int extra)
1041{ 1145{
1042 struct ceph_msg *msg; 1146 struct ceph_msg *msg, *partial = NULL;
1043 struct ceph_mds_cap_release *head; 1147 struct ceph_mds_cap_release *head;
1044 int err = -ENOMEM; 1148 int err = -ENOMEM;
1149 int extra = mdsc->client->mount_args->cap_release_safety;
1150 int num;
1045 1151
1046 if (extra < 0) 1152 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
1047 extra = mdsc->client->mount_args->cap_release_safety; 1153 extra);
1048 1154
1049 spin_lock(&session->s_cap_lock); 1155 spin_lock(&session->s_cap_lock);
1050 1156
@@ -1053,13 +1159,18 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1053 struct ceph_msg, 1159 struct ceph_msg,
1054 list_head); 1160 list_head);
1055 head = msg->front.iov_base; 1161 head = msg->front.iov_base;
1056 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); 1162 num = le32_to_cpu(head->num);
1163 if (num) {
1164 dout(" partial %p with (%d/%d)\n", msg, num,
1165 (int)CEPH_CAPS_PER_RELEASE);
1166 extra += CEPH_CAPS_PER_RELEASE - num;
1167 partial = msg;
1168 }
1057 } 1169 }
1058
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1170 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock); 1171 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1172 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL); 1173 GFP_NOFS);
1063 if (!msg) 1174 if (!msg)
1064 goto out_unlocked; 1175 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1176 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1072,19 +1183,14 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1072 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; 1183 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1073 } 1184 }
1074 1185
1075 if (!list_empty(&session->s_cap_releases)) { 1186 if (partial) {
1076 msg = list_first_entry(&session->s_cap_releases, 1187 head = partial->front.iov_base;
1077 struct ceph_msg, 1188 num = le32_to_cpu(head->num);
1078 list_head); 1189 dout(" queueing partial %p with %d/%d\n", partial, num,
1079 head = msg->front.iov_base; 1190 (int)CEPH_CAPS_PER_RELEASE);
1080 if (head->num) { 1191 list_move_tail(&partial->list_head,
1081 dout(" queueing non-full %p (%d)\n", msg, 1192 &session->s_cap_releases_done);
1082 le32_to_cpu(head->num)); 1193 session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
1083 list_move_tail(&msg->list_head,
1084 &session->s_cap_releases_done);
1085 session->s_num_cap_releases -=
1086 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1087 }
1088 } 1194 }
1089 err = 0; 1195 err = 0;
1090 spin_unlock(&session->s_cap_lock); 1196 spin_unlock(&session->s_cap_lock);
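
The reworked ceph_add_cap_releases() drops its extra parameter and instead keeps an invariant: at least s_nr_caps plus the cap_release_safety mount option worth of release slots stay queued, topped up in whole messages of CEPH_CAPS_PER_RELEASE entries, and a partially filled message is moved to the done list only after its remaining slack is counted. The refill arithmetic, restated with an illustrative batch size:

#include <stdio.h>

#define CAPS_PER_RELEASE 512    /* illustrative; the real constant is
                                 * derived from the message page size */

static int messages_needed(int num_slots, int nr_caps, int extra)
{
        int msgs = 0;

        while (num_slots < nr_caps + extra) {
                num_slots += CAPS_PER_RELEASE;    /* one more message */
                msgs++;
        }
        return msgs;
}

int main(void)
{
        printf("%d\n", messages_needed(100, 900, 64));    /* 2 */
        return 0;
}
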
@@ -1145,16 +1251,14 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1145/* 1251/*
1146 * called under s_mutex 1252 * called under s_mutex
1147 */ 1253 */
1148static void send_cap_releases(struct ceph_mds_client *mdsc, 1254void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
1149 struct ceph_mds_session *session) 1255 struct ceph_mds_session *session)
1150{ 1256{
1151 struct ceph_msg *msg; 1257 struct ceph_msg *msg;
1152 1258
1153 dout("send_cap_releases mds%d\n", session->s_mds); 1259 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) { 1260 spin_lock(&session->s_cap_lock);
1155 spin_lock(&session->s_cap_lock); 1261 while (!list_empty(&session->s_cap_releases_done)) {
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done, 1262 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head); 1263 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head); 1264 list_del_init(&msg->list_head);
@@ -1162,7 +1266,46 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1266 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1267 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg); 1268 ceph_con_send(&session->s_con, msg);
1269 spin_lock(&session->s_cap_lock);
1270 }
1271 spin_unlock(&session->s_cap_lock);
1272}
1273
1274static void discard_cap_releases(struct ceph_mds_client *mdsc,
1275 struct ceph_mds_session *session)
1276{
1277 struct ceph_msg *msg;
1278 struct ceph_mds_cap_release *head;
1279 unsigned num;
1280
1281 dout("discard_cap_releases mds%d\n", session->s_mds);
1282 spin_lock(&session->s_cap_lock);
1283
1284 /* zero out the in-progress message */
1285 msg = list_first_entry(&session->s_cap_releases,
1286 struct ceph_msg, list_head);
1287 head = msg->front.iov_base;
1288 num = le32_to_cpu(head->num);
1289 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1290 head->num = cpu_to_le32(0);
1291 session->s_num_cap_releases += num;
1292
1293 /* requeue completed messages */
1294 while (!list_empty(&session->s_cap_releases_done)) {
1295 msg = list_first_entry(&session->s_cap_releases_done,
1296 struct ceph_msg, list_head);
1297 list_del_init(&msg->list_head);
1298
1299 head = msg->front.iov_base;
1300 num = le32_to_cpu(head->num);
1301 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1302 num);
1303 session->s_num_cap_releases += num;
1304 head->num = cpu_to_le32(0);
1305 msg->front.iov_len = sizeof(*head);
1306 list_add(&msg->list_head, &session->s_cap_releases);
1165 } 1307 }
1308
1166 spin_unlock(&session->s_cap_lock); 1309 spin_unlock(&session->s_cap_lock);
1167} 1310}
1168 1311
@@ -1181,6 +1324,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1181 if (!req) 1324 if (!req)
1182 return ERR_PTR(-ENOMEM); 1325 return ERR_PTR(-ENOMEM);
1183 1326
1327 mutex_init(&req->r_fill_mutex);
1328 req->r_mdsc = mdsc;
1184 req->r_started = jiffies; 1329 req->r_started = jiffies;
1185 req->r_resend_mds = -1; 1330 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1331 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1396,7 @@ retry:
1251 len += 1 + temp->d_name.len; 1396 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent; 1397 temp = temp->d_parent;
1253 if (temp == NULL) { 1398 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1399 pr_err("build_path corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL); 1400 return ERR_PTR(-EINVAL);
1256 } 1401 }
1257 } 1402 }
@@ -1267,7 +1412,7 @@ retry:
1267 struct inode *inode = temp->d_inode; 1412 struct inode *inode = temp->d_inode;
1268 1413
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1414 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1415 dout("build_path path+%d: %p SNAPDIR\n",
1271 pos, temp); 1416 pos, temp);
1272 } else if (stop_on_nosnap && inode && 1417 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) { 1418 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1423,18 @@ retry:
1278 break; 1423 break;
1279 strncpy(path + pos, temp->d_name.name, 1424 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len); 1425 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 } 1426 }
1284 if (pos) 1427 if (pos)
1285 path[--pos] = '/'; 1428 path[--pos] = '/';
1286 temp = temp->d_parent; 1429 temp = temp->d_parent;
1287 if (temp == NULL) { 1430 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n"); 1431 pr_err("build_path corrupt dentry\n");
1289 kfree(path); 1432 kfree(path);
1290 return ERR_PTR(-EINVAL); 1433 return ERR_PTR(-EINVAL);
1291 } 1434 }
1292 } 1435 }
1293 if (pos != 0) { 1436 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where " 1437 pr_err("build_path did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos); 1438 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a 1439 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not 1440 rename of one of the parent directories (we can not
@@ -1303,7 +1446,7 @@ retry:
1303 1446
1304 *base = ceph_ino(temp->d_inode); 1447 *base = ceph_ino(temp->d_inode);
1305 *plen = len; 1448 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1449 dout("build_path on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path); 1450 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path; 1451 return path;
1309} 1452}
@@ -1426,9 +1569,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1426 if (req->r_old_dentry_drop) 1569 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len; 1570 len += req->r_old_dentry->d_name.len;
1428 1571
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1572 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1430 if (IS_ERR(msg)) 1573 if (!msg) {
1574 msg = ERR_PTR(-ENOMEM);
1431 goto out_free2; 1575 goto out_free2;
1576 }
1432 1577
1433 msg->hdr.tid = cpu_to_le64(req->r_tid); 1578 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434 1579
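Note the changed allocation convention here: ceph_msg_new() now takes a gfp mask and returns NULL on failure instead of an ERR_PTR, so callers test with !msg and supply their own errno; GFP_NOFS keeps the allocator from reentering the filesystem while fs locks are held. The resulting pattern, as used above:

        msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
        if (!msg)
                return ERR_PTR(-ENOMEM);        /* allocation failed */
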
@@ -1445,6 +1590,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1445 ceph_encode_filepath(&p, end, ino1, path1); 1590 ceph_encode_filepath(&p, end, ino1, path1);
1446 ceph_encode_filepath(&p, end, ino2, path2); 1591 ceph_encode_filepath(&p, end, ino2, path2);
1447 1592
1593 /* make note of release offset, in case we need to replay */
1594 req->r_request_release_offset = p - msg->front.iov_base;
1595
1448 /* cap releases */ 1596 /* cap releases */
1449 releases = 0; 1597 releases = 0;
1450 if (req->r_inode_drop) 1598 if (req->r_inode_drop)
@@ -1492,7 +1640,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
1492 if (req->r_callback) 1640 if (req->r_callback)
1493 req->r_callback(mdsc, req); 1641 req->r_callback(mdsc, req);
1494 else 1642 else
1495 complete(&req->r_completion); 1643 complete_all(&req->r_completion);
1496} 1644}
1497 1645
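complete() releases at most one waiter on a struct completion; complete_all() releases every waiter, current and future. Since more than one thread can be blocked on r_completion (and on r_safe_completion further down), the switch to complete_all() keeps later waiters from being stranded. The distinction, per <linux/completion.h>:

        complete(&done);        /* exactly one wait_for_completion() proceeds */
        complete_all(&done);    /* all waiters, present and future, proceed */
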
1498/* 1646/*
@@ -1508,18 +1656,53 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1508 1656
1509 req->r_mds = mds; 1657 req->r_mds = mds;
1510 req->r_attempts++; 1658 req->r_attempts++;
1659 if (req->r_inode) {
1660 struct ceph_cap *cap =
1661 ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
1662
1663 if (cap)
1664 req->r_sent_on_mseq = cap->mseq;
1665 else
1666 req->r_sent_on_mseq = -1;
1667 }
1511 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, 1668 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1512 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1669 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1513 1670
1671 if (req->r_got_unsafe) {
1672 /*
1673 * Replay. Do not regenerate message (and rebuild
1674 * paths, etc.); just use the original message.
1675 * Rebuilding paths will break for renames because
1676 * d_move mangles the src name.
1677 */
1678 msg = req->r_request;
1679 rhead = msg->front.iov_base;
1680
1681 flags = le32_to_cpu(rhead->flags);
1682 flags |= CEPH_MDS_FLAG_REPLAY;
1683 rhead->flags = cpu_to_le32(flags);
1684
1685 if (req->r_target_inode)
1686 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1687
1688 rhead->num_retry = req->r_attempts - 1;
1689
1690 /* remove cap/dentry releases from message */
1691 rhead->num_releases = 0;
1692 msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
1693 msg->front.iov_len = req->r_request_release_offset;
1694 return 0;
1695 }
1696
1514 if (req->r_request) { 1697 if (req->r_request) {
1515 ceph_msg_put(req->r_request); 1698 ceph_msg_put(req->r_request);
1516 req->r_request = NULL; 1699 req->r_request = NULL;
1517 } 1700 }
1518 msg = create_request_message(mdsc, req, mds); 1701 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) { 1702 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1703 req->r_err = PTR_ERR(msg);
1521 complete_request(mdsc, req); 1704 complete_request(mdsc, req);
1522 return -PTR_ERR(msg); 1705 return PTR_ERR(msg);
1523 } 1706 }
1524 req->r_request = msg; 1707 req->r_request = msg;
1525 1708
@@ -1532,13 +1715,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1532 rhead->flags = cpu_to_le32(flags); 1715 rhead->flags = cpu_to_le32(flags);
1533 rhead->num_fwd = req->r_num_fwd; 1716 rhead->num_fwd = req->r_num_fwd;
1534 rhead->num_retry = req->r_attempts - 1; 1717 rhead->num_retry = req->r_attempts - 1;
1718 rhead->ino = 0;
1535 1719
1536 dout(" r_locked_dir = %p\n", req->r_locked_dir); 1720 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1537
1538 if (req->r_target_inode && req->r_got_unsafe)
1539 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1540 else
1541 rhead->ino = 0;
1542 return 0; 1721 return 0;
1543} 1722}
1544 1723
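This is where the r_request_release_offset saved earlier in create_request_message() pays off: everything past the encoded filepaths is cap/dentry releases, so a replayed request can simply be truncated back to that point instead of being re-encoded (which, per the comment above, would break for renames):

        msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
        msg->front.iov_len = req->r_request_release_offset;
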
@@ -1552,7 +1731,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1552 int mds = -1; 1731 int mds = -1;
1553 int err = -EAGAIN; 1732 int err = -EAGAIN;
1554 1733
1555 if (req->r_reply) 1734 if (req->r_err || req->r_got_result)
1556 goto out; 1735 goto out;
1557 1736
1558 if (req->r_timeout && 1737 if (req->r_timeout &&
@@ -1609,7 +1788,7 @@ out:
1609 return err; 1788 return err;
1610 1789
1611finish: 1790finish:
1612 req->r_reply = ERR_PTR(err); 1791 req->r_err = err;
1613 complete_request(mdsc, req); 1792 complete_request(mdsc, req);
1614 goto out; 1793 goto out;
1615} 1794}
@@ -1630,10 +1809,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1630 1809
1631/* 1810/*
1632 * Wake up threads with requests pending for @mds, so that they can 1811 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set, 1812 * resubmit their requests to a possibly different mds.
1634 * wake up if their requests has been forwarded to @mds, too.
1635 */ 1813 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1814static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1637{ 1815{
1638 struct ceph_mds_request *req; 1816 struct ceph_mds_request *req;
1639 struct rb_node *p; 1817 struct rb_node *p;
@@ -1689,64 +1867,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1689 __register_request(mdsc, req, dir); 1867 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req); 1868 __do_request(mdsc, req);
1691 1869
1692 /* wait */ 1870 if (req->r_err) {
1693 if (!req->r_reply) { 1871 err = req->r_err;
1694 mutex_unlock(&mdsc->mutex); 1872 __unregister_request(mdsc, req);
1695 if (req->r_timeout) { 1873 dout("do_request early error %d\n", err);
1696 err = (long)wait_for_completion_interruptible_timeout( 1874 goto out;
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 } 1875 }
1710 1876
1711 if (IS_ERR(req->r_reply)) { 1877 /* wait */
1712 err = PTR_ERR(req->r_reply); 1878 mutex_unlock(&mdsc->mutex);
1713 req->r_reply = NULL; 1879 dout("do_request waiting\n");
1880 if (req->r_timeout) {
1881 err = (long)wait_for_completion_killable_timeout(
1882 &req->r_completion, req->r_timeout);
1883 if (err == 0)
1884 err = -EIO;
1885 } else {
1886 err = wait_for_completion_killable(&req->r_completion);
1887 }
1888 dout("do_request waited, got %d\n", err);
1889 mutex_lock(&mdsc->mutex);
1714 1890
1715 if (err == -ERESTARTSYS) { 1891 /* only abort if we didn't race with a real reply */
1716 /* aborted */ 1892 if (req->r_got_result) {
1717 req->r_aborted = true; 1893 err = le32_to_cpu(req->r_reply_info.head->result);
1894 } else if (err < 0) {
1895 dout("aborted request %lld with %d\n", req->r_tid, err);
1718 1896
1719 if (req->r_locked_dir && 1897 /*
1720 (req->r_op & CEPH_MDS_OP_WRITE)) { 1898 * ensure we aren't running concurrently with
1721 struct ceph_inode_info *ci = 1899 * ceph_fill_trace or ceph_readdir_prepopulate, which
1722 ceph_inode(req->r_locked_dir); 1900 * rely on locks (dir mutex) held by our caller.
1901 */
1902 mutex_lock(&req->r_fill_mutex);
1903 req->r_err = err;
1904 req->r_aborted = true;
1905 mutex_unlock(&req->r_fill_mutex);
1723 1906
1724 dout("aborted, clearing I_COMPLETE on %p\n", 1907 if (req->r_locked_dir &&
1725 req->r_locked_dir); 1908 (req->r_op & CEPH_MDS_OP_WRITE))
1726 spin_lock(&req->r_locked_dir->i_lock); 1909 ceph_invalidate_dir_request(req);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else { 1910 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result); 1911 err = req->r_err;
1742 } 1912 }
1743 mutex_unlock(&mdsc->mutex);
1744 1913
1914out:
1915 mutex_unlock(&mdsc->mutex);
1745 dout("do_request %p done, result %d\n", req, err); 1916 dout("do_request %p done, result %d\n", req, err);
1746 return err; 1917 return err;
1747} 1918}
1748 1919
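Two details in the reworked wait path deserve a note. First, the _killable wait variants wake only on fatal signals, so an ordinary caught signal no longer aborts an in-flight MDS request. Second, aborting must be serialized against the reply path: setting r_aborted under r_fill_mutex guarantees that ceph_fill_trace()/ceph_readdir_prepopulate() either observe the abort or run to completion under the caller's dir locks, never half of each:

        mutex_lock(&req->r_fill_mutex);
        req->r_err = err;
        req->r_aborted = true;  /* the fill path runs under this same mutex */
        mutex_unlock(&req->r_fill_mutex);
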
1749/* 1920/*
1921 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1922 * namespace request.
1923 */
1924void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1925{
1926 struct inode *inode = req->r_locked_dir;
1927 struct ceph_inode_info *ci = ceph_inode(inode);
1928
1929 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1930 spin_lock(&inode->i_lock);
1931 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1932 ci->i_release_count++;
1933 spin_unlock(&inode->i_lock);
1934
1935 if (req->r_dentry)
1936 ceph_invalidate_dentry_lease(req->r_dentry);
1937 if (req->r_old_dentry)
1938 ceph_invalidate_dentry_lease(req->r_old_dentry);
1939}
1940
1941/*
1750 * Handle mds reply. 1942 * Handle mds reply.
1751 * 1943 *
1752 * We take the session mutex and parse and process the reply immediately. 1944 * We take the session mutex and parse and process the reply immediately.
@@ -1797,29 +1989,54 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1797 mutex_unlock(&mdsc->mutex); 1989 mutex_unlock(&mdsc->mutex);
1798 goto out; 1990 goto out;
1799 } 1991 }
1992 if (req->r_got_safe && !head->safe) {
1993 pr_warning("got unsafe after safe on %llu from mds%d\n",
1994 tid, mds);
1995 mutex_unlock(&mdsc->mutex);
1996 goto out;
1997 }
1800 1998
1801 result = le32_to_cpu(head->result); 1999 result = le32_to_cpu(head->result);
1802 2000
1803 /* 2001 /*
1804 * Tolerate 2 consecutive ESTALEs from the same mds. 2002 * Handle an ESTALE
1805 * FIXME: we should be looking at the cap migrate_seq. 2003 * if we're not talking to the authority, send to them
2004 * if the authority has changed while we weren't looking,
2005 * send to new authority
2006 * Otherwise we just have to return an ESTALE
1806 */ 2007 */
1807 if (result == -ESTALE) { 2008 if (result == -ESTALE) {
1808 req->r_direct_mode = USE_AUTH_MDS; 2009 dout("got ESTALE on request %llu", req->r_tid);
 1808 req->r_direct_mode = USE_AUTH_MDS; 2009 dout("got ESTALE on request %llu\n", req->r_tid);
1810 if (req->r_num_stale <= 2) { 2011 /* do nothing; not an authority problem */
2012 } else if (req->r_direct_mode != USE_AUTH_MDS) {
 2013 dout("not using auth, setting for that now\n");
2014 req->r_direct_mode = USE_AUTH_MDS;
1811 __do_request(mdsc, req); 2015 __do_request(mdsc, req);
1812 mutex_unlock(&mdsc->mutex); 2016 mutex_unlock(&mdsc->mutex);
1813 goto out; 2017 goto out;
2018 } else {
2019 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2020 struct ceph_cap *cap =
 2021 ceph_get_cap_for_mds(ci, req->r_mds);
2022
 2023 dout("already using auth\n");
2024 if ((!cap || cap != ci->i_auth_cap) ||
2025 (cap->mseq != req->r_sent_on_mseq)) {
 2026 dout("but cap changed, so resending\n");
2027 __do_request(mdsc, req);
2028 mutex_unlock(&mdsc->mutex);
2029 goto out;
2030 }
1814 } 2031 }
 1815 } else { 2032 dout("have to return ESTALE on request %llu\n", req->r_tid);
1816 req->r_num_stale = 0;
1817 } 2033 }
1818 2034
2035
1819 if (head->safe) { 2036 if (head->safe) {
1820 req->r_got_safe = true; 2037 req->r_got_safe = true;
1821 __unregister_request(mdsc, req); 2038 __unregister_request(mdsc, req);
1822 complete(&req->r_safe_completion); 2039 complete_all(&req->r_safe_completion);
1823 2040
1824 if (req->r_got_unsafe) { 2041 if (req->r_got_unsafe) {
1825 /* 2042 /*
@@ -1834,15 +2051,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1834 2051
1835 /* last unsafe request during umount? */ 2052 /* last unsafe request during umount? */
1836 if (mdsc->stopping && !__get_oldest_req(mdsc)) 2053 if (mdsc->stopping && !__get_oldest_req(mdsc))
1837 complete(&mdsc->safe_umount_waiters); 2054 complete_all(&mdsc->safe_umount_waiters);
1838 mutex_unlock(&mdsc->mutex); 2055 mutex_unlock(&mdsc->mutex);
1839 goto out; 2056 goto out;
1840 } 2057 }
1841 } 2058 } else {
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true; 2059 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 2060 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 } 2061 }
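The r_sent_on_mseq recorded in __prepare_send_request() drives the last ESTALE branch above: if the cap's migration sequence advanced after the request went out, the cap may have migrated to another MDS, so one more resend is worth trying before returning ESTALE to the caller. In outline:

        if (!cap || cap != ci->i_auth_cap ||
            cap->mseq != req->r_sent_on_mseq)
                __do_request(mdsc, req);        /* authority may have moved */
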
@@ -1871,23 +2084,32 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1871 } 2084 }
1872 2085
1873 /* insert trace into our cache */ 2086 /* insert trace into our cache */
2087 mutex_lock(&req->r_fill_mutex);
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2088 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) { 2089 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr) 2090 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session); 2091 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation); 2092 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
1879 } 2093 }
2094 mutex_unlock(&req->r_fill_mutex);
1880 2095
1881 up_read(&mdsc->snap_rwsem); 2096 up_read(&mdsc->snap_rwsem);
1882out_err: 2097out_err:
1883 if (err) { 2098 mutex_lock(&mdsc->mutex);
1884 req->r_err = err; 2099 if (!req->r_aborted) {
2100 if (err) {
2101 req->r_err = err;
2102 } else {
2103 req->r_reply = msg;
2104 ceph_msg_get(msg);
2105 req->r_got_result = true;
2106 }
1885 } else { 2107 } else {
1886 req->r_reply = msg; 2108 dout("reply arrived after request %lld was aborted\n", tid);
1887 ceph_msg_get(msg);
1888 } 2109 }
2110 mutex_unlock(&mdsc->mutex);
1889 2111
1890 add_cap_releases(mdsc, req->r_session, -1); 2112 ceph_add_cap_releases(mdsc, req->r_session);
1891 mutex_unlock(&session->s_mutex); 2113 mutex_unlock(&session->s_mutex);
1892 2114
1893 /* kick calling process */ 2115 /* kick calling process */
@@ -1921,16 +2143,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
1921 mutex_lock(&mdsc->mutex); 2143 mutex_lock(&mdsc->mutex);
1922 req = __lookup_request(mdsc, tid); 2144 req = __lookup_request(mdsc, tid);
1923 if (!req) { 2145 if (!req) {
1924 dout("forward %llu to mds%d - req dne\n", tid, next_mds); 2146 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
1925 goto out; /* dup reply? */ 2147 goto out; /* dup reply? */
1926 } 2148 }
1927 2149
1928 if (fwd_seq <= req->r_num_fwd) { 2150 if (req->r_aborted) {
1929 dout("forward %llu to mds%d - old seq %d <= %d\n", 2151 dout("forward tid %llu aborted, unregistering\n", tid);
2152 __unregister_request(mdsc, req);
2153 } else if (fwd_seq <= req->r_num_fwd) {
2154 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
1930 tid, next_mds, req->r_num_fwd, fwd_seq); 2155 tid, next_mds, req->r_num_fwd, fwd_seq);
1931 } else { 2156 } else {
1932 /* resend. forward race not possible; mds would drop */ 2157 /* resend. forward race not possible; mds would drop */
1933 dout("forward %llu to mds%d (we resend)\n", tid, next_mds); 2158 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2159 BUG_ON(req->r_err);
2160 BUG_ON(req->r_got_result);
1934 req->r_num_fwd = fwd_seq; 2161 req->r_num_fwd = fwd_seq;
1935 req->r_resend_mds = next_mds; 2162 req->r_resend_mds = next_mds;
1936 put_request_session(req); 2163 put_request_session(req);
@@ -1984,6 +2211,8 @@ static void handle_session(struct ceph_mds_session *session,
1984 2211
1985 switch (op) { 2212 switch (op) {
1986 case CEPH_SESSION_OPEN: 2213 case CEPH_SESSION_OPEN:
2214 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2215 pr_info("mds%d reconnect success\n", session->s_mds);
1987 session->s_state = CEPH_MDS_SESSION_OPEN; 2216 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0); 2217 renewed_caps(mdsc, session, 0);
1989 wake = 1; 2218 wake = 1;
@@ -1997,10 +2226,12 @@ static void handle_session(struct ceph_mds_session *session,
1997 break; 2226 break;
1998 2227
1999 case CEPH_SESSION_CLOSE: 2228 case CEPH_SESSION_CLOSE:
2229 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2230 pr_info("mds%d reconnect denied\n", session->s_mds);
2000 remove_session_caps(session); 2231 remove_session_caps(session);
2001 wake = 1; /* for good measure */ 2232 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters); 2233 wake_up_all(&mdsc->session_close_wq);
2003 kick_requests(mdsc, mds, 0); /* cur only */ 2234 kick_requests(mdsc, mds);
2004 break; 2235 break;
2005 2236
2006 case CEPH_SESSION_STALE: 2237 case CEPH_SESSION_STALE:
@@ -2066,9 +2297,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2066static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, 2297static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2067 void *arg) 2298 void *arg)
2068{ 2299{
2069 struct ceph_mds_cap_reconnect rec; 2300 union {
2301 struct ceph_mds_cap_reconnect v2;
2302 struct ceph_mds_cap_reconnect_v1 v1;
2303 } rec;
2304 size_t reclen;
2070 struct ceph_inode_info *ci; 2305 struct ceph_inode_info *ci;
2071 struct ceph_pagelist *pagelist = arg; 2306 struct ceph_reconnect_state *recon_state = arg;
2307 struct ceph_pagelist *pagelist = recon_state->pagelist;
2072 char *path; 2308 char *path;
2073 int pathlen, err; 2309 int pathlen, err;
2074 u64 pathbase; 2310 u64 pathbase;
@@ -2088,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2088 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); 2324 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2089 if (IS_ERR(path)) { 2325 if (IS_ERR(path)) {
2090 err = PTR_ERR(path); 2326 err = PTR_ERR(path);
2091 BUG_ON(err); 2327 goto out_dput;
2092 } 2328 }
2093 } else { 2329 } else {
2094 path = NULL; 2330 path = NULL;
@@ -2096,25 +2332,55 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2096 } 2332 }
2097 err = ceph_pagelist_encode_string(pagelist, path, pathlen); 2333 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2098 if (err) 2334 if (err)
2099 goto out; 2335 goto out_free;
2100 2336
2101 spin_lock(&inode->i_lock); 2337 spin_lock(&inode->i_lock);
2102 cap->seq = 0; /* reset cap seq */ 2338 cap->seq = 0; /* reset cap seq */
2103 cap->issue_seq = 0; /* and issue_seq */ 2339 cap->issue_seq = 0; /* and issue_seq */
2104 rec.cap_id = cpu_to_le64(cap->cap_id); 2340
2105 rec.pathbase = cpu_to_le64(pathbase); 2341 if (recon_state->flock) {
2106 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2342 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2107 rec.issued = cpu_to_le32(cap->issued); 2343 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2108 rec.size = cpu_to_le64(inode->i_size); 2344 rec.v2.issued = cpu_to_le32(cap->issued);
2109 ceph_encode_timespec(&rec.mtime, &inode->i_mtime); 2345 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2110 ceph_encode_timespec(&rec.atime, &inode->i_atime); 2346 rec.v2.pathbase = cpu_to_le64(pathbase);
2111 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2347 rec.v2.flock_len = 0;
2348 reclen = sizeof(rec.v2);
2349 } else {
2350 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2351 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2352 rec.v1.issued = cpu_to_le32(cap->issued);
2353 rec.v1.size = cpu_to_le64(inode->i_size);
2354 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2355 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2356 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2357 rec.v1.pathbase = cpu_to_le64(pathbase);
2358 reclen = sizeof(rec.v1);
2359 }
2112 spin_unlock(&inode->i_lock); 2360 spin_unlock(&inode->i_lock);
2113 2361
2114 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); 2362 if (recon_state->flock) {
2363 int num_fcntl_locks, num_flock_locks;
2364
2365 lock_kernel();
2366 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2367 rec.v2.flock_len = (2*sizeof(u32) +
2368 (num_fcntl_locks+num_flock_locks) *
2369 sizeof(struct ceph_filelock));
2370
2371 err = ceph_pagelist_append(pagelist, &rec, reclen);
2372 if (!err)
2373 err = ceph_encode_locks(inode, pagelist,
2374 num_fcntl_locks,
2375 num_flock_locks);
2376 unlock_kernel();
2377 } else {
2378 err = ceph_pagelist_append(pagelist, &rec, reclen);
2379 }
2115 2380
2116out: 2381out_free:
2117 kfree(path); 2382 kfree(path);
2383out_dput:
2118 dput(dentry); 2384 dput(dentry);
2119 return err; 2385 return err;
2120} 2386}
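The reconnect record is now feature-gated: peers that advertise CEPH_FEATURE_FLOCK get the v2 layout with trailing file-lock state, older peers keep the v1 layout. lock_kernel() brackets both the counting and the encoding so the inode's lock list cannot change between computing flock_len and writing the records it sizes:

        /* two u32 lock counts, then the lock records themselves */
        rec.v2.flock_len = 2 * sizeof(u32) +
                (num_fcntl_locks + num_flock_locks) *
                sizeof(struct ceph_filelock);
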
@@ -2132,59 +2398,53 @@ out:
2132 * 2398 *
2133 * called with mdsc->mutex held. 2399 * called with mdsc->mutex held.
2134 */ 2400 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2401static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2402 struct ceph_mds_session *session)
2136{ 2403{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply; 2404 struct ceph_msg *reply;
2139 struct rb_node *p; 2405 struct rb_node *p;
2406 int mds = session->s_mds;
2140 int err = -ENOMEM; 2407 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist; 2408 struct ceph_pagelist *pagelist;
2409 struct ceph_reconnect_state recon_state;
2142 2410
2143 pr_info("reconnect to recovering mds%d\n", mds); 2411 pr_info("mds%d reconnect start\n", mds);
2144 2412
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2413 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist) 2414 if (!pagelist)
2147 goto fail_nopagelist; 2415 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist); 2416 ceph_pagelist_init(pagelist);
2149 2417
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2418 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2151 if (IS_ERR(reply)) { 2419 if (!reply)
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg; 2420 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159 2421
2160 if (session) { 2422 mutex_lock(&session->s_mutex);
2161 mutex_lock(&session->s_mutex); 2423 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2424 session->s_seq = 0;
2162 2425
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2426 ceph_con_open(&session->s_con,
2164 session->s_seq = 0; 2427 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2165 2428
2166 ceph_con_open(&session->s_con, 2429 /* replay unsafe requests */
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2430 replay_unsafe_requests(mdsc, session);
2168
2169 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175 2431
2176 down_read(&mdsc->snap_rwsem); 2432 down_read(&mdsc->snap_rwsem);
2177 2433
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session, 2434 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state)); 2435 session_state_name(session->s_state));
2182 2436
2437 /* drop old cap expires; we're about to reestablish that state */
2438 discard_cap_releases(mdsc, session);
2439
2183 /* traverse this session's caps */ 2440 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2441 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err) 2442 if (err)
2186 goto fail; 2443 goto fail;
2187 err = iterate_session_caps(session, encode_caps_cb, pagelist); 2444
2445 recon_state.pagelist = pagelist;
2446 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2447 err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2188 if (err < 0) 2448 if (err < 0)
2189 goto fail; 2449 goto fail;
2190 2450
@@ -2208,36 +2468,31 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2208 goto fail; 2468 goto fail;
2209 } 2469 }
2210 2470
2211send:
2212 reply->pagelist = pagelist; 2471 reply->pagelist = pagelist;
2472 if (recon_state.flock)
2473 reply->hdr.version = cpu_to_le16(2);
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2474 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length); 2475 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply); 2476 ceph_con_send(&session->s_con, reply);
2216 2477
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex); 2478 mutex_unlock(&session->s_mutex);
2219 2479
2220 mutex_lock(&mdsc->mutex); 2480 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting); 2481 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex); 2482 mutex_unlock(&mdsc->mutex);
2223 2483
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem); 2484 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return; 2485 return;
2229 2486
2230fail: 2487fail:
2231 ceph_msg_put(reply); 2488 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem); 2489 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex); 2490 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg: 2491fail_nomsg:
2236 ceph_pagelist_release(pagelist); 2492 ceph_pagelist_release(pagelist);
2237 kfree(pagelist); 2493 kfree(pagelist);
2238fail_nopagelist: 2494fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2495 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return; 2496 return;
2242} 2497}
2243 2498
@@ -2266,9 +2521,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2266 oldstate = ceph_mdsmap_get_state(oldmap, i); 2521 oldstate = ceph_mdsmap_get_state(oldmap, i);
2267 newstate = ceph_mdsmap_get_state(newmap, i); 2522 newstate = ceph_mdsmap_get_state(newmap, i);
2268 2523
2269 dout("check_new_map mds%d state %s -> %s (session %s)\n", 2524 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
2270 i, ceph_mds_state_name(oldstate), 2525 i, ceph_mds_state_name(oldstate),
2526 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2271 ceph_mds_state_name(newstate), 2527 ceph_mds_state_name(newstate),
2528 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2272 session_state_name(s->s_state)); 2529 session_state_name(s->s_state));
2273 2530
2274 if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2531 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2290,7 +2547,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2290 } 2547 }
2291 2548
2292 /* kick any requests waiting on the recovering mds */ 2549 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1); 2550 kick_requests(mdsc, i);
2294 } else if (oldstate == newstate) { 2551 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */ 2552 continue; /* nothing new with this mds */
2296 } 2553 }
@@ -2299,26 +2556,40 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2299 * send reconnect? 2556 * send reconnect?
2300 */ 2557 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2558 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT) 2559 newstate >= CEPH_MDS_STATE_RECONNECT) {
2303 send_mds_reconnect(mdsc, i); 2560 mutex_unlock(&mdsc->mutex);
2561 send_mds_reconnect(mdsc, s);
2562 mutex_lock(&mdsc->mutex);
2563 }
2304 2564
2305 /* 2565 /*
 2306 * kick requests on any mds that has gone active. 2566 * kick requests on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */ 2567 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2568 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) { 2569 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds); 2570 if (oldstate != CEPH_MDS_STATE_CREATING &&
2317 kick_requests(mdsc, i, 1); 2571 oldstate != CEPH_MDS_STATE_STARTING)
2572 pr_info("mds%d recovery completed\n", s->s_mds);
2573 kick_requests(mdsc, i);
2318 ceph_kick_flushing_caps(mdsc, s); 2574 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1); 2575 wake_up_session_caps(s, 1);
2320 } 2576 }
2321 } 2577 }
2578
2579 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
2580 s = mdsc->sessions[i];
2581 if (!s)
2582 continue;
2583 if (!ceph_mdsmap_is_laggy(newmap, i))
2584 continue;
2585 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2586 s->s_state == CEPH_MDS_SESSION_HUNG ||
2587 s->s_state == CEPH_MDS_SESSION_CLOSING) {
2588 dout(" connecting to export targets of laggy mds%d\n",
2589 i);
2590 __open_export_target_sessions(mdsc, s);
2591 }
2592 }
2322} 2593}
2323 2594
2324 2595
@@ -2349,6 +2620,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2349 struct ceph_dentry_info *di; 2620 struct ceph_dentry_info *di;
2350 int mds = session->s_mds; 2621 int mds = session->s_mds;
2351 struct ceph_mds_lease *h = msg->front.iov_base; 2622 struct ceph_mds_lease *h = msg->front.iov_base;
2623 u32 seq;
2352 struct ceph_vino vino; 2624 struct ceph_vino vino;
2353 int mask; 2625 int mask;
2354 struct qstr dname; 2626 struct qstr dname;
@@ -2362,6 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2362 vino.ino = le64_to_cpu(h->ino); 2634 vino.ino = le64_to_cpu(h->ino);
2363 vino.snap = CEPH_NOSNAP; 2635 vino.snap = CEPH_NOSNAP;
2364 mask = le16_to_cpu(h->mask); 2636 mask = le16_to_cpu(h->mask);
2637 seq = le32_to_cpu(h->seq);
2365 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2638 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2366 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2639 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2367 if (dname.len != get_unaligned_le32(h+1)) 2640 if (dname.len != get_unaligned_le32(h+1))
@@ -2372,8 +2645,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2372 2645
2373 /* lookup inode */ 2646 /* lookup inode */
2374 inode = ceph_find_inode(sb, vino); 2647 inode = ceph_find_inode(sb, vino);
2375 dout("handle_lease '%s', mask %d, ino %llx %p\n", 2648 dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
2376 ceph_lease_op_name(h->action), mask, vino.ino, inode); 2649 ceph_lease_op_name(h->action), mask, vino.ino, inode,
2650 dname.len, dname.name);
2377 if (inode == NULL) { 2651 if (inode == NULL) {
2378 dout("handle_lease no inode %llx\n", vino.ino); 2652 dout("handle_lease no inode %llx\n", vino.ino);
2379 goto release; 2653 goto release;
@@ -2398,7 +2672,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2398 switch (h->action) { 2672 switch (h->action) {
2399 case CEPH_MDS_LEASE_REVOKE: 2673 case CEPH_MDS_LEASE_REVOKE:
2400 if (di && di->lease_session == session) { 2674 if (di && di->lease_session == session) {
2401 h->seq = cpu_to_le32(di->lease_seq); 2675 if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2676 h->seq = cpu_to_le32(di->lease_seq);
2402 __ceph_mdsc_drop_dentry_lease(dentry); 2677 __ceph_mdsc_drop_dentry_lease(dentry);
2403 } 2678 }
2404 release = 1; 2679 release = 1;
@@ -2412,7 +2687,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2412 unsigned long duration = 2687 unsigned long duration =
2413 le32_to_cpu(h->duration_ms) * HZ / 1000; 2688 le32_to_cpu(h->duration_ms) * HZ / 1000;
2414 2689
2415 di->lease_seq = le32_to_cpu(h->seq); 2690 di->lease_seq = seq;
2416 dentry->d_time = di->lease_renew_from + duration; 2691 dentry->d_time = di->lease_renew_from + duration;
2417 di->lease_renew_after = di->lease_renew_from + 2692 di->lease_renew_after = di->lease_renew_from +
2418 (duration >> 1); 2693 (duration >> 1);
@@ -2457,12 +2732,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2457 dnamelen = dentry->d_name.len; 2732 dnamelen = dentry->d_name.len;
2458 len += dnamelen; 2733 len += dnamelen;
2459 2734
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2735 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2461 if (IS_ERR(msg)) 2736 if (!msg)
2462 return; 2737 return;
2463 lease = msg->front.iov_base; 2738 lease = msg->front.iov_base;
2464 lease->action = action; 2739 lease->action = action;
2465 lease->mask = cpu_to_le16(CEPH_LOCK_DN); 2740 lease->mask = cpu_to_le16(1);
2466 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2741 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2467 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2742 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2468 lease->seq = cpu_to_le32(seq); 2743 lease->seq = cpu_to_le32(seq);
@@ -2492,7 +2767,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2492 2767
2493 BUG_ON(inode == NULL); 2768 BUG_ON(inode == NULL);
2494 BUG_ON(dentry == NULL); 2769 BUG_ON(dentry == NULL);
2495 BUG_ON(mask != CEPH_LOCK_DN); 2770 BUG_ON(mask == 0);
2496 2771
2497 /* is dentry lease valid? */ 2772 /* is dentry lease valid? */
2498 spin_lock(&dentry->d_lock); 2773 spin_lock(&dentry->d_lock);
@@ -2602,8 +2877,10 @@ static void delayed_work(struct work_struct *work)
2602 send_renew_caps(mdsc, s); 2877 send_renew_caps(mdsc, s);
2603 else 2878 else
2604 ceph_con_keepalive(&s->s_con); 2879 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1); 2880 ceph_add_cap_releases(mdsc, s);
2606 send_cap_releases(mdsc, s); 2881 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2882 s->s_state == CEPH_MDS_SESSION_HUNG)
2883 ceph_send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex); 2884 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s); 2885 ceph_put_mds_session(s);
2609 2886
@@ -2620,8 +2897,11 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2620 mdsc->client = client; 2897 mdsc->client = client;
2621 mutex_init(&mdsc->mutex); 2898 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2899 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2900 if (mdsc->mdsmap == NULL)
2901 return -ENOMEM;
2902
2623 init_completion(&mdsc->safe_umount_waiters); 2903 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters); 2904 init_waitqueue_head(&mdsc->session_close_wq);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2905 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2626 mdsc->sessions = NULL; 2906 mdsc->sessions = NULL;
2627 mdsc->max_sessions = 0; 2907 mdsc->max_sessions = 0;
@@ -2645,6 +2925,10 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2645 init_waitqueue_head(&mdsc->cap_flushing_wq); 2925 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock); 2926 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru); 2927 INIT_LIST_HEAD(&mdsc->dentry_lru);
2928
2929 ceph_caps_init(mdsc);
2930 ceph_adjust_min_caps(mdsc, client->min_caps);
2931
2648 return 0; 2932 return 0;
2649} 2933}
2650 2934
@@ -2689,6 +2973,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2689 drop_leases(mdsc); 2973 drop_leases(mdsc);
2690 ceph_flush_dirty_caps(mdsc); 2974 ceph_flush_dirty_caps(mdsc);
2691 wait_requests(mdsc); 2975 wait_requests(mdsc);
2976
2977 /*
2978 * wait for reply handlers to drop their request refs and
2979 * their inode/dcache refs
2980 */
2981 ceph_msgr_flush();
2692} 2982}
2693 2983
2694/* 2984/*
@@ -2740,6 +3030,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{ 3030{
2741 u64 want_tid, want_flush; 3031 u64 want_tid, want_flush;
2742 3032
3033 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
3034 return;
3035
2743 dout("sync\n"); 3036 dout("sync\n");
2744 mutex_lock(&mdsc->mutex); 3037 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid; 3038 want_tid = mdsc->last_tid;
@@ -2753,6 +3046,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2753 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3046 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2754} 3047}
2755 3048
3049/*
 3050 * true if all sessions are closed, or we are force-unmounting
3051 */
3052bool done_closing_sessions(struct ceph_mds_client *mdsc)
3053{
3054 int i, n = 0;
3055
3056 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
3057 return true;
3058
3059 mutex_lock(&mdsc->mutex);
3060 for (i = 0; i < mdsc->max_sessions; i++)
3061 if (mdsc->sessions[i])
3062 n++;
3063 mutex_unlock(&mdsc->mutex);
3064 return n == 0;
3065}
2756 3066
2757/* 3067/*
2758 * called after sb is ro. 3068 * called after sb is ro.
@@ -2761,45 +3071,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2761{ 3071{
2762 struct ceph_mds_session *session; 3072 struct ceph_mds_session *session;
2763 int i; 3073 int i;
2764 int n;
2765 struct ceph_client *client = mdsc->client; 3074 struct ceph_client *client = mdsc->client;
2766 unsigned long started, timeout = client->mount_args->mount_timeout * HZ; 3075 unsigned long timeout = client->mount_args->mount_timeout * HZ;
2767 3076
2768 dout("close_sessions\n"); 3077 dout("close_sessions\n");
2769 3078
2770 mutex_lock(&mdsc->mutex);
2771
2772 /* close sessions */ 3079 /* close sessions */
2773 started = jiffies; 3080 mutex_lock(&mdsc->mutex);
2774 while (time_before(jiffies, started + timeout)) { 3081 for (i = 0; i < mdsc->max_sessions; i++) {
2775 dout("closing sessions\n"); 3082 session = __ceph_lookup_mds_session(mdsc, i);
2776 n = 0; 3083 if (!session)
2777 for (i = 0; i < mdsc->max_sessions; i++) { 3084 continue;
2778 session = __ceph_lookup_mds_session(mdsc, i);
2779 if (!session)
2780 continue;
2781 mutex_unlock(&mdsc->mutex);
2782 mutex_lock(&session->s_mutex);
2783 __close_session(mdsc, session);
2784 mutex_unlock(&session->s_mutex);
2785 ceph_put_mds_session(session);
2786 mutex_lock(&mdsc->mutex);
2787 n++;
2788 }
2789 if (n == 0)
2790 break;
2791
2792 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2793 break;
2794
2795 dout("waiting for sessions to close\n");
2796 mutex_unlock(&mdsc->mutex); 3085 mutex_unlock(&mdsc->mutex);
2797 wait_for_completion_timeout(&mdsc->session_close_waiters, 3086 mutex_lock(&session->s_mutex);
2798 timeout); 3087 __close_session(mdsc, session);
3088 mutex_unlock(&session->s_mutex);
3089 ceph_put_mds_session(session);
2799 mutex_lock(&mdsc->mutex); 3090 mutex_lock(&mdsc->mutex);
2800 } 3091 }
3092 mutex_unlock(&mdsc->mutex);
3093
3094 dout("waiting for sessions to close\n");
3095 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3096 timeout);
2801 3097
2802 /* tear down remaining sessions */ 3098 /* tear down remaining sessions */
3099 mutex_lock(&mdsc->mutex);
2803 for (i = 0; i < mdsc->max_sessions; i++) { 3100 for (i = 0; i < mdsc->max_sessions; i++) {
2804 if (mdsc->sessions[i]) { 3101 if (mdsc->sessions[i]) {
2805 session = get_session(mdsc->sessions[i]); 3102 session = get_session(mdsc->sessions[i]);
@@ -2812,9 +3109,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2812 mutex_lock(&mdsc->mutex); 3109 mutex_lock(&mdsc->mutex);
2813 } 3110 }
2814 } 3111 }
2815
2816 WARN_ON(!list_empty(&mdsc->cap_delay_list)); 3112 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2817
2818 mutex_unlock(&mdsc->mutex); 3113 mutex_unlock(&mdsc->mutex);
2819 3114
2820 ceph_cleanup_empty_realms(mdsc); 3115 ceph_cleanup_empty_realms(mdsc);
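Swapping the completion for a wait queue matches the semantics better: a completion counts discrete events, which is why the old code had to loop, re-close, and re-wait, whereas wait_event_timeout() re-evaluates an arbitrary predicate on every wakeup. The resulting pair:

        /* waker, on CEPH_SESSION_CLOSE: */
        wake_up_all(&mdsc->session_close_wq);

        /* waiter: */
        wait_event_timeout(mdsc->session_close_wq,
                           done_closing_sessions(mdsc), timeout);
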
@@ -2831,6 +3126,7 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2831 if (mdsc->mdsmap) 3126 if (mdsc->mdsmap)
2832 ceph_mdsmap_destroy(mdsc->mdsmap); 3127 ceph_mdsmap_destroy(mdsc->mdsmap);
2833 kfree(mdsc->sessions); 3128 kfree(mdsc->sessions);
3129 ceph_caps_finalize(mdsc);
2834} 3130}
2835 3131
2836 3132
@@ -2922,9 +3218,10 @@ static void con_put(struct ceph_connection *con)
2922static void peer_reset(struct ceph_connection *con) 3218static void peer_reset(struct ceph_connection *con)
2923{ 3219{
2924 struct ceph_mds_session *s = con->private; 3220 struct ceph_mds_session *s = con->private;
3221 struct ceph_mds_client *mdsc = s->s_mdsc;
2925 3222
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3223 pr_warning("mds%d closed our session\n", s->s_mds);
2927 s->s_mds); 3224 send_mds_reconnect(mdsc, s);
2928} 3225}
2929 3226
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3227static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3328,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3031 return ceph_monc_validate_auth(&mdsc->client->monc); 3328 return ceph_monc_validate_auth(&mdsc->client->monc);
3032} 3329}
3033 3330
3034const static struct ceph_connection_operations mds_con_ops = { 3331static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get, 3332 .get = con_get,
3036 .put = con_put, 3333 .put = con_put,
3037 .dispatch = dispatch, 3334 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..c98267ce6d2a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
151struct ceph_mds_request { 151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */ 152 u64 r_tid; /* transaction id */
153 struct rb_node r_node; 153 struct rb_node r_node;
154 struct ceph_mds_client *r_mdsc;
154 155
155 int r_op; /* mds op code */ 156 int r_op; /* mds op code */
156 int r_mds; 157 int r_mds;
@@ -165,6 +166,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 166 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 167 struct inode *r_target_inode; /* resulting inode */
167 168
169 struct mutex r_fill_mutex;
170
168 union ceph_mds_request_args r_args; 171 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 172 int r_fmode; /* file mode, if expecting cap */
170 173
@@ -186,6 +189,7 @@ struct ceph_mds_request {
186 int r_old_inode_drop, r_old_inode_unless; 189 int r_old_inode_drop, r_old_inode_unless;
187 190
188 struct ceph_msg *r_request; /* original request */ 191 struct ceph_msg *r_request; /* original request */
192 int r_request_release_offset;
189 struct ceph_msg *r_reply; 193 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info; 194 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err; 195 int r_err;
@@ -204,8 +208,8 @@ struct ceph_mds_request {
204 208
205 int r_attempts; /* resend attempts */ 209 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */ 210 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/ 211 int r_resend_mds; /* mds to resend to next, if any*/
212 u32 r_sent_on_mseq; /* cap mseq request was sent at*/
209 213
210 struct kref r_kref; 214 struct kref r_kref;
211 struct list_head r_wait; 215 struct list_head r_wait;
@@ -213,7 +217,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 217 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 218 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 219 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 220 bool r_got_unsafe, r_got_safe, r_got_result;
217 221
218 bool r_did_prepopulate; 222 bool r_did_prepopulate;
219 u32 r_readdir_offset; 223 u32 r_readdir_offset;
@@ -230,7 +234,8 @@ struct ceph_mds_client {
230 struct mutex mutex; /* all nested structures */ 234 struct mutex mutex; /* all nested structures */
231 235
232 struct ceph_mdsmap *mdsmap; 236 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters; 237 struct completion safe_umount_waiters;
238 wait_queue_head_t session_close_wq;
234 struct list_head waiting_for_map; 239 struct list_head waiting_for_map;
235 240
236 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 241 struct ceph_mds_session **sessions; /* NULL for mds if no session */
@@ -264,6 +269,27 @@ struct ceph_mds_client {
264 spinlock_t cap_dirty_lock; /* protects above items */ 269 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq; 270 wait_queue_head_t cap_flushing_wq;
266 271
272 /*
273 * Cap reservations
274 *
275 * Maintain a global pool of preallocated struct ceph_caps, referenced
276 * by struct ceph_caps_reservations. This ensures that we preallocate
277 * memory needed to successfully process an MDS response. (If an MDS
278 * sends us cap information and we fail to process it, we will have
279 * problems due to the client and MDS being out of sync.)
280 *
281 * Reservations are 'owned' by a ceph_cap_reservation context.
282 */
283 spinlock_t caps_list_lock;
284 struct list_head caps_list; /* unused (reserved or
285 unreserved) */
286 int caps_total_count; /* total caps allocated */
287 int caps_use_count; /* in use */
288 int caps_reserve_count; /* unused, reserved */
289 int caps_avail_count; /* unused, unreserved */
290 int caps_min_count; /* keep at least this many
291 (unreserved) */
292
267#ifdef CONFIG_DEBUG_FS 293#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file; 294 struct dentry *debugfs_file;
269#endif 295#endif
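The counters above back the preallocation pool described in the comment. A typical use reserves caps before processing an MDS reply and returns the unused ones afterwards; ceph_unreserve_caps() appears in the handle_reply hunk earlier, and a matching reserve would look roughly like this (a sketch assuming the caps.c helpers take the mdsc plus a reservation context):

        struct ceph_cap_reservation rsv = { 0 };

        ceph_reserve_caps(mdsc, &rsv, need);    /* prealloc 'need' caps up front */
        /* ... process the reply; new caps are drawn from &rsv ... */
        ceph_unreserve_caps(mdsc, &rsv);        /* hand leftovers back to the pool */
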
@@ -301,6 +327,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 327 struct inode *inode,
302 struct dentry *dn, int mask); 328 struct dentry *dn, int mask);
303 329
330extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
331
304extern struct ceph_mds_request * 332extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 333ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 334extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
@@ -318,6 +346,11 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
318 kref_put(&req->r_kref, ceph_mdsc_release_request); 346 kref_put(&req->r_kref, ceph_mdsc_release_request);
319} 347}
320 348
349extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
350 struct ceph_mds_session *session);
351extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
352 struct ceph_mds_session *session);
353
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 354extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322 355
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 356extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
@@ -332,4 +365,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 365extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg); 366 struct ceph_msg *msg);
334 367
368extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
369 struct ceph_mds_session *session);
370
335#endif 371#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index c4c498e6dfef..040be6d1150b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
85 struct ceph_entity_addr addr; 85 struct ceph_entity_addr addr;
86 u32 num_export_targets; 86 u32 num_export_targets;
87 void *pexport_targets = NULL; 87 void *pexport_targets = NULL;
88 struct ceph_timespec laggy_since;
88 89
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 90 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p); 91 global_id = ceph_decode_64(p);
@@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
103 state_seq = ceph_decode_64(p); 104 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr)); 105 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr); 106 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec); 107 ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
107 *p += sizeof(u32); 108 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad); 109 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen; 110 *p += namelen;
@@ -122,6 +123,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
122 m->m_info[mds].global_id = global_id; 123 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state; 124 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr; 125 m->m_info[mds].addr = addr;
126 m->m_info[mds].laggy =
127 (laggy_since.tv_sec != 0 ||
128 laggy_since.tv_nsec != 0);
125 m->m_info[mds].num_export_targets = num_export_targets; 129 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) { 130 if (num_export_targets) {
127 m->m_info[mds].export_targets = 131 m->m_info[mds].export_targets =
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index eacc131aa5cb..4c5cb0880bba 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -13,6 +13,7 @@ struct ceph_mds_info {
13 struct ceph_entity_addr addr; 13 struct ceph_entity_addr addr;
14 s32 state; 14 s32 state;
15 int num_export_targets; 15 int num_export_targets;
16 bool laggy;
16 u32 *export_targets; 17 u32 *export_targets;
17}; 18};
18 19
@@ -47,6 +48,13 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
47 return m->m_info[w].state; 48 return m->m_info[w].state;
48} 49}
49 50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); 58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); 59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); 60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..2502d76fcec1 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,23 +39,12 @@ static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
41 41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/* 42/*
55 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
56 */ 44 */
57#define MAX_ADDR_STR 20 45#define MAX_ADDR_STR 20
58static char addr_str[MAX_ADDR_STR][40]; 46#define MAX_ADDR_STR_LEN 60
47static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
59static DEFINE_SPINLOCK(addr_str_lock); 48static DEFINE_SPINLOCK(addr_str_lock);
60static int last_addr_str; 49static int last_addr_str;
61 50
@@ -64,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss)
64 int i; 53 int i;
65 char *s; 54 char *s;
66 struct sockaddr_in *in4 = (void *)ss; 55 struct sockaddr_in *in4 = (void *)ss;
67 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
68 struct sockaddr_in6 *in6 = (void *)ss; 56 struct sockaddr_in6 *in6 = (void *)ss;
69 57
70 spin_lock(&addr_str_lock); 58 spin_lock(&addr_str_lock);
@@ -76,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss)
76 64
77 switch (ss->ss_family) { 65 switch (ss->ss_family) {
78 case AF_INET: 66 case AF_INET:
79 sprintf(s, "%u.%u.%u.%u:%u", 67 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
80 (unsigned int)quad[0], 68 (unsigned int)ntohs(in4->sin_port));
81 (unsigned int)quad[1],
82 (unsigned int)quad[2],
83 (unsigned int)quad[3],
84 (unsigned int)ntohs(in4->sin_port));
85 break; 69 break;
86 70
87 case AF_INET6: 71 case AF_INET6:
88 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", 72 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
89 in6->sin6_addr.s6_addr16[0], 73 (unsigned int)ntohs(in6->sin6_port));
90 in6->sin6_addr.s6_addr16[1],
91 in6->sin6_addr.s6_addr16[2],
92 in6->sin6_addr.s6_addr16[3],
93 in6->sin6_addr.s6_addr16[4],
94 in6->sin6_addr.s6_addr16[5],
95 in6->sin6_addr.s6_addr16[6],
96 in6->sin6_addr.s6_addr16[7],
97 (unsigned int)ntohs(in6->sin6_port));
98 break; 74 break;
99 75
100 default: 76 default:
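%pI4 and %pI6c are printk's IP-address format extensions (%pI6c emits the compressed, colon-separated form), replacing the hand-rolled quad/hextet formatting; moving from sprintf() to snprintf() additionally bounds writes into the shared rotating buffer. For example:

        snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u",
                 &in4->sin_addr, (unsigned int)ntohs(in4->sin_port));
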
@@ -132,6 +108,12 @@ void ceph_msgr_exit(void)
132 destroy_workqueue(ceph_msgr_wq); 108 destroy_workqueue(ceph_msgr_wq);
133} 109}
134 110
111void ceph_msgr_flush(void)
112{
113 flush_workqueue(ceph_msgr_wq);
114}
115
116
135/* 117/*
136 * socket callback functions 118 * socket callback functions
137 */ 119 */
@@ -221,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock,
221 */ 203 */
222static struct socket *ceph_tcp_connect(struct ceph_connection *con) 204static struct socket *ceph_tcp_connect(struct ceph_connection *con)
223{ 205{
224 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; 206 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
225 struct socket *sock; 207 struct socket *sock;
226 int ret; 208 int ret;
227 209
228 BUG_ON(con->sock); 210 BUG_ON(con->sock);
229 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 211 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
212 IPPROTO_TCP, &sock);
230 if (ret) 213 if (ret)
231 return ERR_PTR(ret); 214 return ERR_PTR(ret);
232 con->sock = sock; 215 con->sock = sock;
@@ -240,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
240 223
241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
242 225
243 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); 226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK);
244 if (ret == -EINPROGRESS) { 228 if (ret == -EINPROGRESS) {
245 dout("connect %s EINPROGRESS sk_state = %u\n", 229 dout("connect %s EINPROGRESS sk_state = %u\n",
246 pr_addr(&con->peer_addr.in_addr), 230 pr_addr(&con->peer_addr.in_addr),
@@ -340,6 +324,7 @@ static void reset_connection(struct ceph_connection *con)
340 ceph_msg_put(con->out_msg); 324 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL; 325 con->out_msg = NULL;
342 } 326 }
327 con->out_keepalive_pending = false;
343 con->in_seq = 0; 328 con->in_seq = 0;
344 con->in_seq_acked = 0; 329 con->in_seq_acked = 0;
345} 330}
@@ -357,6 +342,7 @@ void ceph_con_close(struct ceph_connection *con)
357 clear_bit(WRITE_PENDING, &con->state); 342 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex); 343 mutex_lock(&con->mutex);
359 reset_connection(con); 344 reset_connection(con);
345 con->peer_global_seq = 0;
360 cancel_delayed_work(&con->work); 346 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex); 347 mutex_unlock(&con->mutex);
362 queue_con(con); 348 queue_con(con);
@@ -661,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto); 648 con->connect_seq, global_seq, proto);
663 649
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq); 653 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1013,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end,
1013 struct sockaddr_in *in4 = (void *)ss; 999 struct sockaddr_in *in4 = (void *)ss;
1014 struct sockaddr_in6 *in6 = (void *)ss; 1000 struct sockaddr_in6 *in6 = (void *)ss;
1015 int port; 1001 int port;
1002 char delim = ',';
1003
1004 if (*p == '[') {
1005 delim = ']';
1006 p++;
1007 }
1016 1008
1017 memset(ss, 0, sizeof(*ss)); 1009 memset(ss, 0, sizeof(*ss));
1018 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, 1010 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1019 ',', &ipend)) { 1011 delim, &ipend))
1020 ss->ss_family = AF_INET; 1012 ss->ss_family = AF_INET;
1021 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, 1013 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1022 ',', &ipend)) { 1014 delim, &ipend))
1023 ss->ss_family = AF_INET6; 1015 ss->ss_family = AF_INET6;
1024 } else { 1016 else
1025 goto bad; 1017 goto bad;
1026 }
1027 p = ipend; 1018 p = ipend;
1028 1019
1020 if (delim == ']') {
1021 if (*p != ']') {
1022 dout("missing matching ']'\n");
1023 goto bad;
1024 }
1025 p++;
1026 }
1027
1029 /* port? */ 1028 /* port? */
1030 if (p < end && *p == ':') { 1029 if (p < end && *p == ':') {
1031 port = 0; 1030 port = 0;
@@ -1059,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end,
1059 return 0; 1058 return 0;
1060 1059
1061bad: 1060bad:
1062 pr_err("parse_ips bad ip '%s'\n", c); 1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1063 return -EINVAL; 1062 return -EINVAL;
1064} 1063}
1065 1064
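
The hunk above lets ceph_parse_ips accept the bracketed IPv6 form in monitor lists, e.g. "1.2.3.4:6789,[::1]:6790": a leading '[' switches the in4_pton/in6_pton delimiter from ',' to ']', and the matching ']' must then appear before the optional ':port'. A minimal user-space sketch of the same bracket handling, using POSIX inet_pton rather than the kernel helpers (the function name here is illustrative, not the kernel's):

    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>

    /* Strip an optional [..] wrapper, then parse as IPv6. */
    static int parse_bracketed_ip6(const char *s, struct in6_addr *out)
    {
        char buf[INET6_ADDRSTRLEN];
        size_t len = strlen(s);

        if (len >= 2 && s[0] == '[' && s[len - 1] == ']') {
            if (len - 2 >= sizeof(buf))
                return -1;          /* too long to be an address */
            memcpy(buf, s + 1, len - 2);
            buf[len - 2] = '\0';
            s = buf;
        }
        return inet_pton(AF_INET6, s, out) == 1 ? 0 : -1;
    }

    int main(void)
    {
        struct in6_addr a;
        printf("%d %d\n", parse_bracketed_ip6("[::1]", &a),
               parse_bracketed_ip6("::1", &a));    /* both print 0 */
        return 0;
    }
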
@@ -1082,11 +1081,11 @@ static int process_banner(struct ceph_connection *con)
1082 sizeof(con->peer_addr)) != 0 && 1081 sizeof(con->peer_addr)) != 0 &&
1083 !(addr_is_blank(&con->actual_peer_addr.in_addr) && 1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1084 con->actual_peer_addr.nonce == con->peer_addr.nonce)) { 1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1085 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", 1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1086 pr_addr(&con->peer_addr.in_addr), 1085 pr_addr(&con->peer_addr.in_addr),
1087 le64_to_cpu(con->peer_addr.nonce), 1086 (int)le32_to_cpu(con->peer_addr.nonce),
1088 pr_addr(&con->actual_peer_addr.in_addr), 1087 pr_addr(&con->actual_peer_addr.in_addr),
1089 le64_to_cpu(con->actual_peer_addr.nonce)); 1088 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1090 con->error_msg = "wrong peer at address"; 1089 con->error_msg = "wrong peer at address";
1091 return -1; 1090 return -1;
1092 } 1091 }
@@ -1233,6 +1232,7 @@ static int process_connect(struct ceph_connection *con)
1233 clear_bit(CONNECTING, &con->state); 1232 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1233 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++; 1234 con->connect_seq++;
1235 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq, 1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq), 1238 le32_to_cpu(con->in_reply.connect_seq),
@@ -1302,8 +1302,8 @@ static void process_ack(struct ceph_connection *con)
1302 1302
1303 1303
1304static int read_partial_message_section(struct ceph_connection *con, 1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, unsigned int sec_len, 1305 struct kvec *section,
1306 u32 *crc) 1306 unsigned int sec_len, u32 *crc)
1307{ 1307{
1308 int left; 1308 int left;
1309 int ret; 1309 int ret;
@@ -1399,22 +1399,22 @@ static int read_partial_message(struct ceph_connection *con)
1399 if (!con->in_msg) { 1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len); 1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 skip = 0;
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1403 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) { 1404 if (skip) {
1404 /* skip this message */ 1405 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n"); 1406 dout("alloc_msg said skip message\n");
1407 BUG_ON(con->in_msg);
1406 con->in_base_pos = -front_len - middle_len - data_len - 1408 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer); 1409 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY; 1410 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++; 1411 con->in_seq++;
1410 return 0; 1412 return 0;
1411 } 1413 }
1412 if (IS_ERR(con->in_msg)) { 1414 if (!con->in_msg) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg = 1415 con->error_msg =
1416 "error allocating memory for incoming message"; 1416 "error allocating memory for incoming message";
1417 return ret; 1417 return -ENOMEM;
1418 } 1418 }
1419 m = con->in_msg; 1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */ 1420 m->front.iov_len = 0; /* haven't read it yet */
@@ -1434,7 +1434,8 @@ static int read_partial_message(struct ceph_connection *con)
1434 1434
1435 /* middle */ 1435 /* middle */
1436 if (m->middle) { 1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec, middle_len, 1437 ret = read_partial_message_section(con, &m->middle->vec,
1438 middle_len,
1438 &con->in_middle_crc); 1439 &con->in_middle_crc);
1439 if (ret <= 0) 1440 if (ret <= 0)
1440 return ret; 1441 return ret;
@@ -1514,14 +1515,14 @@ static void process_message(struct ceph_connection *con)
1514 1515
1515 /* if first message, set peer_name */ 1516 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0) 1517 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name; 1518 con->peer_name = msg->hdr.src;
1518 1519
1519 con->in_seq++; 1520 con->in_seq++;
1520 mutex_unlock(&con->mutex); 1521 mutex_unlock(&con->mutex);
1521 1522
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1523 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq), 1524 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name), 1525 ENTITY_NAME(msg->hdr.src),
1525 le16_to_cpu(msg->hdr.type), 1526 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1527 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len), 1528 le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1547,6 @@ static int try_write(struct ceph_connection *con)
1546 dout("try_write start %p state %lu nref %d\n", con, con->state, 1547 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref)); 1548 atomic_read(&con->nref));
1548 1549
1549 mutex_lock(&con->mutex);
1550more: 1550more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552 1552
@@ -1639,7 +1639,6 @@ do_next:
1639done: 1639done:
1640 ret = 0; 1640 ret = 0;
1641out: 1641out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con); 1642 dout("try_write done on %p\n", con);
1644 return ret; 1643 return ret;
1645} 1644}
@@ -1651,7 +1650,6 @@ out:
1651 */ 1650 */
1652static int try_read(struct ceph_connection *con) 1651static int try_read(struct ceph_connection *con)
1653{ 1652{
1654 struct ceph_messenger *msgr;
1655 int ret = -1; 1653 int ret = -1;
1656 1654
1657 if (!con->sock) 1655 if (!con->sock)
@@ -1661,9 +1659,6 @@ static int try_read(struct ceph_connection *con)
1661 return 0; 1659 return 0;
1662 1660
1663 dout("try_read start on %p\n", con); 1661 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667 1662
1668more: 1663more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1664 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1753,6 @@ more:
1758done: 1753done:
1759 ret = 0; 1754 ret = 0;
1760out: 1755out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con); 1756 dout("try_read done on %p\n", con);
1763 return ret; 1757 return ret;
1764 1758
@@ -1830,6 +1824,8 @@ more:
1830 dout("con_work %p start, clearing QUEUED\n", con); 1824 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state); 1825 clear_bit(QUEUED, &con->state);
1832 1826
1827 mutex_lock(&con->mutex);
1828
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1829 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n"); 1830 dout("con_work CLOSED\n");
1835 con_close_socket(con); 1831 con_close_socket(con);
@@ -1844,11 +1840,16 @@ more:
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1840 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 || 1841 try_read(con) < 0 ||
1846 try_write(con) < 0) { 1842 try_write(con) < 0) {
1843 mutex_unlock(&con->mutex);
1847 backoff = 1; 1844 backoff = 1;
1848 ceph_fault(con); /* error/fault path */ 1845 ceph_fault(con); /* error/fault path */
1846 goto done_unlocked;
1849 } 1847 }
1850 1848
1851done: 1849done:
1850 mutex_unlock(&con->mutex);
1851
1852done_unlocked:
1852 clear_bit(BUSY, &con->state); 1853 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state); 1854 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) { 1855 if (test_bit(QUEUED, &con->state)) {
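
Taken together, the try_write/try_read/con_work hunks hoist con->mutex out of the two helpers and into the worker: con_work now takes the lock once, and the fault path unlocks before calling ceph_fault(), exiting through the new done_unlocked label. A pthread-based sketch of the same pattern (struct and function names are stand-ins, not the kernel code):

    #include <pthread.h>

    struct conn {
        pthread_mutex_t mutex;
    };

    /* Both helpers now assume conn->mutex is already held by the caller. */
    static int do_read(struct conn *con)  { (void)con; return 0; }
    static int do_write(struct conn *con) { (void)con; return 0; }
    static void handle_fault(struct conn *con) { (void)con; }

    static void worker(struct conn *con)
    {
        pthread_mutex_lock(&con->mutex);
        if (do_read(con) < 0 || do_write(con) < 0) {
            /* error path: drop the lock before the fault handler,
             * mirroring the jump to done_unlocked above */
            pthread_mutex_unlock(&con->mutex);
            handle_fault(con);
            return;
        }
        pthread_mutex_unlock(&con->mutex);
    }

    int main(void)
    {
        struct conn c = { .mutex = PTHREAD_MUTEX_INITIALIZER };
        worker(&c);
        return 0;
    }
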
@@ -1920,7 +1921,7 @@ out:
1920 /* 1921 /*
1921 * in case we faulted due to authentication, invalidate our 1922 * in case we faulted due to authentication, invalidate our
1922 * current tickets so that we can get new ones. 1923 * current tickets so that we can get new ones.
1923 */ 1924 */
1924 if (con->auth_retry && con->ops->invalidate_authorizer) { 1925 if (con->auth_retry && con->ops->invalidate_authorizer) {
1925 dout("calling invalidate_authorizer()\n"); 1926 dout("calling invalidate_authorizer()\n");
1926 con->ops->invalidate_authorizer(con); 1927 con->ops->invalidate_authorizer(con);
@@ -1947,7 +1948,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1947 1948
1948 /* the zero page is needed if a request is "canceled" while the message 1949 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */ 1950 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1951 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) { 1952 if (!msgr->zero_page) {
1952 kfree(msgr); 1953 kfree(msgr);
1953 return ERR_PTR(-ENOMEM); 1954 return ERR_PTR(-ENOMEM);
@@ -1987,9 +1988,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1987 } 1988 }
1988 1989
1989 /* set src+dst */ 1990 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name; 1991 msg->hdr.src = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993 1992
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1993 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995 1994
@@ -2020,20 +2019,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2020{ 2019{
2021 mutex_lock(&con->mutex); 2020 mutex_lock(&con->mutex);
2022 if (!list_empty(&msg->list_head)) { 2021 if (!list_empty(&msg->list_head)) {
2023 dout("con_revoke %p msg %p\n", con, msg); 2022 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2024 list_del_init(&msg->list_head); 2023 list_del_init(&msg->list_head);
2025 ceph_msg_put(msg); 2024 ceph_msg_put(msg);
2026 msg->hdr.seq = 0; 2025 msg->hdr.seq = 0;
2027 if (con->out_msg == msg) { 2026 }
2028 ceph_msg_put(con->out_msg); 2027 if (con->out_msg == msg) {
2029 con->out_msg = NULL; 2028 dout("con_revoke %p msg %p - was sending\n", con, msg);
2030 } 2029 con->out_msg = NULL;
2031 if (con->out_kvec_is_msg) { 2030 if (con->out_kvec_is_msg) {
2032 con->out_skip = con->out_kvec_bytes; 2031 con->out_skip = con->out_kvec_bytes;
2033 con->out_kvec_is_msg = false; 2032 con->out_kvec_is_msg = false;
2034 } 2033 }
2035 } else { 2034 ceph_msg_put(msg);
2036 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); 2035 msg->hdr.seq = 0;
2037 } 2036 }
2038 mutex_unlock(&con->mutex); 2037 mutex_unlock(&con->mutex);
2039} 2038}
@@ -2083,12 +2082,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
2083 * construct a new message with given type, size 2082 * construct a new message with given type, size
2084 * the new msg has a ref count of 1. 2083 * the new msg has a ref count of 1.
2085 */ 2084 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len, 2085struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2087 int page_len, int page_off, struct page **pages)
2088{ 2086{
2089 struct ceph_msg *m; 2087 struct ceph_msg *m;
2090 2088
2091 m = kmalloc(sizeof(*m), GFP_NOFS); 2089 m = kmalloc(sizeof(*m), flags);
2092 if (m == NULL) 2090 if (m == NULL)
2093 goto out; 2091 goto out;
2094 kref_init(&m->kref); 2092 kref_init(&m->kref);
@@ -2100,8 +2098,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2100 m->hdr.version = 0; 2098 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len); 2099 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0; 2100 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len); 2101 m->hdr.data_len = 0;
2104 m->hdr.data_off = cpu_to_le16(page_off); 2102 m->hdr.data_off = 0;
2105 m->hdr.reserved = 0; 2103 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0; 2104 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0; 2105 m->footer.middle_crc = 0;
@@ -2115,11 +2113,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2115 /* front */ 2113 /* front */
2116 if (front_len) { 2114 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) { 2115 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2116 m->front.iov_base = __vmalloc(front_len, flags,
2119 PAGE_KERNEL); 2117 PAGE_KERNEL);
2120 m->front_is_vmalloc = true; 2118 m->front_is_vmalloc = true;
2121 } else { 2119 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2120 m->front.iov_base = kmalloc(front_len, flags);
2123 } 2121 }
2124 if (m->front.iov_base == NULL) { 2122 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n", 2123 pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2133,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2135 m->middle = NULL; 2133 m->middle = NULL;
2136 2134
2137 /* data */ 2135 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len); 2136 m->nr_pages = 0;
2139 m->pages = pages; 2137 m->pages = NULL;
2140 m->pagelist = NULL; 2138 m->pagelist = NULL;
2141 2139
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2140 dout("ceph_msg_new %p front %d\n", m, front_len);
2143 m->nr_pages);
2144 return m; 2141 return m;
2145 2142
2146out2: 2143out2:
2147 ceph_msg_put(m); 2144 ceph_msg_put(m);
2148out: 2145out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2146 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM); 2147 return NULL;
2151} 2148}
2152 2149
2153/* 2150/*
@@ -2190,29 +2187,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2190 mutex_unlock(&con->mutex); 2187 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip); 2188 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex); 2189 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg)) 2190 if (!msg || *skip)
2194 return msg;
2195
2196 if (*skip)
2197 return NULL; 2191 return NULL;
2198 } 2192 }
2199 if (!msg) { 2193 if (!msg) {
2200 *skip = 0; 2194 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2195 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2202 if (!msg) { 2196 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n", 2197 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len); 2198 type, front_len);
2205 return ERR_PTR(-ENOMEM); 2199 return NULL;
2206 } 2200 }
2207 } 2201 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2202 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209 2203
2210 if (middle_len) { 2204 if (middle_len && !msg->middle) {
2211 ret = ceph_alloc_middle(con, msg); 2205 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) { 2206 if (ret < 0) {
2214 ceph_msg_put(msg); 2207 ceph_msg_put(msg);
2215 return msg; 2208 return NULL;
2216 } 2209 }
2217 } 2210 }
2218 2211
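
A recurring theme in this file (and in osd_client.c below) is that ceph_msg_new() and ceph_alloc_msg() stop returning ERR_PTR(-ENOMEM) and simply return NULL, since out-of-memory is the only failure they can report. A small user-space illustration of the two conventions, with the ERR_PTR macros re-created only for the demo:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ERRNO    4095
    #define ERR_PTR(err) ((void *)(long)(err))
    #define PTR_ERR(ptr) ((long)(ptr))
    #define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

    static void *alloc_old_style(int fail)    /* pre-patch convention */
    {
        return fail ? ERR_PTR(-ENOMEM) : malloc(16);
    }

    static void *alloc_new_style(int fail)    /* post-patch convention */
    {
        return fail ? NULL : malloc(16);      /* NULL now implies -ENOMEM */
    }

    int main(void)
    {
        void *p = alloc_old_style(1);

        if (IS_ERR(p))
            printf("old: error %ld\n", PTR_ERR(p));   /* -ENOMEM, i.e. -12 */
        if (!alloc_new_style(1))
            printf("new: NULL means -ENOMEM\n");
        return 0;
    }
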
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
144 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
147 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
148 attempt for this connection, client */ 147 attempt for this connection, client */
149 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
158 struct list_head out_queue; 157 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending; 160 bool out_keepalive_pending;
163 161
164 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -215,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
215 213
216extern int ceph_msgr_init(void); 214extern int ceph_msgr_init(void);
217extern void ceph_msgr_exit(void); 215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
218 217
219extern struct ceph_messenger *ceph_messenger_create( 218extern struct ceph_messenger *ceph_messenger_create(
220 struct ceph_entity_addr *myaddr); 219 struct ceph_entity_addr *myaddr);
@@ -234,9 +233,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con); 234extern void ceph_con_put(struct ceph_connection *con);
236 235
237extern struct ceph_msg *ceph_msg_new(int type, int front_len, 236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m); 237extern void ceph_msg_kfree(struct ceph_msg *m);
241 238
242 239
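
The new ceph_msgr_flush() export is a thin wrapper around flush_workqueue() on the messenger workqueue, letting callers drain in-flight socket work before tearing connections down. A toy pthread analogue of what such a flush waits for (illustrative only, not the kernel implementation):

    #include <pthread.h>

    struct workqueue {
        int pending;                 /* queued-but-unfinished work items */
        pthread_mutex_t lock;
        pthread_cond_t idle;
    };

    static void work_done(struct workqueue *wq)
    {
        pthread_mutex_lock(&wq->lock);
        if (--wq->pending == 0)
            pthread_cond_broadcast(&wq->idle);
        pthread_mutex_unlock(&wq->lock);
    }

    /* flush: block until everything queued so far has run */
    static void flush(struct workqueue *wq)
    {
        pthread_mutex_lock(&wq->lock);
        while (wq->pending > 0)
            pthread_cond_wait(&wq->idle, &wq->lock);
        pthread_mutex_unlock(&wq->lock);
    }

    int main(void)
    {
        struct workqueue wq = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .idle = PTHREAD_COND_INITIALIZER,
        };

        wq.pending = 1;
        work_done(&wq);    /* the worker finishes... */
        flush(&wq);        /* ...so flush returns at once */
        return 0;
    }
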
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..b2a5a3e4a671 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
28 * resend any outstanding requests. 28 * resend any outstanding requests.
29 */ 29 */
30 30
31const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
32 32
33static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
34 34
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
104 monc->pending_auth = 1; 104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
107 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
109} 110}
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
187 monc->want_next_osdmap); 188 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
191 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
192 void *p, *end; 193 void *p, *end;
193 194
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base; 195 p = msg->front.iov_base;
199 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
200 197
201 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
226 223
227 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
230 228
231 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
232 } 230 }
@@ -347,20 +345,20 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
347 345
348out: 346out:
349 mutex_unlock(&monc->mutex); 347 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq); 348 wake_up_all(&client->auth_wq);
351} 349}
352 350
353/* 351/*
354 * statfs 352 * generic requests (e.g., statfs, poolop)
355 */ 353 */
356static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
357 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
358{ 356{
359 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
361 359
362 while (n) { 360 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
364 if (tid < req->tid) 362 if (tid < req->tid)
365 n = n->rb_left; 363 n = n->rb_left;
366 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
371 return NULL; 369 return NULL;
372} 370}
373 371
374static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
376{ 374{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
378 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
380 378
381 while (*p) { 379 while (*p) {
382 parent = *p; 380 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
384 if (new->tid < req->tid) 382 if (new->tid < req->tid)
385 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
386 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -390,113 +388,290 @@ static void __insert_statfs(struct ceph_mon_client *monc,
390 } 388 }
391 389
392 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403
404 kfree(req);
405}
406
407static void put_generic_request(struct ceph_mon_generic_request *req)
408{
409 kref_put(&req->kref, release_generic_request);
410}
411
412static void get_generic_request(struct ceph_mon_generic_request *req)
413{
414 kref_get(&req->kref);
415}
416
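
The request lifetime above is reference-counted: both the submitter and the reply path hold a ref, and release_generic_request() drops the embedded request/reply messages when the last ref goes. A non-atomic user-space sketch of that kref discipline (the kernel's kref in <linux/kref.h> is atomic and uses container_of()):

    #include <stdio.h>
    #include <stdlib.h>

    struct kref {
        int refcount;
    };

    static void kref_init(struct kref *k) { k->refcount = 1; }
    static void kref_get(struct kref *k)  { k->refcount++; }
    static void kref_put(struct kref *k, void (*release)(struct kref *))
    {
        if (--k->refcount == 0)
            release(k);
    }

    struct generic_request {
        struct kref kref;   /* first member, so the cast below works */
        /* request/reply messages would hang off here */
    };

    static void release_request(struct kref *k)
    {
        printf("last ref dropped, freeing\n");
        free((struct generic_request *)k);
    }

    int main(void)
    {
        struct generic_request *req = calloc(1, sizeof(*req));

        kref_init(&req->kref);
        kref_get(&req->kref);                    /* reply handler's ref */
        kref_put(&req->kref, release_request);   /* still alive */
        kref_put(&req->kref, release_request);   /* frees here */
        return 0;
    }
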
417static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
418 struct ceph_msg_header *hdr,
419 int *skip)
420{
421 struct ceph_mon_client *monc = con->private;
422 struct ceph_mon_generic_request *req;
423 u64 tid = le64_to_cpu(hdr->tid);
424 struct ceph_msg *m;
425
426 mutex_lock(&monc->mutex);
427 req = __lookup_generic_req(monc, tid);
428 if (!req) {
429 dout("get_generic_reply %lld dne\n", tid);
430 *skip = 1;
431 m = NULL;
432 } else {
433 dout("get_generic_reply %lld got %p\n", tid, req->reply);
434 m = ceph_msg_get(req->reply);
435 /*
436 * we don't need to track the connection reading into
437 * this reply because we only have one open connection
438 * at a time, ever.
439 */
440 }
441 mutex_unlock(&monc->mutex);
442 return m;
394} 443}
395 444
445static int do_generic_request(struct ceph_mon_client *monc,
446 struct ceph_mon_generic_request *req)
447{
448 int err;
449
450 /* register request */
451 mutex_lock(&monc->mutex);
452 req->tid = ++monc->last_tid;
453 req->request->hdr.tid = cpu_to_le64(req->tid);
454 __insert_generic_request(monc, req);
455 monc->num_generic_requests++;
456 ceph_con_send(monc->con, ceph_msg_get(req->request));
457 mutex_unlock(&monc->mutex);
458
459 err = wait_for_completion_interruptible(&req->completion);
460
461 mutex_lock(&monc->mutex);
462 rb_erase(&req->node, &monc->generic_request_tree);
463 monc->num_generic_requests--;
464 mutex_unlock(&monc->mutex);
465
466 if (!err)
467 err = req->result;
468 return err;
469}
470
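
do_generic_request() is the synchronous core of the refactor: assign a tid under the mutex, register the request in the tid-keyed rbtree, send, then sleep on the completion until the dispatch path fills in the result. A threaded user-space model of that handshake (all names here are stand-ins for the kernel's):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    struct generic_req {
        unsigned long tid;
        int result;
        int done;
        pthread_mutex_t lock;
        pthread_cond_t cond;
    };

    static unsigned long last_tid;

    /* reply path: fill in the result and wake the waiter */
    static void *reply_thread(void *arg)
    {
        struct generic_req *req = arg;

        sleep(1);                       /* pretend the monitor replied */
        pthread_mutex_lock(&req->lock);
        req->result = 0;
        req->done = 1;
        pthread_cond_broadcast(&req->cond);
        pthread_mutex_unlock(&req->lock);
        return NULL;
    }

    static int do_request(struct generic_req *req)
    {
        req->tid = ++last_tid;  /* the kernel does this under monc->mutex,
                                 * then inserts req into the rbtree */
        pthread_mutex_lock(&req->lock);
        while (!req->done)
            pthread_cond_wait(&req->cond, &req->lock);
        pthread_mutex_unlock(&req->lock);
        return req->result;     /* rbtree erase would happen here */
    }

    int main(void)
    {
        struct generic_req req = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t;

        pthread_create(&t, NULL, reply_thread, &req);
        printf("result %d\n", do_request(&req));
        pthread_join(t, NULL);
        return 0;
    }
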
471/*
472 * statfs
473 */
396static void handle_statfs_reply(struct ceph_mon_client *monc, 474static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg) 475 struct ceph_msg *msg)
398{ 476{
399 struct ceph_mon_statfs_request *req; 477 struct ceph_mon_generic_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 478 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid; 479 u64 tid = le64_to_cpu(msg->hdr.tid);
402 480
403 if (msg->front.iov_len != sizeof(*reply)) 481 if (msg->front.iov_len != sizeof(*reply))
404 goto bad; 482 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 483 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407 484
408 mutex_lock(&monc->mutex); 485 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid); 486 req = __lookup_generic_req(monc, tid);
410 if (req) { 487 if (req) {
411 *req->buf = reply->st; 488 *(struct ceph_statfs *)req->buf = reply->st;
412 req->result = 0; 489 req->result = 0;
490 get_generic_request(req);
413 } 491 }
414 mutex_unlock(&monc->mutex); 492 mutex_unlock(&monc->mutex);
415 if (req) 493 if (req) {
416 complete(&req->completion); 494 complete_all(&req->completion);
495 put_generic_request(req);
496 }
417 return; 497 return;
418 498
419bad: 499bad:
420 pr_err("corrupt statfs reply, no tid\n"); 500 pr_err("corrupt generic reply, tid %llu\n", tid);
421 ceph_msg_dump(msg); 501 ceph_msg_dump(msg);
422} 502}
423 503
424/* 504/*
425 * (re)send a statfs request 505 * Do a synchronous statfs().
426 */ 506 */
427static int send_statfs(struct ceph_mon_client *monc, 507int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
428 struct ceph_mon_statfs_request *req)
429{ 508{
430 struct ceph_msg *msg; 509 struct ceph_mon_generic_request *req;
431 struct ceph_mon_statfs *h; 510 struct ceph_mon_statfs *h;
511 int err;
512
513 req = kzalloc(sizeof(*req), GFP_NOFS);
514 if (!req)
515 return -ENOMEM;
516
517 kref_init(&req->kref);
518 req->buf = buf;
519 req->buf_len = sizeof(*buf);
520 init_completion(&req->completion);
521
522 err = -ENOMEM;
523 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
524 if (!req->request)
525 goto out;
526 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
527 if (!req->reply)
528 goto out;
432 529
433 dout("send_statfs tid %llu\n", req->tid); 530 /* fill out request */
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 531 h = req->request->front.iov_base;
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0; 532 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1); 533 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0; 534 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid; 535 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg); 536
445 return 0; 537 err = do_generic_request(monc, req);
538
539out:
540 kref_put(&req->kref, release_generic_request);
541 return err;
446} 542}
447 543
448/* 544/*
449 * Do a synchronous statfs(). 545 * pool ops
450 */ 546 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) 547static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len)
452{ 549{
453 struct ceph_mon_statfs_request req; 550 u32 buf_len;
454 int err;
455 551
456 req.buf = buf; 552 if (src_len != sizeof(u32) + dst_len)
457 init_completion(&req.completion); 553 return -EINVAL;
458 554
459 /* allocate memory for reply */ 555 buf_len = le32_to_cpu(*(u32 *)src);
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1); 556 if (buf_len != dst_len)
461 if (err) 557 return -EINVAL;
462 return err;
463 558
464 /* register request */ 559 memcpy(dst, src + sizeof(u32), dst_len);
465 mutex_lock(&monc->mutex); 560 return 0;
466 req.tid = ++monc->last_tid; 561}
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472 562
473 /* send request and wait */ 563static void handle_poolop_reply(struct ceph_mon_client *monc,
474 err = send_statfs(monc, &req); 564 struct ceph_msg *msg)
475 if (!err) 565{
476 err = wait_for_completion_interruptible(&req.completion); 566 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid);
569
570 if (msg->front.iov_len < sizeof(*reply))
571 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
477 573
478 mutex_lock(&monc->mutex); 574 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree); 575 req = __lookup_generic_req(monc, tid);
480 monc->num_statfs_requests--; 576 if (req) {
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1); 577 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex);
582 goto bad;
583 }
584 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req);
586 }
482 mutex_unlock(&monc->mutex); 587 mutex_unlock(&monc->mutex);
588 if (req) {
589 complete(&req->completion);
590 put_generic_request(req);
591 }
592 return;
483 593
484 if (!err) 594bad:
485 err = req.result; 595 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg);
597}
598
599/*
600 * Do a synchronous pool op.
601 */
602int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid,
604 char *buf, int len)
605{
606 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h;
608 int err;
609
610 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req)
612 return -ENOMEM;
613
614 kref_init(&req->kref);
615 req->buf = buf;
616 req->buf_len = len;
617 init_completion(&req->completion);
618
619 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
621 if (!req->request)
622 goto out;
623 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
624 if (!req->reply)
625 goto out;
626
627 /* fill out request */
628 req->request->hdr.version = cpu_to_le16(2);
629 h = req->request->front.iov_base;
630 h->monhdr.have_version = 0;
631 h->monhdr.session_mon = cpu_to_le16(-1);
632 h->monhdr.session_mon_tid = 0;
633 h->fsid = monc->monmap->fsid;
634 h->pool = cpu_to_le32(pool);
635 h->op = cpu_to_le32(op);
636 h->auid = 0;
637 h->snapid = cpu_to_le64(snapid);
638 h->name_len = 0;
639
640 err = do_generic_request(monc, req);
641
642out:
643 kref_put(&req->kref, release_generic_request);
486 return err; 644 return err;
487} 645}
488 646
647int ceph_monc_create_snapid(struct ceph_mon_client *monc,
648 u32 pool, u64 *snapid)
649{
650 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
651 pool, 0, (char *)snapid, sizeof(*snapid));
652
653}
654
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid)
657{
 658        return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
659 pool, snapid, 0, 0);
660
661}
662
489/* 663/*
490 * Resend pending statfs requests. 664 * Resend pending generic requests.
491 */ 665 */
492static void __resend_statfs(struct ceph_mon_client *monc) 666static void __resend_generic_request(struct ceph_mon_client *monc)
493{ 667{
494 struct ceph_mon_statfs_request *req; 668 struct ceph_mon_generic_request *req;
495 struct rb_node *p; 669 struct rb_node *p;
496 670
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 671 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node); 672 req = rb_entry(p, struct ceph_mon_generic_request, node);
499 send_statfs(monc, req); 673 ceph_con_revoke(monc->con, req->request);
674 ceph_con_send(monc->con, ceph_msg_get(req->request));
500 } 675 }
501} 676}
502 677
@@ -586,26 +761,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 761 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 762 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588 763
589 /* msg pools */ 764 /* msgs */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 765 err = -ENOMEM;
591 sizeof(struct ceph_mon_subscribe_ack), 1, false); 766 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
592 if (err < 0) 767 sizeof(struct ceph_mon_subscribe_ack),
768 GFP_NOFS);
769 if (!monc->m_subscribe_ack)
593 goto out_monmap; 770 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 771
595 sizeof(struct ceph_mon_statfs_reply), 0, false); 772 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
596 if (err < 0) 773 if (!monc->m_subscribe)
597 goto out_pool1; 774 goto out_subscribe_ack;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 775
599 if (err < 0) 776 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
600 goto out_pool2; 777 if (!monc->m_auth_reply)
601 778 goto out_subscribe;
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 779
780 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
603 monc->pending_auth = 0; 781 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) { 782 if (!monc->m_auth)
605 err = PTR_ERR(monc->m_auth); 783 goto out_auth_reply;
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609 784
610 monc->cur_mon = -1; 785 monc->cur_mon = -1;
611 monc->hunting = true; 786 monc->hunting = true;
@@ -613,8 +788,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
613 monc->sub_sent = 0; 788 monc->sub_sent = 0;
614 789
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 790 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT; 791 monc->generic_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0; 792 monc->num_generic_requests = 0;
618 monc->last_tid = 0; 793 monc->last_tid = 0;
619 794
620 monc->have_mdsmap = 0; 795 monc->have_mdsmap = 0;
@@ -622,12 +797,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
622 monc->want_next_osdmap = 1; 797 monc->want_next_osdmap = 1;
623 return 0; 798 return 0;
624 799
625out_pool3: 800out_auth_reply:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 801 ceph_msg_put(monc->m_auth_reply);
627out_pool2: 802out_subscribe:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 803 ceph_msg_put(monc->m_subscribe);
629out_pool1: 804out_subscribe_ack:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 805 ceph_msg_put(monc->m_subscribe_ack);
631out_monmap: 806out_monmap:
632 kfree(monc->monmap); 807 kfree(monc->monmap);
633out: 808out:
@@ -651,9 +826,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
651 ceph_auth_destroy(monc->auth); 826 ceph_auth_destroy(monc->auth);
652 827
653 ceph_msg_put(monc->m_auth); 828 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 829 ceph_msg_put(monc->m_auth_reply);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 830 ceph_msg_put(monc->m_subscribe);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 831 ceph_msg_put(monc->m_subscribe_ack);
657 832
658 kfree(monc->monmap); 833 kfree(monc->monmap);
659} 834}
@@ -662,8 +837,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg) 837 struct ceph_msg *msg)
663{ 838{
664 int ret; 839 int ret;
840 int was_auth = 0;
665 841
666 mutex_lock(&monc->mutex); 842 mutex_lock(&monc->mutex);
843 if (monc->auth->ops)
844 was_auth = monc->auth->ops->is_authenticated(monc->auth);
667 monc->pending_auth = 0; 845 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 846 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len, 847 msg->front.iov_len,
@@ -671,17 +849,18 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
671 monc->m_auth->front_max); 849 monc->m_auth->front_max);
672 if (ret < 0) { 850 if (ret < 0) {
673 monc->client->auth_err = ret; 851 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq); 852 wake_up_all(&monc->client->auth_wq);
675 } else if (ret > 0) { 853 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret); 854 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) { 855 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n"); 856 dout("authenticated, starting session\n");
679 857
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 858 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id; 859 monc->client->msgr->inst.name.num =
860 cpu_to_le64(monc->auth->global_id);
682 861
683 __send_subscribe(monc); 862 __send_subscribe(monc);
684 __resend_statfs(monc); 863 __resend_generic_request(monc);
685 } 864 }
686 mutex_unlock(&monc->mutex); 865 mutex_unlock(&monc->mutex);
687} 866}
@@ -735,6 +914,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
735 handle_statfs_reply(monc, msg); 914 handle_statfs_reply(monc, msg);
736 break; 915 break;
737 916
917 case CEPH_MSG_POOLOP_REPLY:
918 handle_poolop_reply(monc, msg);
919 break;
920
738 case CEPH_MSG_MON_MAP: 921 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg); 922 ceph_monc_handle_map(monc, msg);
740 break; 923 break;
@@ -770,18 +953,18 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
770 953
771 switch (type) { 954 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK: 955 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 956 m = ceph_msg_get(monc->m_subscribe_ack);
774 break; 957 break;
958 case CEPH_MSG_POOLOP_REPLY:
775 case CEPH_MSG_STATFS_REPLY: 959 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 960 return get_generic_reply(con, hdr, skip);
777 break;
778 case CEPH_MSG_AUTH_REPLY: 961 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 962 m = ceph_msg_get(monc->m_auth_reply);
780 break; 963 break;
781 case CEPH_MSG_MON_MAP: 964 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP: 965 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP: 966 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL); 967 m = ceph_msg_new(type, front_len, GFP_NOFS);
785 break; 968 break;
786 } 969 }
787 970
@@ -826,7 +1009,7 @@ out:
826 mutex_unlock(&monc->mutex); 1009 mutex_unlock(&monc->mutex);
827} 1010}
828 1011
829const static struct ceph_connection_operations mon_con_ops = { 1012static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get, 1013 .get = ceph_con_get,
831 .put = ceph_con_put, 1014 .put = ceph_con_put,
832 .dispatch = dispatch, 1015 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..8e396f2c0963 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,20 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
 44 * which are being done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
53 int buf_len;
51 struct completion completion; 54 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 55 struct ceph_msg *request; /* original request */
56 struct ceph_msg *reply; /* and reply */
54}; 57};
55 58
56struct ceph_mon_client { 59struct ceph_mon_client {
@@ -61,7 +64,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 64 struct delayed_work delayed_work;
62 65
63 struct ceph_auth_client *auth; 66 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 67 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 68 int pending_auth;
66 69
67 bool hunting; 70 bool hunting;
@@ -70,14 +73,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 73 struct ceph_connection *con;
71 bool have_fsid; 74 bool have_fsid;
72 75
73 /* msg pools */ 76 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 77 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 78 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 79 u64 last_tid;
82 80
83 /* mds/osd map */ 81 /* mds/osd map */
@@ -114,6 +112,10 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114 112
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); 113extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116 114
115extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
116 u32 pool, u64 *snapid);
117 117
118extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
119 u32 pool, u64 snapid);
118 120
119#endif 121#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11 * We use msg pools to preallocate memory for messages we expect to 11{
12 * receive over the wire, to avoid getting ourselves into OOM 12 struct ceph_msgpool *pool = arg;
13 * conditions at unexpected times. We take use a few different 13 void *p;
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31 14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
32 20
33/* 21static void free_fn(void *element, void *arg)
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{ 22{
38 struct ceph_msg *msg; 23 ceph_msg_put(element);
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61} 24}
62 25
63int ceph_msgpool_init(struct ceph_msgpool *pool, 26int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking) 27 int front_len, int size, bool blocking, const char *name)
65{ 28{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len; 29 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs); 30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
72 pool->num = 0; 31 if (!pool->pool)
73 pool->min = min; 32 return -ENOMEM;
74 pool->blocking = blocking; 33 pool->name = name;
75 init_waitqueue_head(&pool->wait); 34 return 0;
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81} 35}
82 36
83void ceph_msgpool_destroy(struct ceph_msgpool *pool) 37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{ 38{
85 dout("msgpool_destroy %p\n", pool); 39 mempool_destroy(pool->pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90} 40}
91 41
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) 42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
93{ 44{
94 int ret; 45 if (front_len > pool->front_len) {
95 46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
96 spin_lock(&pool->lock); 47 pool->name, front_len, pool->front_len);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1); 48 WARN_ON(1);
113 49
114 /* try to alloc a fresh message */ 50 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL); 51 return ceph_msg_new(0, front_len, GFP_NOFS);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 } 52 }
129 53
130 while (1) { 54 return mempool_alloc(pool->pool, GFP_NOFS);
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163} 55}
164 56
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) 57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{ 58{
167 spin_lock(&pool->lock); 59 /* reset msg front_len; user may have changed it */
168 if (pool->num < pool->min) { 60 msg->front.iov_len = pool->front_len;
169 /* reset msg front_len; user may have changed it */ 61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172 62
173 kref_set(&msg->kref, 1); /* retake a single ref */ 63 kref_init(&msg->kref); /* retake single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186} 64}
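
The rewritten msgpool delegates all reserve management to a mempool_t: alloc_fn/free_fn just wrap ceph_msg_new()/ceph_msg_put(), and mempool_alloc() dips into the preallocated reserve only when a fresh allocation fails. A compact user-space model of that fallback behaviour (simplified; the kernel's mempool also refills its reserve and can sleep):

    #include <stdio.h>
    #include <stdlib.h>

    struct mempool {
        void **reserve;
        int nr, curr;
        size_t size;
    };

    static struct mempool *pool_create(int nr, size_t size)
    {
        struct mempool *p = malloc(sizeof(*p));

        if (!p)
            return NULL;
        p->reserve = calloc(nr, sizeof(void *));
        p->nr = p->curr = nr;
        p->size = size;
        for (int i = 0; i < nr; i++)
            p->reserve[i] = malloc(size);   /* the guaranteed minimum */
        return p;
    }

    static void *pool_alloc(struct mempool *p)
    {
        void *e = malloc(p->size);      /* first try a fresh allocation */

        if (e)
            return e;
        return p->curr ? p->reserve[--p->curr] : NULL;
    }

    static void pool_free(struct mempool *p, void *e)
    {
        if (p->curr < p->nr)
            p->reserve[p->curr++] = e;  /* top up the reserve first */
        else
            free(e);
    }

    int main(void)
    {
        struct mempool *p = pool_create(2, 64);
        void *m = pool_alloc(p);

        pool_free(p, m);
        printf("reserve %d/%d\n", p->curr, p->nr);
        return 0;
    }
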
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..680d3d648cac 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -1,5 +1,5 @@
1#ifndef __MSGR_H 1#ifndef CEPH_MSGR_H
2#define __MSGR_H 2#define CEPH_MSGR_H
3 3
4/* 4/*
5 * Data types for message passing layer used by Ceph. 5 * Data types for message passing layer used by Ceph.
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
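
The renamed ceph_msg_header_old keeps the previous wire layout while the new ceph_msg_header shrinks src from a full entity_inst to a bare ceph_entity_name. A standalone check of the new packed size, assuming ceph_entity_name is a packed { __u8 type; __le64 num; } as elsewhere in these headers:

    #include <stdint.h>
    #include <stdio.h>

    struct entity_name {
        uint8_t type;
        uint64_t num;
    } __attribute__ ((packed));         /* 9 bytes on the wire */

    struct msg_header {
        uint64_t seq, tid;
        uint16_t type, priority, version;
        uint32_t front_len, middle_len, data_len;
        uint16_t data_off;
        struct entity_name src;
        uint32_t reserved, crc;
    } __attribute__ ((packed));

    int main(void)
    {
        /* 8+8 + 2+2+2 + 4+4+4 + 2 + 9 + 4+4 = 53 bytes */
        printf("%zu\n", sizeof(struct msg_header));
        return 0;
    }
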
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..3b5571b8ce22 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
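The allocation paths above move from ERR_PTR()-style returns to plain NULL: the only failure mode left is out-of-memory, so callers (see the readpages/writepages hunks below) just test !req and pick -ENOMEM themselves. A userspace sketch contrasting the two conventions, with the kernel's ERR_PTR machinery re-implemented for illustration:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* minimal imitation of the kernel's ERR_PTR()/IS_ERR() helpers: an
 * errno is smuggled into the top 4095 values of the address space */
#define MAX_ERRNO 4095
static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct request { int id; };

/* new-style: NULL is the only error, so no IS_ERR() dance is needed */
static struct request *new_request(void)
{
	return calloc(1, sizeof(struct request));
}

int main(void)
{
	struct request *req = new_request();

	if (!req)
		return -ENOMEM;  /* caller chooses the errno, as in the hunk */
	printf("allocated request %p\n", (void *)req);
	free(req);
	return 0;
}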
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
361{ 361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1); 363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) 364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
365 kfree(osd); 369 kfree(osd);
370 }
366} 371}
367 372
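put_osd() now tears down the authorizer before freeing the osd: whichever caller drops the reference count to zero owns the cleanup of everything the object still points at. A userspace sketch of the same drop-to-zero idiom using C11 atomics (the types and the destroy callback are stand-ins for the ceph structures):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct osd {
	atomic_int o_ref;
	void *o_authorizer;   /* dependent resource, freed before the osd */
};

static void destroy_authorizer(void *a)
{
	free(a);
}

static void put_osd(struct osd *osd)
{
	/* fetch_sub returns the old value: 1 means we took it to zero */
	if (atomic_fetch_sub(&osd->o_ref, 1) == 1) {
		if (osd->o_authorizer)
			destroy_authorizer(osd->o_authorizer);
		free(osd);
	}
}

int main(void)
{
	struct osd *osd = calloc(1, sizeof(*osd));

	if (!osd)
		return 1;
	atomic_init(&osd->o_ref, 1);
	osd->o_authorizer = malloc(16);
	put_osd(osd);   /* last ref: authorizer freed, then the osd */
	printf("done\n");
	return 0;
}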
368/* 373/*
@@ -544,7 +549,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
544 */ 549 */
545static void __cancel_request(struct ceph_osd_request *req) 550static void __cancel_request(struct ceph_osd_request *req)
546{ 551{
547 if (req->r_sent) { 552 if (req->r_sent && req->r_osd) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request); 553 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0; 554 req->r_sent = 0;
550 } 555 }
@@ -656,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc,
656 reqhead->reassert_version = req->r_reassert_version; 661 reqhead->reassert_version = req->r_reassert_version;
657 662
658 req->r_stamp = jiffies; 663 req->r_stamp = jiffies;
659 list_move_tail(&osdc->req_lru, &req->r_req_lru_item); 664 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
660 665
661 ceph_msg_get(req->r_request); /* send consumes a ref */ 666 ceph_msg_get(req->r_request); /* send consumes a ref */
662 ceph_con_send(&req->r_osd->o_con, req->r_request); 667 ceph_con_send(&req->r_osd->o_con, req->r_request);
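The one-line fix above swaps two arguments that were in the wrong order: list_move_tail() takes the entry being moved first and the destination list head second, so the old call spliced the LRU head onto the request instead of putting the request at the LRU tail. A freestanding sketch of the primitives involved (the real ones live in <linux/list.h>):

struct list_head {
	struct list_head *next, *prev;
};

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

/* entry first, destination head second: the order the fix restores */
static void list_move_tail(struct list_head *entry, struct list_head *head)
{
	list_del(entry);
	list_add_tail(entry, head);
}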
@@ -715,7 +720,7 @@ static void handle_timeout(struct work_struct *work)
715 * should mark the osd as failed and we should find out about 720 * should mark the osd as failed and we should find out about
716 * it from an updated osd map. 721 * it from an updated osd map.
717 */ 722 */
718 while (!list_empty(&osdc->req_lru)) { 723 while (timeout && !list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item); 725 r_req_lru_item);
721 726
@@ -857,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
857 if (req->r_callback) 862 if (req->r_callback)
858 req->r_callback(req, msg); 863 req->r_callback(req, msg);
859 else 864 else
860 complete(&req->r_completion); 865 complete_all(&req->r_completion);
861 866
862 if (flags & CEPH_OSD_FLAG_ONDISK) { 867 if (flags & CEPH_OSD_FLAG_ONDISK) {
863 if (req->r_safe_callback) 868 if (req->r_safe_callback)
864 req->r_safe_callback(req, msg); 869 req->r_safe_callback(req, msg);
865 complete(&req->r_safe_completion); /* fsync waiter */ 870 complete_all(&req->r_safe_completion); /* fsync waiter */
866 } 871 }
867 872
868done: 873done:
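Switching complete() to complete_all() matters because a struct completion woken with complete() releases exactly one waiter, while several threads may be parked on r_completion or r_safe_completion for the same request. A userspace analogue using a condition variable, where the distinction is signal versus broadcast:

#include <pthread.h>
#include <stdbool.h>

/* rough analogue of struct completion; complete() corresponds to
 * pthread_cond_signal() (one waiter), complete_all() to
 * pthread_cond_broadcast() (every waiter) */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void complete_all(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_broadcast(&c->cond);  /* wake *all* fsync waiters */
	pthread_mutex_unlock(&c->lock);
}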
@@ -1078,6 +1083,7 @@ done:
1078 if (newmap) 1083 if (newmap)
1079 kick_requests(osdc, NULL); 1084 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem); 1085 up_read(&osdc->map_sem);
1086 wake_up_all(&osdc->client->auth_wq);
1081 return; 1087 return;
1082 1088
1083bad: 1089bad:
@@ -1087,45 +1093,6 @@ bad:
1087 return; 1093 return;
1088} 1094}
1089 1095
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call prepare_pages to
1094 * find those pages.
1095 * 0 = success, -1 failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
1128
1129/* 1096/*
1130 * Register request, send initial attempt. 1097 * Register request, send initial attempt.
1131 */ 1098 */
@@ -1252,11 +1219,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1252 if (!osdc->req_mempool) 1219 if (!osdc->req_mempool)
1253 goto out; 1220 goto out;
1254 1221
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1256 if (err < 0) 1224 if (err < 0)
1257 goto out_mempool; 1225 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true); 1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1260 if (err < 0) 1229 if (err < 0)
1261 goto out_msgpool; 1230 goto out_msgpool;
1262 return 0; 1231 return 0;
@@ -1302,13 +1271,11 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL, 1272 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1); 1273 false, 1);
1305 if (IS_ERR(req)) 1274 if (!req)
1306 return PTR_ERR(req); 1275 return -ENOMEM;
1307 1276
1308 /* it may be a short read due to an object boundary */ 1277 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages; 1278 req->r_pages = pages;
1310 num_pages = calc_pages_for(off, *plen);
1311 req->r_num_pages = num_pages;
1312 1279
1313 dout("readpages final extent is %llu~%llu (%d pages)\n", 1280 dout("readpages final extent is %llu~%llu (%d pages)\n",
1314 off, *plen, req->r_num_pages); 1281 off, *plen, req->r_num_pages);
@@ -1345,12 +1312,11 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1345 snapc, do_sync, 1312 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime, 1313 truncate_seq, truncate_size, mtime,
1347 nofail, 1); 1314 nofail, 1);
1348 if (IS_ERR(req)) 1315 if (!req)
1349 return PTR_ERR(req); 1316 return -ENOMEM;
1350 1317
1351 /* it may be a short write due to an object boundary */ 1318 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages; 1319 req->r_pages = pages;
1353 req->r_num_pages = calc_pages_for(off, len);
1354 dout("writepages %llu~%llu (%d pages)\n", off, len, 1320 dout("writepages %llu~%llu (%d pages)\n", off, len,
1355 req->r_num_pages); 1321 req->r_num_pages);
1356 1322
@@ -1375,7 +1341,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1375 int type = le16_to_cpu(msg->hdr.type); 1341 int type = le16_to_cpu(msg->hdr.type);
1376 1342
1377 if (!osd) 1343 if (!osd)
1378 return; 1344 goto out;
1379 osdc = osd->o_osdc; 1345 osdc = osd->o_osdc;
1380 1346
1381 switch (type) { 1347 switch (type) {
@@ -1390,11 +1356,13 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1390 pr_err("received unknown message type %d %s\n", type, 1356 pr_err("received unknown message type %d %s\n", type,
1391 ceph_msg_type_name(type)); 1357 ceph_msg_type_name(type));
1392 } 1358 }
1359out:
1393 ceph_msg_put(msg); 1360 ceph_msg_put(msg);
1394} 1361}
1395 1362
1396/* 1363/*
1397 * lookup and return message for incoming reply 1364 * lookup and return message for incoming reply. set up reply message
1365 * pages.
1398 */ 1366 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con, 1367static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr, 1368 struct ceph_msg_header *hdr,
@@ -1407,7 +1375,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1407 int front = le32_to_cpu(hdr->front_len); 1375 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len); 1376 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid; 1377 u64 tid;
1410 int err;
1411 1378
1412 tid = le64_to_cpu(hdr->tid); 1379 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex); 1380 mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1392,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1425 req->r_reply, req->r_con_filling_msg); 1392 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1393 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg); 1394 ceph_con_put(req->r_con_filling_msg);
1395 req->r_con_filling_msg = NULL;
1428 } 1396 }
1429 1397
1430 if (front > req->r_reply->front.iov_len) { 1398 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n", 1399 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len); 1400 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1401 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1434 if (IS_ERR(m)) 1402 if (!m)
1435 goto out; 1403 goto out;
1436 ceph_msg_put(req->r_reply); 1404 ceph_msg_put(req->r_reply);
1437 req->r_reply = m; 1405 req->r_reply = m;
@@ -1439,12 +1407,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1439 m = ceph_msg_get(req->r_reply); 1407 m = ceph_msg_get(req->r_reply);
1440 1408
1441 if (data_len > 0) { 1409 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m); 1410 unsigned data_off = le16_to_cpu(hdr->data_off);
1443 if (err < 0) { 1411 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1412
1413 if (unlikely(req->r_num_pages < want)) {
1414 pr_warning("tid %lld reply %d > expected %d pages\n",
1415 tid, want, m->nr_pages);
1444 *skip = 1; 1416 *skip = 1;
1445 ceph_msg_put(m); 1417 ceph_msg_put(m);
1446 m = ERR_PTR(err); 1418 m = NULL;
1419 goto out;
1447 } 1420 }
1421 m->pages = req->r_pages;
1422 m->nr_pages = req->r_num_pages;
1448 } 1423 }
1449 *skip = 0; 1424 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con); 1425 req->r_con_filling_msg = ceph_con_get(con);
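The inlined check above compares the page count the reply will need, calc_pages_for(data_off & ~PAGE_MASK, data_len), against the pages preallocated on the request. The helper counts how many pages a byte range touches; a userspace re-derivation of that arithmetic, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* pages spanned by the byte range [off, off + len) */
static unsigned long calc_pages_for(unsigned long off, unsigned long len)
{
	return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (off >> PAGE_SHIFT);
}

int main(void)
{
	/* 100 bytes starting 50 bytes before a page boundary: 2 pages */
	printf("%lu\n", calc_pages_for(PAGE_SIZE - 50, 100));
	/* exactly one page, page-aligned: 1 page */
	printf("%lu\n", calc_pages_for(0, PAGE_SIZE));
	return 0;
}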
@@ -1466,7 +1441,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1466 1441
1467 switch (type) { 1442 switch (type) {
1468 case CEPH_MSG_OSD_MAP: 1443 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL); 1444 return ceph_msg_new(type, front, GFP_NOFS);
1470 case CEPH_MSG_OSD_OPREPLY: 1445 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip); 1446 return get_reply(con, hdr, skip);
1472 default: 1447 default:
@@ -1498,8 +1473,8 @@ static void put_osd_con(struct ceph_connection *con)
1498 * authentication 1473 * authentication
1499 */ 1474 */
1500static int get_authorizer(struct ceph_connection *con, 1475static int get_authorizer(struct ceph_connection *con,
1501 void **buf, int *len, int *proto, 1476 void **buf, int *len, int *proto,
1502 void **reply_buf, int *reply_len, int force_new) 1477 void **reply_buf, int *reply_len, int force_new)
1503{ 1478{
1504 struct ceph_osd *o = con->private; 1479 struct ceph_osd *o = con->private;
1505 struct ceph_osd_client *osdc = o->o_osdc; 1480 struct ceph_osd_client *osdc = o->o_osdc;
@@ -1519,7 +1494,7 @@ static int get_authorizer(struct ceph_connection *con,
1519 &o->o_authorizer_reply_buf, 1494 &o->o_authorizer_reply_buf,
1520 &o->o_authorizer_reply_buf_len); 1495 &o->o_authorizer_reply_buf_len);
1521 if (ret) 1496 if (ret)
1522 return ret; 1497 return ret;
1523 } 1498 }
1524 1499
1525 *proto = ac->protocol; 1500 *proto = ac->protocol;
@@ -1552,7 +1527,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1552 return ceph_monc_validate_auth(&osdc->client->monc); 1527 return ceph_monc_validate_auth(&osdc->client->monc);
1553} 1528}
1554 1529
1555const static struct ceph_connection_operations osd_con_ops = { 1530static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con, 1531 .get = get_osd_con,
1557 .put = put_osd_con, 1532 .put = put_osd_con,
1558 .dispatch = dispatch, 1533 .dispatch = dispatch,
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cfdd8f4388b7..e31f118f1392 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -424,12 +424,30 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
424 kfree(pi); 424 kfree(pi);
425} 425}
426 426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi) 427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
428{ 428{
429 unsigned n, m;
430
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 431 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi); 432 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); 433
434 /* num_snaps * snap_info_t */
435 n = le32_to_cpu(pi->v.num_snaps);
436 while (n--) {
437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
438 sizeof(struct ceph_timespec), bad);
439 *p += sizeof(u64) + /* key */
440 1 + sizeof(u64) + /* u8, snapid */
441 sizeof(struct ceph_timespec);
442 m = ceph_decode_32(p); /* snap name */
443 *p += m;
444 }
445
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
447 return 0;
448
449bad:
450 return -EINVAL;
433} 451}
434 452
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
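__decode_pool() now takes the buffer end and walks the variable-length snapshot records instead of assuming a fixed stride, with every step guarded by ceph_decode_need(), which fails out with -EINVAL rather than reading past the buffer. The idiom in userspace form (the cursor type and helpers are stand-ins for the kernel's decode.h macros):

#include <stdint.h>
#include <string.h>

struct cursor {
	const uint8_t *p;    /* current position */
	const uint8_t *end;  /* one past the last valid byte */
};

/* ceph_decode_need(): refuse to touch bytes beyond `end` */
static int decode_need(struct cursor *c, size_t n)
{
	return (size_t)(c->end - c->p) >= n ? 0 : -1;
}

static int decode_u32(struct cursor *c, uint32_t *v)
{
	if (decode_need(c, sizeof(*v)))
		return -1;
	memcpy(v, c->p, sizeof(*v));  /* kernel also does le32_to_cpu() */
	c->p += sizeof(*v);
	return 0;
}

/* skip a variable-length blob whose length was just decoded */
static int decode_skip(struct cursor *c, size_t n)
{
	if (decode_need(c, n))
		return -1;
	c->p += n;
	return 0;
}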
@@ -568,9 +586,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
568 if (ev > CEPH_PG_POOL_VERSION) { 586 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 587 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION); 588 ev, CEPH_PG_POOL_VERSION);
589 kfree(pi);
571 goto bad; 590 goto bad;
572 } 591 }
573 __decode_pool(p, pi); 592 err = __decode_pool(p, end, pi);
593 if (err < 0)
594 goto bad;
574 __insert_pg_pool(&map->pg_pools, pi); 595 __insert_pg_pool(&map->pg_pools, pi);
575 } 596 }
576 597
@@ -706,7 +727,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
706 len, *p, end); 727 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end)); 728 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush)) 729 if (IS_ERR(newcrush))
709 return ERR_PTR(PTR_ERR(newcrush)); 730 return ERR_CAST(newcrush);
731 *p += len;
710 } 732 }
711 733
712 /* new flags? */ 734 /* new flags? */
@@ -758,7 +780,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
758 pi->id = pool; 780 pi->id = pool;
759 __insert_pg_pool(&map->pg_pools, pi); 781 __insert_pg_pool(&map->pg_pools, pi);
760 } 782 }
761 __decode_pool(p, pi); 783 err = __decode_pool(p, end, pi);
784 if (err < 0)
785 goto bad;
762 } 786 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0) 787 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad; 788 goto bad;
@@ -829,12 +853,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
829 /* remove any? */ 853 /* remove any? */
830 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, 854 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
831 node)->pgid, pgid) <= 0) { 855 node)->pgid, pgid) <= 0) {
832 struct rb_node *cur = rbp; 856 struct ceph_pg_mapping *cur =
857 rb_entry(rbp, struct ceph_pg_mapping, node);
858
833 rbp = rb_next(rbp); 859 rbp = rb_next(rbp);
834 dout(" removed pg_temp %llx\n", 860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
835 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, 861 rb_erase(&cur->node, &map->pg_temp);
836 node)->pgid); 862 kfree(cur);
837 rb_erase(cur, &map->pg_temp);
838 } 863 }
839 864
840 if (pglen) { 865 if (pglen) {
@@ -850,19 +875,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
850 for (j = 0; j < pglen; j++) 875 for (j = 0; j < pglen; j++)
851 pg->osds[j] = ceph_decode_32(p); 876 pg->osds[j] = ceph_decode_32(p);
852 err = __insert_pg_mapping(pg, &map->pg_temp); 877 err = __insert_pg_mapping(pg, &map->pg_temp);
853 if (err) 878 if (err) {
879 kfree(pg);
854 goto bad; 880 goto bad;
881 }
855 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, 882 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
856 pglen); 883 pglen);
857 } 884 }
858 } 885 }
859 while (rbp) { 886 while (rbp) {
860 struct rb_node *cur = rbp; 887 struct ceph_pg_mapping *cur =
888 rb_entry(rbp, struct ceph_pg_mapping, node);
889
861 rbp = rb_next(rbp); 890 rbp = rb_next(rbp);
862 dout(" removed pg_temp %llx\n", 891 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
863 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, 892 rb_erase(&cur->node, &map->pg_temp);
864 node)->pgid); 893 kfree(cur);
865 rb_erase(cur, &map->pg_temp);
866 } 894 }
867 895
868 /* ignore the rest */ 896 /* ignore the rest */
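Both loops above are rewritten so `cur` is the containing ceph_pg_mapping rather than the bare rb_node, and they add the kfree() the old loops were missing. rb_entry() is just container_of(); a sketch of the pattern, including the advance-before-erase step that keeps the iterator valid:

#include <stddef.h>
#include <stdlib.h>

struct rb_node { struct rb_node *dummy; };  /* stand-in */

struct pg_mapping {
	unsigned long long pgid;
	struct rb_node node;   /* embedded tree linkage */
};

/* rb_entry(): recover the enclosing struct from its embedded node */
#define rb_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* illustrative erase loop: advance the iterator *before* freeing */
static void erase_all(struct rb_node *rbp,
		      struct rb_node *(*next)(struct rb_node *),
		      void (*erase)(struct rb_node *))
{
	while (rbp) {
		struct pg_mapping *cur =
			rb_entry(rbp, struct pg_mapping, node);

		rbp = next(rbp);        /* step off cur first */
		erase(&cur->node);      /* unlink from the tree */
		free(cur);              /* now the whole mapping can go */
	}
}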
@@ -1020,8 +1048,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1020 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1021 pool->v.type, pool->v.size); 1049 pool->v.type, pool->v.size);
1022 if (ruleno < 0) { 1050 if (ruleno < 0) {
1023 pr_err("no crush rule pool %d type %d size %d\n", 1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1024 poolid, pool->v.type, pool->v.size); 1052 poolid, pool->v.crush_ruleset, pool->v.type,
1053 pool->v.size);
1025 return NULL; 1054 return NULL;
1026 } 1055 }
1027 1056
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..46a368b6dce5 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -5,10 +5,18 @@
5 5
6#include "pagelist.h" 6#include "pagelist.h"
7 7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 struct page *page = list_entry(pl->head.prev, struct page,
11 lru);
12 kunmap(page);
13}
14
8int ceph_pagelist_release(struct ceph_pagelist *pl) 15int ceph_pagelist_release(struct ceph_pagelist *pl)
9{ 16{
10 if (pl->mapped_tail) 17 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail); 18 ceph_pagelist_unmap_tail(pl);
19
12 while (!list_empty(&pl->head)) { 20 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page, 21 struct page *page = list_first_entry(&pl->head, struct page,
14 lru); 22 lru);
@@ -20,13 +28,13 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
20 28
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 29static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{ 30{
23 struct page *page = alloc_page(GFP_NOFS); 31 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page) 32 if (!page)
25 return -ENOMEM; 33 return -ENOMEM;
26 pl->room += PAGE_SIZE; 34 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head); 35 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail) 36 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail); 37 ceph_pagelist_unmap_tail(pl);
30 pl->mapped_tail = kmap(page); 38 pl->mapped_tail = kmap(page);
31 return 0; 39 return 0;
32} 40}
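The pagelist change funnels both call sites through one helper that kunmap()s the current tail page, since only the tail is ever kept mapped. A userspace sketch of the same bookkeeping (heap buffers stand in for pages, a cached pointer stands in for the kmap):

#include <stdlib.h>

#define PAGE_SZ 4096

struct pagelist {
	char *pages[64];
	int npages;
	char *mapped_tail;   /* only the tail page stays "mapped" */
	size_t room;
};

static void pagelist_unmap_tail(struct pagelist *pl)
{
	pl->mapped_tail = NULL;   /* kunmap(tail page) in the kernel */
}

static int pagelist_addpage(struct pagelist *pl)
{
	char *page;

	if (pl->npages >= 64)
		return -1;
	page = calloc(1, PAGE_SZ);
	if (!page)
		return -1;                    /* -ENOMEM */
	pl->room += PAGE_SZ;
	if (pl->mapped_tail)
		pagelist_unmap_tail(pl);      /* old tail no longer written */
	pl->pages[pl->npages++] = page;
	pl->mapped_tail = page;               /* kmap(new page) in the kernel */
	return 0;
}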
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..6d5247f2e81b 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -1,5 +1,5 @@
1#ifndef __RADOS_H 1#ifndef CEPH_RADOS_H
2#define __RADOS_H 2#define CEPH_RADOS_H
3 3
4/* 4/*
5 * Data types for the Ceph distributed object storage layer RADOS 5 * Data types for the Ceph distributed object storage layer RADOS
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
101 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps; 103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 uid; 105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed)); 106} __attribute__ ((packed));
107 107
108/* 108/*
@@ -203,11 +203,13 @@ enum {
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, 203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204 204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, 205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
206 207
207 /** attrs **/ 208 /** attrs **/
208 /* read */ 209 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 210 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 211 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
211 213
212 /* write */ 214 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 215 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -271,6 +273,10 @@ static inline int ceph_osd_op_mode_modify(int op)
271 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
272} 274}
273 275
276/*
277 * note that the following tmap stuff is also defined in the ceph librados.h
278 * any modification here needs to be updated there
279 */
274#define CEPH_OSD_TMAP_HDR 'h' 280#define CEPH_OSD_TMAP_HDR 'h'
275#define CEPH_OSD_TMAP_SET 's' 281#define CEPH_OSD_TMAP_SET 's'
276#define CEPH_OSD_TMAP_RM 'r' 282#define CEPH_OSD_TMAP_RM 'r'
@@ -296,6 +302,7 @@ enum {
296 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
297 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
298 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
299}; 306};
300 307
301enum { 308enum {
@@ -305,6 +312,22 @@ enum {
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 312#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 313#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307 314
315/* xattr comparison */
316enum {
317 CEPH_OSD_CMPXATTR_OP_NOP = 0,
318 CEPH_OSD_CMPXATTR_OP_EQ = 1,
319 CEPH_OSD_CMPXATTR_OP_NE = 2,
320 CEPH_OSD_CMPXATTR_OP_GT = 3,
321 CEPH_OSD_CMPXATTR_OP_GTE = 4,
322 CEPH_OSD_CMPXATTR_OP_LT = 5,
323 CEPH_OSD_CMPXATTR_OP_LTE = 6
324};
325
326enum {
327 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
328 CEPH_OSD_CMPXATTR_MODE_U64 = 2
329};
330
308/* 331/*
309 * an individual object operation. each may be accompanied by some data 332 * an individual object operation. each may be accompanied by some data
310 * payload 333 * payload
@@ -321,6 +344,8 @@ struct ceph_osd_op {
321 struct { 344 struct {
322 __le32 name_len; 345 __le32 name_len;
323 __le32 value_len; 346 __le32 value_len;
347 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
348 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
324 } __attribute__ ((packed)) xattr; 349 } __attribute__ ((packed)) xattr;
325 struct { 350 struct {
326 __u8 class_len; 351 __u8 class_len;
@@ -331,6 +356,9 @@ struct ceph_osd_op {
331 struct { 356 struct {
332 __le64 cookie, count; 357 __le64 cookie, count;
333 } __attribute__ ((packed)) pgls; 358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
334 }; 362 };
335 __le32 payload_len; 363 __le32 payload_len;
336} __attribute__ ((packed)); 364} __attribute__ ((packed));
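The new CMPXATTR op extends the xattr member of ceph_osd_op with a comparison operator and a mode (string or u64). How an OSD-side check could apply the operator is sketched below; the evaluation function is hypothetical, only the enum values mirror the ones added above:

/* mirrors CEPH_OSD_CMPXATTR_OP_* above */
enum cmp_op {
	CMP_NOP = 0,
	CMP_EQ  = 1,
	CMP_NE  = 2,
	CMP_GT  = 3,
	CMP_GTE = 4,
	CMP_LT  = 5,
	CMP_LTE = 6,
};

/* hypothetical evaluator: `c` is a strcmp()-style result comparing the
 * stored xattr against the operand sent with the op */
static int cmp_op_holds(enum cmp_op op, int c)
{
	switch (op) {
	case CMP_EQ:  return c == 0;
	case CMP_NE:  return c != 0;
	case CMP_GT:  return c > 0;
	case CMP_GTE: return c >= 0;
	case CMP_LT:  return c < 0;
	case CMP_LTE: return c <= 0;
	case CMP_NOP:
	default:      return 1;
	}
}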
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..190b6c4a6f2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -119,6 +119,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
119 INIT_LIST_HEAD(&realm->children); 119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item); 120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item); 121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->dirty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps); 123 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock); 124 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm); 125 __insert_snap_realm(&mdsc->snap_realms, realm);
@@ -435,7 +436,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{ 436{
436 struct inode *inode = &ci->vfs_inode; 437 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap; 438 struct ceph_cap_snap *capsnap;
438 int used; 439 int used, dirty;
439 440
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); 441 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) { 442 if (!capsnap) {
@@ -445,6 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
445 446
446 spin_lock(&inode->i_lock); 447 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci); 448 used = __ceph_caps_used(ci);
449 dirty = __ceph_caps_dirty(ci);
448 if (__ceph_have_pending_cap_snap(ci)) { 450 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps, 451 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any 452 as no new writes are allowed to start when pending, so any
@@ -452,27 +454,37 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
452 cap_snap. lucky us. */ 454 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode); 455 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap); 456 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 457 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
458 (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
459 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc; 460 struct ceph_snap_context *snapc = ci->i_head_snapc;
457 461
462 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
463 capsnap, snapc);
458 igrab(inode); 464 igrab(inode);
459 465
460 atomic_set(&capsnap->nref, 1); 466 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci; 467 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item); 468 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item); 469 INIT_LIST_HEAD(&capsnap->flushing_item);
464 470
465 capsnap->follows = snapc->seq - 1; 471 capsnap->follows = snapc->seq;
466 capsnap->issued = __ceph_caps_issued(ci, NULL); 472 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci); 473 capsnap->dirty = dirty;
468 474
469 capsnap->mode = inode->i_mode; 475 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid; 476 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid; 477 capsnap->gid = inode->i_gid;
472 478
473 /* fixme? */ 479 if (dirty & CEPH_CAP_XATTR_EXCL) {
474 capsnap->xattr_blob = NULL; 480 __ceph_build_xattrs_blob(ci);
475 capsnap->xattr_len = 0; 481 capsnap->xattr_blob =
482 ceph_buffer_get(ci->i_xattrs.blob);
483 capsnap->xattr_version = ci->i_xattrs.version;
484 } else {
485 capsnap->xattr_blob = NULL;
486 capsnap->xattr_version = 0;
487 }
476 488
477 /* dirty page count moved from _head to this cap_snap; 489 /* dirty page count moved from _head to this cap_snap;
478 all subsequent writes page dirties occur _after_ this 490 all subsequent writes page dirties occur _after_ this
@@ -480,7 +492,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 492 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0; 493 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc; 494 capsnap->context = snapc;
483 ci->i_head_snapc = NULL; 495 ci->i_head_snapc =
496 ceph_get_snap_context(ci->i_snap_realm->cached_context);
497 dout(" new snapc is %p\n", ci->i_head_snapc);
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 498 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485 499
486 if (used & CEPH_CAP_FILE_WR) { 500 if (used & CEPH_CAP_FILE_WR) {
@@ -512,7 +526,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 526 struct ceph_cap_snap *capsnap)
513{ 527{
514 struct inode *inode = &ci->vfs_inode; 528 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 529 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 530
517 BUG_ON(capsnap->writing); 531 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 532 capsnap->size = inode->i_size;
@@ -539,6 +553,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
539 return 1; /* caller may want to ceph_flush_snaps */ 553 return 1; /* caller may want to ceph_flush_snaps */
540} 554}
541 555
556/*
557 * Queue cap_snaps for snap writeback for this realm and its children.
558 * Called under snap_rwsem, so realm topology won't change.
559 */
560static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
561{
562 struct ceph_inode_info *ci;
563 struct inode *lastinode = NULL;
564 struct ceph_snap_realm *child;
565
566 dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
567
568 spin_lock(&realm->inodes_with_caps_lock);
569 list_for_each_entry(ci, &realm->inodes_with_caps,
570 i_snap_realm_item) {
571 struct inode *inode = igrab(&ci->vfs_inode);
572 if (!inode)
573 continue;
574 spin_unlock(&realm->inodes_with_caps_lock);
575 if (lastinode)
576 iput(lastinode);
577 lastinode = inode;
578 ceph_queue_cap_snap(ci);
579 spin_lock(&realm->inodes_with_caps_lock);
580 }
581 spin_unlock(&realm->inodes_with_caps_lock);
582 if (lastinode)
583 iput(lastinode);
584
585 dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
586 list_for_each_entry(child, &realm->children, child_item)
587 queue_realm_cap_snaps(child);
588
589 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
590}
542 591
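queue_realm_cap_snaps() iterates a list protected by a spinlock while each visit may block, so it pins the current inode (igrab), drops the lock for the blocking work, and releases the previous pin only once it is off the lock. A userspace approximation of that shape, with a mutex and bare refcounts standing in for the spinlock and properly synchronized inode references:

#include <pthread.h>

struct node {
	struct node *next;
	int refs;              /* stands in for the inode refcount */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void visit_all(struct node *head, void (*work)(struct node *))
{
	struct node *last = NULL;

	pthread_mutex_lock(&list_lock);
	for (struct node *n = head; n; n = n->next) {
		n->refs++;                         /* igrab(): pin n */
		pthread_mutex_unlock(&list_lock);
		if (last)
			last->refs--;              /* iput() outside the lock */
		last = n;
		work(n);                           /* may block */
		pthread_mutex_lock(&list_lock);    /* relock before stepping */
	}
	pthread_mutex_unlock(&list_lock);
	if (last)
		last->refs--;                      /* drop the final pin */
}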
543/* 592/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies 593 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
@@ -556,6 +605,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
556 struct ceph_snap_realm *realm; 605 struct ceph_snap_realm *realm;
557 int invalidate = 0; 606 int invalidate = 0;
558 int err = -ENOMEM; 607 int err = -ENOMEM;
608 LIST_HEAD(dirty_realms);
559 609
560 dout("update_snap_trace deletion=%d\n", deletion); 610 dout("update_snap_trace deletion=%d\n", deletion);
561more: 611more:
@@ -578,45 +628,6 @@ more:
578 } 628 }
579 } 629 }
580 630
581 if (le64_to_cpu(ri->seq) > realm->seq) {
582 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
583 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
584 /*
585 * if the realm seq has changed, queue a cap_snap for every
586 * inode with open caps. we do this _before_ we update
587 * the realm info so that we prepare for writeback under the
588 * _previous_ snap context.
589 *
590 * ...unless it's a snap deletion!
591 */
592 if (!deletion) {
593 struct ceph_inode_info *ci;
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq);
618 }
619
620 /* ensure the parent is correct */ 631 /* ensure the parent is correct */
621 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); 632 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
622 if (err < 0) 633 if (err < 0)
@@ -624,6 +635,8 @@ more:
624 invalidate += err; 635 invalidate += err;
625 636
626 if (le64_to_cpu(ri->seq) > realm->seq) { 637 if (le64_to_cpu(ri->seq) > realm->seq) {
638 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
639 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
627 /* update realm parameters, snap lists */ 640 /* update realm parameters, snap lists */
628 realm->seq = le64_to_cpu(ri->seq); 641 realm->seq = le64_to_cpu(ri->seq);
629 realm->created = le64_to_cpu(ri->created); 642 realm->created = le64_to_cpu(ri->created);
@@ -641,9 +654,17 @@ more:
641 if (err < 0) 654 if (err < 0)
642 goto fail; 655 goto fail;
643 656
657 /* queue realm for cap_snap creation */
658 list_add(&realm->dirty_item, &dirty_realms);
659
644 invalidate = 1; 660 invalidate = 1;
645 } else if (!realm->cached_context) { 661 } else if (!realm->cached_context) {
662 dout("update_snap_trace %llx %p seq %lld new\n",
663 realm->ino, realm, realm->seq);
646 invalidate = 1; 664 invalidate = 1;
665 } else {
666 dout("update_snap_trace %llx %p seq %lld unchanged\n",
667 realm->ino, realm, realm->seq);
647 } 668 }
648 669
649 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 670 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
@@ -656,6 +677,14 @@ more:
656 if (invalidate) 677 if (invalidate)
657 rebuild_snap_realms(realm); 678 rebuild_snap_realms(realm);
658 679
680 /*
681 * queue cap snaps _after_ we've built the new snap contexts,
682 * so that i_head_snapc can be set appropriately.
683 */
684 list_for_each_entry(realm, &dirty_realms, dirty_item) {
685 queue_realm_cap_snaps(realm);
686 }
687
659 __cleanup_empty_realms(mdsc); 688 __cleanup_empty_realms(mdsc);
660 return 0; 689 return 0;
661 690
@@ -688,7 +717,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
688 igrab(inode); 717 igrab(inode);
689 spin_unlock(&mdsc->snap_flush_lock); 718 spin_unlock(&mdsc->snap_flush_lock);
690 spin_lock(&inode->i_lock); 719 spin_lock(&inode->i_lock);
691 __ceph_flush_snaps(ci, &session); 720 __ceph_flush_snaps(ci, &session, 0);
692 spin_unlock(&inode->i_lock); 721 spin_unlock(&inode->i_lock);
693 iput(inode); 722 iput(inode);
694 spin_lock(&mdsc->snap_flush_lock); 723 spin_lock(&mdsc->snap_flush_lock);
@@ -789,6 +818,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
789 }; 818 };
790 struct inode *inode = ceph_find_inode(sb, vino); 819 struct inode *inode = ceph_find_inode(sb, vino);
791 struct ceph_inode_info *ci; 820 struct ceph_inode_info *ci;
821 struct ceph_snap_realm *oldrealm;
792 822
793 if (!inode) 823 if (!inode)
794 continue; 824 continue;
@@ -814,18 +844,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
814 dout(" will move %p to split realm %llx %p\n", 844 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm); 845 inode, realm->ino, realm);
816 /* 846 /*
817 * Remove the inode from the realm's inode 847 * Move the inode to the new realm
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */ 848 */
823 spin_lock(&realm->inodes_with_caps_lock); 849 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item); 850 list_del_init(&ci->i_snap_realm_item);
851 list_add(&ci->i_snap_realm_item,
852 &realm->inodes_with_caps);
853 oldrealm = ci->i_snap_realm;
854 ci->i_snap_realm = realm;
825 spin_unlock(&realm->inodes_with_caps_lock); 855 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock); 856 spin_unlock(&inode->i_lock);
827 857
828 ceph_queue_cap_snap(ci); 858 ceph_get_snap_realm(mdsc, realm);
859 ceph_put_snap_realm(mdsc, oldrealm);
829 860
830 iput(inode); 861 iput(inode);
831 continue; 862 continue;
@@ -853,43 +884,9 @@ skip_inode:
853 ceph_update_snap_trace(mdsc, p, e, 884 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY); 885 op == CEPH_SNAP_OP_DESTROY);
855 886
856 if (op == CEPH_SNAP_OP_SPLIT) { 887 if (op == CEPH_SNAP_OP_SPLIT)
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (list_empty(&ci->i_snap_realm_item)) {
873 struct ceph_snap_realm *oldrealm =
874 ci->i_snap_realm;
875
876 dout(" moving %p to split realm %llx %p\n",
877 inode, realm->ino, realm);
878 spin_lock(&realm->inodes_with_caps_lock);
879 list_add(&ci->i_snap_realm_item,
880 &realm->inodes_with_caps);
881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
886 spin_unlock(&inode->i_lock);
887 iput(inode);
888 }
889
890 /* we took a reference when we created the realm, above */ 888 /* we took a reference when we created the realm, above */
891 ceph_put_snap_realm(mdsc, realm); 889 ceph_put_snap_realm(mdsc, realm);
892 }
893 890
894 __cleanup_empty_realms(mdsc); 891 __cleanup_empty_realms(mdsc);
895 892
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 110857ba9269..9922628532b2 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -2,20 +2,18 @@
2#include "ceph_debug.h" 2#include "ceph_debug.h"
3 3
4#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
5#include <linux/ctype.h>
5#include <linux/fs.h> 6#include <linux/fs.h>
6#include <linux/inet.h> 7#include <linux/inet.h>
7#include <linux/in6.h> 8#include <linux/in6.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/mount.h> 10#include <linux/mount.h>
10#include <linux/parser.h> 11#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/statfs.h> 15#include <linux/statfs.h>
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19 17
20#include "decode.h" 18#include "decode.h"
21#include "super.h" 19#include "super.h"
@@ -92,7 +90,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
92 90
93 buf->f_files = le64_to_cpu(st.num_objects); 91 buf->f_files = le64_to_cpu(st.num_objects);
94 buf->f_ffree = -1; 92 buf->f_ffree = -1;
95 buf->f_namelen = PATH_MAX; 93 buf->f_namelen = NAME_MAX;
96 buf->f_frsize = PAGE_CACHE_SIZE; 94 buf->f_frsize = PAGE_CACHE_SIZE;
97 95
98 /* leave fsid little-endian, regardless of host endianness */ 96 /* leave fsid little-endian, regardless of host endianness */
@@ -104,15 +102,52 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
104} 102}
105 103
106 104
107static int ceph_syncfs(struct super_block *sb, int wait) 105static int ceph_sync_fs(struct super_block *sb, int wait)
108{ 106{
109 dout("sync_fs %d\n", wait); 107 struct ceph_client *client = ceph_sb_to_client(sb);
110 ceph_osdc_sync(&ceph_client(sb)->osdc); 108
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 109 if (!wait) {
112 dout("sync_fs %d done\n", wait); 110 dout("sync_fs (non-blocking)\n");
111 ceph_flush_dirty_caps(&client->mdsc);
112 dout("sync_fs (non-blocking) done\n");
113 return 0;
114 }
115
116 dout("sync_fs (blocking)\n");
117 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
118 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
119 dout("sync_fs (blocking) done\n");
113 return 0; 120 return 0;
114} 121}
115 122
123static int default_congestion_kb(void)
124{
125 int congestion_kb;
126
127 /*
128 * Copied from NFS
129 *
130 * congestion size, scale with available memory.
131 *
132 * 64MB: 8192k
133 * 128MB: 11585k
134 * 256MB: 16384k
135 * 512MB: 23170k
136 * 1GB: 32768k
137 * 2GB: 46340k
138 * 4GB: 65536k
139 * 8GB: 92681k
140 * 16GB: 131072k
141 *
142 * This allows larger machines to have larger/more transfers.
143 * Limit the default to 256M
144 */
145 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
146 if (congestion_kb > 256*1024)
147 congestion_kb = 256*1024;
148
149 return congestion_kb;
150}
116 151
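The helper moves up so that ceph_show_options(), just below, can compare the active value against the default. The table in its comment follows directly from the formula: with 4 KiB pages the shift by PAGE_SHIFT-10 is a multiply by 4, so the value is roughly 64*sqrt(pages). A userspace check of the table (link with -lm; int_sqrt() truncates, hence the off-by-one at 128MB):

#include <math.h>
#include <stdio.h>

int main(void)
{
	long mb[] = { 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 };

	for (unsigned i = 0; i < sizeof(mb) / sizeof(mb[0]); i++) {
		long pages = mb[i] * 1024 / 4;           /* totalram_pages, 4K pages */
		long kb = (16 * (long)sqrt(pages)) << 2; /* << (PAGE_SHIFT - 10) */

		if (kb > 256 * 1024)                     /* clamp to 256M */
			kb = 256 * 1024;
		printf("%6ldMB: %ldk\n", mb[i], kb);
	}
	return 0;
}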
117/** 152/**
118 * ceph_show_options - Show mount options in /proc/mounts 153 * ceph_show_options - Show mount options in /proc/mounts
@@ -125,9 +160,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
125 struct ceph_mount_args *args = client->mount_args; 160 struct ceph_mount_args *args = client->mount_args;
126 161
127 if (args->flags & CEPH_OPT_FSID) 162 if (args->flags & CEPH_OPT_FSID)
128 seq_printf(m, ",fsidmajor=%llu,fsidminor%llu", 163 seq_printf(m, ",fsid=%pU", &args->fsid);
129 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
130 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
131 if (args->flags & CEPH_OPT_NOSHARE) 164 if (args->flags & CEPH_OPT_NOSHARE)
132 seq_puts(m, ",noshare"); 165 seq_puts(m, ",noshare");
133 if (args->flags & CEPH_OPT_DIRSTAT) 166 if (args->flags & CEPH_OPT_DIRSTAT)
@@ -138,6 +171,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
138 seq_puts(m, ",nocrc"); 171 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 172 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir"); 173 seq_puts(m, ",noasyncreaddir");
174
175 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
176 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
177 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
178 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
179 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
180 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
181 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
182 seq_printf(m, ",osdkeepalivetimeout=%d",
183 args->osd_keepalive_timeout);
184 if (args->wsize)
185 seq_printf(m, ",wsize=%d", args->wsize);
186 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
187 seq_printf(m, ",rsize=%d", args->rsize);
188 if (args->congestion_kb != default_congestion_kb())
189 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
190 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
191 seq_printf(m, ",caps_wanted_delay_min=%d",
192 args->caps_wanted_delay_min);
193 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
194 seq_printf(m, ",caps_wanted_delay_max=%d",
195 args->caps_wanted_delay_max);
196 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
197 seq_printf(m, ",cap_release_safety=%d",
198 args->cap_release_safety);
199 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
200 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
201 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
202 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 203 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 204 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name) 205 if (args->name)
@@ -161,35 +223,6 @@ static void ceph_inode_init_once(void *foo)
161 inode_init_once(&ci->vfs_inode); 223 inode_init_once(&ci->vfs_inode);
162} 224}
163 225
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
192
193static int __init init_caches(void) 226static int __init init_caches(void)
194{ 227{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 228 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -254,7 +287,7 @@ static const struct super_operations ceph_super_ops = {
254 .alloc_inode = ceph_alloc_inode, 287 .alloc_inode = ceph_alloc_inode,
255 .destroy_inode = ceph_destroy_inode, 288 .destroy_inode = ceph_destroy_inode,
256 .write_inode = ceph_write_inode, 289 .write_inode = ceph_write_inode,
257 .sync_fs = ceph_syncfs, 290 .sync_fs = ceph_sync_fs,
258 .put_super = ceph_put_super, 291 .put_super = ceph_put_super,
259 .show_options = ceph_show_options, 292 .show_options = ceph_show_options,
260 .statfs = ceph_statfs, 293 .statfs = ceph_statfs,
@@ -297,9 +330,6 @@ const char *ceph_msg_type_name(int type)
297 * mount options 330 * mount options
298 */ 331 */
299enum { 332enum {
300 Opt_fsidmajor,
301 Opt_fsidminor,
302 Opt_monport,
303 Opt_wsize, 333 Opt_wsize,
304 Opt_rsize, 334 Opt_rsize,
305 Opt_osdtimeout, 335 Opt_osdtimeout,
@@ -308,10 +338,13 @@ enum {
308 Opt_osd_idle_ttl, 338 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min, 339 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max, 340 Opt_caps_wanted_delay_max,
341 Opt_cap_release_safety,
311 Opt_readdir_max_entries, 342 Opt_readdir_max_entries,
343 Opt_readdir_max_bytes,
312 Opt_congestion_kb, 344 Opt_congestion_kb,
313 Opt_last_int, 345 Opt_last_int,
314 /* int args above */ 346 /* int args above */
347 Opt_fsid,
315 Opt_snapdirname, 348 Opt_snapdirname,
316 Opt_name, 349 Opt_name,
317 Opt_secret, 350 Opt_secret,
@@ -328,9 +361,6 @@ enum {
328}; 361};
329 362
330static match_table_t arg_tokens = { 363static match_table_t arg_tokens = {
331 {Opt_fsidmajor, "fsidmajor=%ld"},
332 {Opt_fsidminor, "fsidminor=%ld"},
333 {Opt_monport, "monport=%d"},
334 {Opt_wsize, "wsize=%d"}, 364 {Opt_wsize, "wsize=%d"},
335 {Opt_rsize, "rsize=%d"}, 365 {Opt_rsize, "rsize=%d"},
336 {Opt_osdtimeout, "osdtimeout=%d"}, 366 {Opt_osdtimeout, "osdtimeout=%d"},
@@ -339,9 +369,12 @@ static match_table_t arg_tokens = {
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 369 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 370 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 371 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
372 {Opt_cap_release_safety, "cap_release_safety=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 373 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"}, 375 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */ 376 /* int args above */
377 {Opt_fsid, "fsid=%s"},
345 {Opt_snapdirname, "snapdirname=%s"}, 378 {Opt_snapdirname, "snapdirname=%s"},
346 {Opt_name, "name=%s"}, 379 {Opt_name, "name=%s"},
347 {Opt_secret, "secret=%s"}, 380 {Opt_secret, "secret=%s"},
@@ -357,6 +390,36 @@ static match_table_t arg_tokens = {
357 {-1, NULL} 390 {-1, NULL}
358}; 391};
359 392
393static int parse_fsid(const char *str, struct ceph_fsid *fsid)
394{
395 int i = 0;
396 char tmp[3];
397 int err = -EINVAL;
398 int d;
399
400 dout("parse_fsid '%s'\n", str);
401 tmp[2] = 0;
402 while (*str && i < 16) {
403 if (ispunct(*str)) {
404 str++;
405 continue;
406 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1]))
408 break;
409 tmp[0] = str[0];
410 tmp[1] = str[1];
411 if (sscanf(tmp, "%x", &d) < 1)
412 break;
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 }
417
418 if (i == 16)
419 err = 0;
420 dout("parse_fsid ret %d got fsid %pU", err, fsid);
421 return err;
422}
360 423
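parse_fsid() accepts the conventional UUID spelling by skipping punctuation and consuming hex-digit pairs until all 16 bytes are filled, failing with -EINVAL otherwise. The same logic exercised in userspace (the fsid string below is an arbitrary example; the kernel prints the parsed result with the %pU format):

#include <ctype.h>
#include <stdio.h>

struct fsid { unsigned char b[16]; };

static int hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	return (c | 0x20) - 'a' + 10;   /* 'a'-'f' or 'A'-'F' */
}

static int parse_fsid(const char *str, struct fsid *f)
{
	int i = 0;

	while (*str && i < 16) {
		if (ispunct((unsigned char)*str)) {
			str++;          /* skip the dashes in a UUID */
			continue;
		}
		if (!isxdigit((unsigned char)str[0]) ||
		    !isxdigit((unsigned char)str[1]))
			break;
		f->b[i++] = hexval(str[0]) << 4 | hexval(str[1]);
		str += 2;
	}
	return i == 16 ? 0 : -1;        /* -EINVAL in the kernel */
}

int main(void)
{
	struct fsid f;
	const char *s = "3adf4bc5-6e07-4e4f-8bbd-7b0f1e3a9c11";  /* example */

	if (parse_fsid(s, &f) == 0)
		printf("parsed, first byte 0x%02x\n", f.b[0]);    /* 0x3a */
	return 0;
}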
361static struct ceph_mount_args *parse_mount_args(int flags, char *options, 424static struct ceph_mount_args *parse_mount_args(int flags, char *options,
362 const char *dev_name, 425 const char *dev_name,
@@ -388,8 +451,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 451 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 452 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 453 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 454 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
392 args->max_readdir = 1024; 455 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
456 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
393 args->congestion_kb = default_congestion_kb(); 457 args->congestion_kb = default_congestion_kb();
394 458
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 459 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -439,12 +503,6 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
439 dout("got token %d\n", token); 503 dout("got token %d\n", token);
440 } 504 }
441 switch (token) { 505 switch (token) {
442 case Opt_fsidmajor:
443 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
444 break;
445 case Opt_fsidminor:
446 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
447 break;
448 case Opt_ip: 506 case Opt_ip:
449 err = ceph_parse_ips(argstr[0].from, 507 err = ceph_parse_ips(argstr[0].from,
450 argstr[0].to, 508 argstr[0].to,
@@ -455,6 +513,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
455 args->flags |= CEPH_OPT_MYIP; 513 args->flags |= CEPH_OPT_MYIP;
456 break; 514 break;
457 515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
458 case Opt_snapdirname: 521 case Opt_snapdirname:
459 kfree(args->snapdir_name); 522 kfree(args->snapdir_name);
460 args->snapdir_name = kstrndup(argstr[0].from, 523 args->snapdir_name = kstrndup(argstr[0].from,
@@ -485,6 +548,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
485 case Opt_osdkeepalivetimeout: 548 case Opt_osdkeepalivetimeout:
486 args->osd_keepalive_timeout = intval; 549 args->osd_keepalive_timeout = intval;
487 break; 550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
488 case Opt_mount_timeout: 554 case Opt_mount_timeout:
489 args->mount_timeout = intval; 555 args->mount_timeout = intval;
490 break; 556 break;
@@ -497,6 +563,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
497 case Opt_readdir_max_entries: 563 case Opt_readdir_max_entries:
498 args->max_readdir = intval; 564 args->max_readdir = intval;
499 break; 565 break;
566 case Opt_readdir_max_bytes:
567 args->max_readdir_bytes = intval;
568 break;
500 case Opt_congestion_kb: 569 case Opt_congestion_kb:
501 args->congestion_kb = intval; 570 args->congestion_kb = intval;
502 break; 571 break;
@@ -597,7 +666,6 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
597 666
598 /* caps */ 667 /* caps */
599 client->min_caps = args->max_readdir; 668 client->min_caps = args->max_readdir;
600 ceph_adjust_min_caps(client->min_caps);
601 669
602 /* subsystems */ 670 /* subsystems */
603 err = ceph_monc_init(&client->monc, client); 671 err = ceph_monc_init(&client->monc, client);
@@ -636,10 +704,16 @@ static void ceph_destroy_client(struct ceph_client *client)
636 704
637 /* unmount */ 705 /* unmount */
638 ceph_mdsc_stop(&client->mdsc); 706 ceph_mdsc_stop(&client->mdsc);
639 ceph_monc_stop(&client->monc);
640 ceph_osdc_stop(&client->osdc); 707 ceph_osdc_stop(&client->osdc);
641 708
642 ceph_adjust_min_caps(-client->min_caps); 709 /*
710 * make sure mds and osd connections close out before destroying
711 * the auth module, which is needed to free those connections'
712 * ceph_authorizers.
713 */
714 ceph_msgr_flush();
715
716 ceph_monc_stop(&client->monc);
643 717
644 ceph_debugfs_client_cleanup(client); 718 ceph_debugfs_client_cleanup(client);
645 destroy_workqueue(client->wb_wq); 719 destroy_workqueue(client->wb_wq);
@@ -665,13 +739,13 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
665{ 739{
666 if (client->have_fsid) { 740 if (client->have_fsid) {
667 if (ceph_fsid_compare(&client->fsid, fsid)) { 741 if (ceph_fsid_compare(&client->fsid, fsid)) {
668 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, 742 pr_err("bad fsid, had %pU got %pU",
669 PR_FSID(&client->fsid), PR_FSID(fsid)); 743 &client->fsid, fsid);
670 return -1; 744 return -1;
671 } 745 }
672 } else { 746 } else {
673 pr_info("client%lld fsid " FSID_FORMAT "\n", 747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
674 client->monc.auth->global_id, PR_FSID(fsid)); 748 fsid);
675 memcpy(&client->fsid, fsid, sizeof(*fsid)); 749 memcpy(&client->fsid, fsid, sizeof(*fsid));
676 ceph_debugfs_client_init(client); 750 ceph_debugfs_client_init(client);
677 client->have_fsid = true; 751 client->have_fsid = true;
@@ -682,9 +756,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
682/* 756/*
683 * true if we have the mon map (and have thus joined the cluster) 757 * true if we have the mon map (and have thus joined the cluster)
684 */ 758 */
685static int have_mon_map(struct ceph_client *client) 759static int have_mon_and_osd_map(struct ceph_client *client)
686{ 760{
687 return client->monc.monmap && client->monc.monmap->epoch; 761 return client->monc.monmap && client->monc.monmap->epoch &&
762 client->osdc.osdmap && client->osdc.osdmap->epoch;
688} 763}
689 764
690/* 765/*
@@ -704,7 +779,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
704 dout("open_root_inode opening '%s'\n", path); 779 dout("open_root_inode opening '%s'\n", path);
705 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 780 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
706 if (IS_ERR(req)) 781 if (IS_ERR(req))
707 return ERR_PTR(PTR_ERR(req)); 782 return ERR_CAST(req);
708 req->r_path1 = kstrdup(path, GFP_NOFS); 783 req->r_path1 = kstrdup(path, GFP_NOFS);
709 req->r_ino1.ino = CEPH_INO_ROOT; 784 req->r_ino1.ino = CEPH_INO_ROOT;
710 req->r_ino1.snap = CEPH_NOSNAP; 785 req->r_ino1.snap = CEPH_NOSNAP;
@@ -762,7 +837,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
762 if (err < 0) 837 if (err < 0)
763 goto out; 838 goto out;
764 839
765 while (!have_mon_map(client)) { 840 while (!have_mon_and_osd_map(client)) {
766 err = -EIO; 841 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout)) 842 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out; 843 goto out;
@@ -770,8 +845,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
770 /* wait */ 845 /* wait */
771 dout("mount waiting for mon_map\n"); 846 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq, 847 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0), 848 have_mon_and_osd_map(client) || (client->auth_err < 0),
774 timeout); 849 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS) 850 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out; 851 goto out;
777 if (client->auth_err < 0) { 852 if (client->auth_err < 0) {
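wait_event_interruptible_timeout() returns the remaining jiffies (at least 1) when the condition became true, 0 on timeout with the condition still false, and -ERESTARTSYS when a signal arrived first; the mount loop above relies on exactly that split. A minimal sketch of the same pattern, where ready() is a hypothetical predicate:

    long left = wait_event_interruptible_timeout(wq,
                    ready(client) || client->auth_err < 0, timeout);
    if (left == 0)
            return -EIO;        /* timed out, condition still false */
    if (left < 0)
            return left;        /* -ERESTARTSYS: interrupted by a signal */
    /* otherwise the condition is true; re-check which half woke us */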
@@ -884,6 +959,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
884/* 959/*
885 * construct our own bdi so we can control readahead, etc. 960 * construct our own bdi so we can control readahead, etc.
886 */ 961 */
962static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
963
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 964static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{ 965{
889 int err; 966 int err;
@@ -893,7 +970,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
893 client->backing_dev_info.ra_pages = 970 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 971 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895 >> PAGE_SHIFT; 972 >> PAGE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 973 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
974 atomic_long_inc_return(&bdi_seq));
897 if (!err) 975 if (!err)
898 sb->s_bdi = &client->backing_dev_info; 976 sb->s_bdi = &client->backing_dev_info;
899 return err; 977 return err;
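bdi_register_dev() named the bdi after a device number, which an anonymous (nodev) filesystem does not have; bdi_register() with a printf-style template plus a global sequence counter gives each client a unique name instead. A sketch of the counter idiom, assuming a file-scope counter:

    static atomic_long_t seq = ATOMIC_LONG_INIT(0);

    /* atomic_long_inc_return() increments and returns the new value,
     * so concurrent registrations can never observe the same number */
    err = bdi_register(&bdi, NULL, "ceph-%ld", atomic_long_inc_return(&seq));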
@@ -932,9 +1010,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
932 goto out; 1010 goto out;
933 } 1011 }
934 1012
935 if (ceph_client(sb) != client) { 1013 if (ceph_sb_to_client(sb) != client) {
936 ceph_destroy_client(client); 1014 ceph_destroy_client(client);
937 client = ceph_client(sb); 1015 client = ceph_sb_to_client(sb);
938 dout("get_sb got existing client %p\n", client); 1016 dout("get_sb got existing client %p\n", client);
939 } else { 1017 } else {
940 dout("get_sb using new client %p\n", client); 1018 dout("get_sb using new client %p\n", client);
@@ -952,8 +1030,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
952 1030
953out_splat: 1031out_splat:
954 ceph_mdsc_close_sessions(&client->mdsc); 1032 ceph_mdsc_close_sessions(&client->mdsc);
955 up_write(&sb->s_umount); 1033 deactivate_locked_super(sb);
956 deactivate_super(sb);
957 goto out_final; 1034 goto out_final;
958 1035
959out: 1036out:
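deactivate_locked_super() expects the superblock still locked and both drops the active reference and releases s_umount, so the error path no longer needs the separate pair of calls. A sketch of the before/after, assuming sb is held write-locked:

    up_write(&sb->s_umount);        /* old: unlock first ...          */
    deactivate_super(sb);           /* ... then drop the active ref   */

    deactivate_locked_super(sb);    /* new: drop the ref and unlock   */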
@@ -999,8 +1076,6 @@ static int __init init_ceph(void)
999 if (ret) 1076 if (ret)
1000 goto out_msgr; 1077 goto out_msgr;
1001 1078
1002 ceph_caps_init();
1003
1004 ret = register_filesystem(&ceph_fs_type); 1079 ret = register_filesystem(&ceph_fs_type);
1005 if (ret) 1080 if (ret)
1006 goto out_icache; 1081 goto out_icache;
@@ -1025,7 +1100,6 @@ static void __exit exit_ceph(void)
1025{ 1100{
1026 dout("exit_ceph\n"); 1101 dout("exit_ceph\n");
1027 unregister_filesystem(&ceph_fs_type); 1102 unregister_filesystem(&ceph_fs_type);
1028 ceph_caps_finalize();
1029 destroy_caches(); 1103 destroy_caches();
1030 ceph_msgr_exit(); 1104 ceph_msgr_exit();
1031 ceph_debugfs_cleanup(); 1105 ceph_debugfs_cleanup();
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..b87638e84c4b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,7 +10,6 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/slab.h>
14#include <linux/wait.h> 13#include <linux/wait.h>
15#include <linux/writeback.h> 14#include <linux/writeback.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
@@ -32,6 +31,12 @@
32#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
33 32
34/* 33/*
34 * Supported features
35 */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38
39/*
35 * mount options 40 * mount options
36 */ 41 */
37#define CEPH_OPT_FSID (1<<0) 42#define CEPH_OPT_FSID (1<<0)
@@ -52,24 +57,25 @@
52 57
53struct ceph_mount_args { 58struct ceph_mount_args {
54 int sb_flags; 59 int sb_flags;
60 int flags;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
55 int num_mon; 63 int num_mon;
56 struct ceph_entity_addr *mon_addr; 64 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout; 65 int mount_timeout;
59 int osd_idle_ttl; 66 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66 int congestion_kb; /* max readdir size */
67 int osd_timeout; 67 int osd_timeout;
68 int osd_keepalive_timeout; 68 int osd_keepalive_timeout;
69 int wsize;
70 int rsize; /* max readahead */
71 int congestion_kb; /* max writeback in flight */
72 int caps_wanted_delay_min, caps_wanted_delay_max;
73 int cap_release_safety;
 74 int max_readdir; /* max readdir result (entries) */

75 int max_readdir_bytes; /* max readdir result (bytes) */
69 char *snapdir_name; /* default ".snap" */ 76 char *snapdir_name; /* default ".snap" */
70 char *name; 77 char *name;
71 char *secret; 78 char *secret;
72 int cap_release_safety;
73}; 79};
74 80
75/* 81/*
@@ -80,13 +86,14 @@ struct ceph_mount_args {
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5 86#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60 87#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 88#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
89#define CEPH_MAX_READDIR_DEFAULT 1024
90#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
83 91
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 92#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 93#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86 94
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 95#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest" 96#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/* 97/*
91 * Delay telling the MDS we no longer want caps, in case we reopen 98 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap 99 * the file. Delay a minimum amount of time, even if we send a cap
@@ -96,6 +103,7 @@ struct ceph_mount_args {
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 103#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 104#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98 105
106#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
99 107
100/* mount state */ 108/* mount state */
101enum { 109enum {
@@ -160,12 +168,6 @@ struct ceph_client {
160#endif 168#endif
161}; 169};
162 170
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/* 171/*
170 * File i/o capability. This tracks shared state with the metadata 172 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read 173 * server that allows us to cache or writeback attributes or to read
@@ -214,8 +216,7 @@ struct ceph_cap_snap {
214 uid_t uid; 216 uid_t uid;
215 gid_t gid; 217 gid_t gid;
216 218
217 void *xattr_blob; 219 struct ceph_buffer *xattr_blob;
218 int xattr_len;
219 u64 xattr_version; 220 u64 xattr_version;
220 221
221 u64 size; 222 u64 size;
@@ -227,8 +228,11 @@ struct ceph_cap_snap {
227 228
228static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) 229static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
229{ 230{
230 if (atomic_dec_and_test(&capsnap->nref)) 231 if (atomic_dec_and_test(&capsnap->nref)) {
232 if (capsnap->xattr_blob)
233 ceph_buffer_put(capsnap->xattr_blob);
231 kfree(capsnap); 234 kfree(capsnap);
235 }
232} 236}
233 237
234/* 238/*
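The snapshot's xattr blob is now a reference-counted ceph_buffer rather than a raw allocation, so the final put must release it before freeing the capsnap. This is the usual atomic_dec_and_test teardown: only the thread that drops the last reference sees true, and it alone frees the object and anything it still owns. A generic sketch, where put_buf() is a hypothetical helper standing in for ceph_buffer_put():

    if (atomic_dec_and_test(&obj->nref)) {  /* true only for the last put */
            if (obj->blob)
                    put_buf(obj->blob);     /* release owned references first */
            kfree(obj);
    }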
@@ -340,7 +344,8 @@ struct ceph_inode_info {
340 unsigned i_cap_exporting_issued; 344 unsigned i_cap_exporting_issued;
341 struct ceph_cap_reservation i_cap_migration_resv; 345 struct ceph_cap_reservation i_cap_migration_resv;
342 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 346 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
343 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ 347 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
348 dirty|flushing caps */
344 unsigned i_snap_caps; /* cap bits for snapped files */ 349 unsigned i_snap_caps; /* cap bits for snapped files */
345 350
346 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 351 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
@@ -564,11 +569,13 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
564/* what the mds thinks we want */ 569/* what the mds thinks we want */
565extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); 570extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
566 571
567extern void ceph_caps_init(void); 572extern void ceph_caps_init(struct ceph_mds_client *mdsc);
568extern void ceph_caps_finalize(void); 573extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
569extern void ceph_adjust_min_caps(int delta); 574extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
570extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); 575extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
571extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); 576 struct ceph_cap_reservation *ctx, int need);
577extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
578 struct ceph_cap_reservation *ctx);
572extern void ceph_reservation_status(struct ceph_client *client, 579extern void ceph_reservation_status(struct ceph_client *client,
573 int *total, int *avail, int *used, 580 int *total, int *avail, int *used,
574 int *reserved, int *min); 581 int *reserved, int *min);
@@ -683,6 +690,8 @@ struct ceph_snap_realm {
683 690
684 struct list_head empty_item; /* if i have ref==0 */ 691 struct list_head empty_item; /* if i have ref==0 */
685 692
693 struct list_head dirty_item; /* if realm needs new context */
694
686 /* the current set of snaps for this realm */ 695 /* the current set of snaps for this realm */
687 struct ceph_snap_context *cached_context; 696 struct ceph_snap_context *cached_context;
688 697
@@ -742,13 +751,6 @@ extern struct kmem_cache *ceph_file_cachep;
742extern const char *ceph_msg_type_name(int type); 751extern const char *ceph_msg_type_name(int type);
743extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 752extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
744 753
745#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
746 "%02x%02x%02x%02x%02x%02x"
747#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
748 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
749 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
750 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
751
752/* inode.c */ 754/* inode.c */
753extern const struct inode_operations ceph_file_iops; 755extern const struct inode_operations ceph_file_iops;
754 756
@@ -810,20 +812,24 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
810 __ceph_remove_cap(cap); 812 __ceph_remove_cap(cap);
811 spin_unlock(&inode->i_lock); 813 spin_unlock(&inode->i_lock);
812} 814}
813extern void ceph_put_cap(struct ceph_cap *cap); 815extern void ceph_put_cap(struct ceph_mds_client *mdsc,
816 struct ceph_cap *cap);
814 817
815extern void ceph_queue_caps_release(struct inode *inode); 818extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 819extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); 820extern int ceph_fsync(struct file *file, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 821extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session); 822 struct ceph_mds_session *session);
823extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
824 int mds);
820extern int ceph_get_cap_mds(struct inode *inode); 825extern int ceph_get_cap_mds(struct inode *inode);
821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); 826extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); 827extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
823extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 828extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
824 struct ceph_snap_context *snapc); 829 struct ceph_snap_context *snapc);
825extern void __ceph_flush_snaps(struct ceph_inode_info *ci, 830extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
826 struct ceph_mds_session **psession); 831 struct ceph_mds_session **psession,
832 int again);
827extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, 833extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
828 struct ceph_mds_session *session); 834 struct ceph_mds_session *session);
829extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); 835extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -861,7 +867,7 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages);
861/* dir.c */ 867/* dir.c */
862extern const struct file_operations ceph_dir_fops; 868extern const struct file_operations ceph_dir_fops;
863extern const struct inode_operations ceph_dir_iops; 869extern const struct inode_operations ceph_dir_iops;
864extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 870extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
865 ceph_snapdir_dentry_ops; 871 ceph_snapdir_dentry_ops;
866 872
867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 873extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
@@ -871,6 +877,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
871extern void ceph_dentry_lru_add(struct dentry *dn); 877extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn); 878extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn); 879extern void ceph_dentry_lru_del(struct dentry *dn);
880extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
874 881
875/* 882/*
876 * our d_ops vary depending on whether the inode is live, 883 * our d_ops vary depending on whether the inode is live,
@@ -891,6 +898,14 @@ extern void ceph_debugfs_cleanup(void);
891extern int ceph_debugfs_client_init(struct ceph_client *client); 898extern int ceph_debugfs_client_init(struct ceph_client *client);
892extern void ceph_debugfs_client_cleanup(struct ceph_client *client); 899extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
893 900
901/* locks.c */
902extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
903extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
904extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
905extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
906 int p_locks, int f_locks);
907extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
908
894static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) 909static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
895{ 910{
896 if (dentry && dentry->d_parent) 911 if (dentry && dentry->d_parent)
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..9578af610b73 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
7 7
8static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
9{ 9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76} 77}
77 78
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL } 88 { true, NULL, NULL }
88}; 89};
89 90
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
107} 108}
108 109
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
111 { NULL, NULL } 112 { NULL, NULL }
112}; 113};
113 114
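With the rename, the ceph virtual xattrs live under a filesystem-specific "ceph." prefix instead of squatting in the "user." namespace, and ceph_is_valid_xattr() explicitly accepts that prefix. From userspace they read like any other attribute; a small sketch, with the mount point and directory purely illustrative:

    #include <stdio.h>
    #include <sys/xattr.h>

    int main(void)
    {
            char buf[64];
            ssize_t n = getxattr("/mnt/ceph/dir", "ceph.dir.entries",
                                 buf, sizeof(buf) - 1);
            if (n >= 0) {
                    buf[n] = '\0';
                    printf("entries: %s\n", buf);
            }
            return 0;
    }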
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
186 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
188 } 189 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 xattr->val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
197 if (val) 192 if (val)
@@ -342,6 +337,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
342} 337}
343 338
344static int __build_xattrs(struct inode *inode) 339static int __build_xattrs(struct inode *inode)
340 __releases(inode->i_lock)
341 __acquires(inode->i_lock)
345{ 342{
346 u32 namelen; 343 u32 namelen;
347 u32 numattr = 0; 344 u32 numattr = 0;
@@ -488,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; 485 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL; 486 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false; 487 ci->i_xattrs.dirty = false;
488 ci->i_xattrs.version++;
491 } 489 }
492} 490}
493 491
@@ -574,7 +572,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
574 ci->i_xattrs.version, ci->i_xattrs.index_version); 572 ci->i_xattrs.version, ci->i_xattrs.index_version);
575 573
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 574 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 575 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr; 576 goto list_xattr;
579 } else { 577 } else {
580 spin_unlock(&inode->i_lock); 578 spin_unlock(&inode->i_lock);
@@ -622,7 +620,7 @@ out:
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 620static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags) 621 const char *value, size_t size, int flags)
624{ 622{
625 struct ceph_client *client = ceph_client(dentry->d_sb); 623 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode; 624 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode); 625 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode; 626 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +639,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
641 return -ENOMEM; 639 return -ENOMEM;
642 err = -ENOMEM; 640 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) { 641 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS); 642 pages[i] = __page_cache_alloc(GFP_NOFS);
645 if (!pages[i]) { 643 if (!pages[i]) {
646 nr_pages = i; 644 nr_pages = i;
647 goto out; 645 goto out;
@@ -779,7 +777,7 @@ out:
779 777
780static int ceph_send_removexattr(struct dentry *dentry, const char *name) 778static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{ 779{
782 struct ceph_client *client = ceph_client(dentry->d_sb); 780 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc; 781 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode; 782 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode; 783 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d6db933df2b2..143d393881cb 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -20,6 +20,7 @@
20#include <linux/cdev.h> 20#include <linux/cdev.h>
21#include <linux/mutex.h> 21#include <linux/mutex.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/tty.h>
23 24
24#include "internal.h" 25#include "internal.h"
25 26
@@ -39,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
39#endif 40#endif
40 /* permit direct mmap, for read, write or exec */ 41 /* permit direct mmap, for read, write or exec */
41 BDI_CAP_MAP_DIRECT | 42 BDI_CAP_MAP_DIRECT |
42 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), 43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
43}; 46};
44 47
45static struct kobj_map *cdev_map; 48static struct kobj_map *cdev_map;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 80f352596807..917b7d449bb2 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,7 +2,6 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select SLOW_WORK
6 help 5 help
7 This is the client VFS module for the Common Internet File System 6 This is the client VFS module for the Common Internet File System
8 (CIFS) protocol which is the successor to the Server Message Block 7 (CIFS) protocol which is the successor to the Server Message Block
@@ -71,14 +70,14 @@ config CIFS_WEAK_PW_HASH
71 If unsure, say N. 70 If unsure, say N.
72 71
73config CIFS_UPCALL 72config CIFS_UPCALL
74 bool "Kerberos/SPNEGO advanced session setup" 73 bool "Kerberos/SPNEGO advanced session setup"
75 depends on CIFS && KEYS 74 depends on CIFS && KEYS
76 help 75 select DNS_RESOLVER
77 Enables an upcall mechanism for CIFS which accesses 76 help
78 userspace helper utilities to provide SPNEGO packaged (RFC 4178) 77 Enables an upcall mechanism for CIFS which accesses userspace helper
79 Kerberos tickets which are needed to mount to certain secure servers 78 utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
80 (for which more secure Kerberos authentication is required). If 79 which are needed to mount to certain secure servers (for which more
81 unsure, say N. 80 secure Kerberos authentication is required). If unsure, say N.
82 81
83config CIFS_XATTR 82config CIFS_XATTR
84 bool "CIFS extended attributes" 83 bool "CIFS extended attributes"
@@ -122,6 +121,7 @@ config CIFS_DEBUG2
122config CIFS_DFS_UPCALL 121config CIFS_DFS_UPCALL
123 bool "DFS feature support" 122 bool "DFS feature support"
124 depends on CIFS && KEYS 123 depends on CIFS && KEYS
124 select DNS_RESOLVER
125 help 125 help
126 Distributed File System (DFS) support is used to access shares 126 Distributed File System (DFS) support is used to access shares
127 transparently in an enterprise name space, even if the share 127 transparently in an enterprise name space, even if the share
@@ -131,6 +131,15 @@ config CIFS_DFS_UPCALL
131 IP addresses) which is needed for implicit mounts of DFS junction 131 IP addresses) which is needed for implicit mounts of DFS junction
132 points. If unsure, say N. 132 points. If unsure, say N.
133 133
134config CIFS_FSCACHE
135 bool "Provide CIFS client caching support (EXPERIMENTAL)"
136 depends on EXPERIMENTAL
137 depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
138 help
139 Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
140 to be cached locally on disk through the general filesystem cache
141 manager. If unsure, say N.
142
134config CIFS_EXPERIMENTAL 143config CIFS_EXPERIMENTAL
135 bool "CIFS Experimental Features (EXPERIMENTAL)" 144 bool "CIFS Experimental Features (EXPERIMENTAL)"
136 depends on CIFS && EXPERIMENTAL 145 depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 9948c0030e86..adefa60a9bdc 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12 12
13cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o 13cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
14
15cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
diff --git a/fs/cifs/README b/fs/cifs/README
index a727b7cb075f..7099a526f775 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -301,6 +301,16 @@ A partial list of the supported mount options follows:
301 gid Set the default gid for inodes (similar to above). 301 gid Set the default gid for inodes (similar to above).
302 file_mode If CIFS Unix extensions are not supported by the server 302 file_mode If CIFS Unix extensions are not supported by the server
303 this overrides the default mode for file inodes. 303 this overrides the default mode for file inodes.
304 fsc Enable local disk caching using FS-Cache (off by default). This
305 option can improve performance on a slow link, a heavily
306 loaded server, or a congested network, where reading from
307 local disk is faster than reading from the server. It can
308 also improve scalability, since the number of calls to the
309 server is reduced. However, local caching is not suitable
310 for all workloads, e.g. read-once workloads, so consider
311 your workload carefully before using this option. Currently,
312 local disk caching is functional only for CIFS files opened
313 read-only.
304 dir_mode If CIFS Unix extensions are not supported by the server 314 dir_mode If CIFS Unix extensions are not supported by the server
305 this overrides the default mode for directory inodes. 315 this overrides the default mode for directory inodes.
306 port attempt to contact the server on this tcp port, before 316 port attempt to contact the server on this tcp port, before
@@ -568,8 +578,9 @@ module can be displayed via modinfo.
568Misc /proc/fs/cifs Flags and Debug Info 578Misc /proc/fs/cifs Flags and Debug Info
569======================================= 579=======================================
570Informational pseudo-files: 580Informational pseudo-files:
571DebugData Displays information about active CIFS sessions 581DebugData Displays information about active CIFS sessions and
572 and shares, as well as the cifs.ko version. 582 shares, features enabled as well as the cifs.ko
583 version.
573Stats Lists summary resource usage information as well as per 584Stats Lists summary resource usage information as well as per
574 share statistics, if CONFIG_CIFS_STATS is enabled 585
575 in the kernel configuration. 586 in the kernel configuration.
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index a20bea598933..cfd1ce34e0bc 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -492,17 +492,13 @@ compare_oid(unsigned long *oid1, unsigned int oid1len,
492 492
493int 493int
494decode_negTokenInit(unsigned char *security_blob, int length, 494decode_negTokenInit(unsigned char *security_blob, int length,
495 enum securityEnum *secType) 495 struct TCP_Server_Info *server)
496{ 496{
497 struct asn1_ctx ctx; 497 struct asn1_ctx ctx;
498 unsigned char *end; 498 unsigned char *end;
499 unsigned char *sequence_end; 499 unsigned char *sequence_end;
500 unsigned long *oid = NULL; 500 unsigned long *oid = NULL;
501 unsigned int cls, con, tag, oidlen, rc; 501 unsigned int cls, con, tag, oidlen, rc;
502 bool use_ntlmssp = false;
503 bool use_kerberos = false;
504 bool use_kerberosu2u = false;
505 bool use_mskerberos = false;
506 502
507 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ 503 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
508 504
@@ -510,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
510 506
511 /* GSSAPI header */ 507 /* GSSAPI header */
512 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 508 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
513 cFYI(1, ("Error decoding negTokenInit header")); 509 cFYI(1, "Error decoding negTokenInit header");
514 return 0; 510 return 0;
515 } else if ((cls != ASN1_APL) || (con != ASN1_CON) 511 } else if ((cls != ASN1_APL) || (con != ASN1_CON)
516 || (tag != ASN1_EOC)) { 512 || (tag != ASN1_EOC)) {
517 cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); 513 cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
518 return 0; 514 return 0;
519 } 515 }
520 516
@@ -535,56 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
535 531
536 /* SPNEGO OID not present or garbled -- bail out */ 532 /* SPNEGO OID not present or garbled -- bail out */
537 if (!rc) { 533 if (!rc) {
538 cFYI(1, ("Error decoding negTokenInit header")); 534 cFYI(1, "Error decoding negTokenInit header");
539 return 0; 535 return 0;
540 } 536 }
541 537
542 /* SPNEGO */ 538 /* SPNEGO */
543 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 539 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
544 cFYI(1, ("Error decoding negTokenInit")); 540 cFYI(1, "Error decoding negTokenInit");
545 return 0; 541 return 0;
546 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 542 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
547 || (tag != ASN1_EOC)) { 543 || (tag != ASN1_EOC)) {
548 cFYI(1, 544 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
549 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 545 cls, con, tag, end, *end);
550 cls, con, tag, end, *end));
551 return 0; 546 return 0;
552 } 547 }
553 548
554 /* negTokenInit */ 549 /* negTokenInit */
555 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 550 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
556 cFYI(1, ("Error decoding negTokenInit")); 551 cFYI(1, "Error decoding negTokenInit");
557 return 0; 552 return 0;
558 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 553 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
559 || (tag != ASN1_SEQ)) { 554 || (tag != ASN1_SEQ)) {
560 cFYI(1, 555 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
561 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 556 cls, con, tag, end, *end);
562 cls, con, tag, end, *end));
563 return 0; 557 return 0;
564 } 558 }
565 559
566 /* sequence */ 560 /* sequence */
567 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 561 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
568 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 562 cFYI(1, "Error decoding 2nd part of negTokenInit");
569 return 0; 563 return 0;
570 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 564 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
571 || (tag != ASN1_EOC)) { 565 || (tag != ASN1_EOC)) {
572 cFYI(1, 566 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
573 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 567 cls, con, tag, end, *end);
574 cls, con, tag, end, *end));
575 return 0; 568 return 0;
576 } 569 }
577 570
578 /* sequence of */ 571 /* sequence of */
579 if (asn1_header_decode 572 if (asn1_header_decode
580 (&ctx, &sequence_end, &cls, &con, &tag) == 0) { 573 (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
581 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 574 cFYI(1, "Error decoding 2nd part of negTokenInit");
582 return 0; 575 return 0;
583 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 576 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
584 || (tag != ASN1_SEQ)) { 577 || (tag != ASN1_SEQ)) {
585 cFYI(1, 578 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
586 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 579 cls, con, tag, end, *end);
587 cls, con, tag, end, *end));
588 return 0; 580 return 0;
589 } 581 }
590 582
@@ -592,37 +584,33 @@ decode_negTokenInit(unsigned char *security_blob, int length,
592 while (!asn1_eoc_decode(&ctx, sequence_end)) { 584 while (!asn1_eoc_decode(&ctx, sequence_end)) {
593 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); 585 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
594 if (!rc) { 586 if (!rc) {
595 cFYI(1, 587 cFYI(1, "Error decoding negTokenInit hdr exit2");
596 ("Error decoding negTokenInit hdr exit2"));
597 return 0; 588 return 0;
598 } 589 }
599 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { 590 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
600 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { 591 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
601 592
602 cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx " 593 cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
603 "0x%lx 0x%lx", oidlen, *oid, 594 "0x%lx 0x%lx", oidlen, *oid,
604 *(oid + 1), *(oid + 2), *(oid + 3))); 595 *(oid + 1), *(oid + 2), *(oid + 3));
605 596
606 if (compare_oid(oid, oidlen, MSKRB5_OID, 597 if (compare_oid(oid, oidlen, MSKRB5_OID,
607 MSKRB5_OID_LEN) && 598 MSKRB5_OID_LEN))
608 !use_mskerberos) 599 server->sec_mskerberos = true;
609 use_mskerberos = true;
610 else if (compare_oid(oid, oidlen, KRB5U2U_OID, 600 else if (compare_oid(oid, oidlen, KRB5U2U_OID,
611 KRB5U2U_OID_LEN) && 601 KRB5U2U_OID_LEN))
612 !use_kerberosu2u) 602 server->sec_kerberosu2u = true;
613 use_kerberosu2u = true;
614 else if (compare_oid(oid, oidlen, KRB5_OID, 603 else if (compare_oid(oid, oidlen, KRB5_OID,
615 KRB5_OID_LEN) && 604 KRB5_OID_LEN))
616 !use_kerberos) 605 server->sec_kerberos = true;
617 use_kerberos = true;
618 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 606 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
619 NTLMSSP_OID_LEN)) 607 NTLMSSP_OID_LEN))
620 use_ntlmssp = true; 608 server->sec_ntlmssp = true;
621 609
622 kfree(oid); 610 kfree(oid);
623 } 611 }
624 } else { 612 } else {
625 cFYI(1, ("Should be an oid what is going on?")); 613 cFYI(1, "Should be an oid what is going on?");
626 } 614 }
627 } 615 }
628 616
@@ -632,54 +620,47 @@ decode_negTokenInit(unsigned char *security_blob, int length,
632 no mechListMic (e.g. NTLMSSP instead of KRB5) */ 620 no mechListMic (e.g. NTLMSSP instead of KRB5) */
633 if (ctx.error == ASN1_ERR_DEC_EMPTY) 621 if (ctx.error == ASN1_ERR_DEC_EMPTY)
634 goto decode_negtoken_exit; 622 goto decode_negtoken_exit;
635 cFYI(1, ("Error decoding last part negTokenInit exit3")); 623 cFYI(1, "Error decoding last part negTokenInit exit3");
636 return 0; 624 return 0;
637 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 625 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
638 /* tag = 3 indicating mechListMIC */ 626 /* tag = 3 indicating mechListMIC */
639 cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", 627 cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
640 cls, con, tag, end, *end)); 628 cls, con, tag, end, *end);
641 return 0; 629 return 0;
642 } 630 }
643 631
644 /* sequence */ 632 /* sequence */
645 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 633 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
646 cFYI(1, ("Error decoding last part negTokenInit exit5")); 634 cFYI(1, "Error decoding last part negTokenInit exit5");
647 return 0; 635 return 0;
648 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 636 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
649 || (tag != ASN1_SEQ)) { 637 || (tag != ASN1_SEQ)) {
650 cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", 638 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
651 cls, con, tag, end, *end)); 639 cls, con, tag, end, *end);
652 } 640 }
653 641
654 /* sequence of */ 642 /* sequence of */
655 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 643 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
656 cFYI(1, ("Error decoding last part negTokenInit exit 7")); 644 cFYI(1, "Error decoding last part negTokenInit exit 7");
657 return 0; 645 return 0;
658 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 646 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
659 cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", 647 cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
660 cls, con, tag, end, *end)); 648 cls, con, tag, end, *end);
661 return 0; 649 return 0;
662 } 650 }
663 651
664 /* general string */ 652 /* general string */
665 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 653 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
666 cFYI(1, ("Error decoding last part negTokenInit exit9")); 654 cFYI(1, "Error decoding last part negTokenInit exit9");
667 return 0; 655 return 0;
668 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) 656 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
669 || (tag != ASN1_GENSTR)) { 657 || (tag != ASN1_GENSTR)) {
670 cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", 658 cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
671 cls, con, tag, end, *end)); 659 cls, con, tag, end, *end);
672 return 0; 660 return 0;
673 } 661 }
674 cFYI(1, ("Need to call asn1_octets_decode() function for %s", 662 cFYI(1, "Need to call asn1_octets_decode() function for %s",
675 ctx.pointer)); /* is this UTF-8 or ASCII? */ 663 ctx.pointer); /* is this UTF-8 or ASCII? */
676decode_negtoken_exit: 664decode_negtoken_exit:
677 if (use_kerberos)
678 *secType = Kerberos;
679 else if (use_mskerberos)
680 *secType = MSKerberos;
681 else if (use_ntlmssp)
682 *secType = RawNTLMSSP;
683
684 return 1; 665 return 1;
685} 666}
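Instead of reducing the advertised mechanisms to a single secType, the decoder now records every OID it sees in per-server flags, leaving the choice of authentication mechanism to the caller. A hedged sketch of what such a later selection step could look like (the exact policy lives elsewhere in the CIFS session-setup code):

    if (server->sec_kerberos || server->sec_mskerberos)
            secType = Kerberos;         /* prefer krb5 when offered */
    else if (server->sec_ntlmssp)
            secType = RawNTLMSSP;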
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
new file mode 100644
index 000000000000..224d7bbd1fcc
--- /dev/null
+++ b/fs/cifs/cache.c
@@ -0,0 +1,331 @@
1/*
2 * fs/cifs/cache.c - CIFS filesystem cache index structure definitions
3 *
4 * Copyright (c) 2010 Novell, Inc.
 5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#include "fscache.h"
22#include "cifs_debug.h"
23
24/*
25 * CIFS filesystem definition for FS-Cache
26 */
27struct fscache_netfs cifs_fscache_netfs = {
28 .name = "cifs",
29 .version = 0,
30};
31
32/*
33 * Register CIFS for caching with FS-Cache
34 */
35int cifs_fscache_register(void)
36{
37 return fscache_register_netfs(&cifs_fscache_netfs);
38}
39
40/*
41 * Unregister CIFS for caching
42 */
43void cifs_fscache_unregister(void)
44{
45 fscache_unregister_netfs(&cifs_fscache_netfs);
46}
47
48/*
49 * Key layout of CIFS server cache index object
50 */
51struct cifs_server_key {
52 uint16_t family; /* address family */
53 uint16_t port; /* IP port */
54 union {
55 struct in_addr ipv4_addr;
56 struct in6_addr ipv6_addr;
57 } addr[0];
58};
59
60/*
61 * Server object keyed by {IPaddress,port,family} tuple
62 */
63static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
64 void *buffer, uint16_t maxbuf)
65{
66 const struct TCP_Server_Info *server = cookie_netfs_data;
67 const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
68 struct cifs_server_key *key = buffer;
69 uint16_t key_len = sizeof(struct cifs_server_key);
70
71 memset(key, 0, key_len);
72
73 /*
74 * Should not be a problem as sin_family/sin6_family overlays
75 * sa_family field
76 */
77 switch (sa->sa_family) {
78 case AF_INET:
79 key->family = server->addr.sockAddr.sin_family;
80 key->port = server->addr.sockAddr.sin_port;
81 key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
82 key_len += sizeof(key->addr[0].ipv4_addr);
83 break;
84
85 case AF_INET6:
86 key->family = server->addr.sockAddr6.sin6_family;
87 key->port = server->addr.sockAddr6.sin6_port;
88 key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
89 key_len += sizeof(key->addr[0].ipv6_addr);
90 break;
91
92 default:
93 cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family);
94 key_len = 0;
95 break;
96 }
97
98 return key_len;
99}
100
101/*
102 * Server object for FS-Cache
103 */
104const struct fscache_cookie_def cifs_fscache_server_index_def = {
105 .name = "CIFS.server",
106 .type = FSCACHE_COOKIE_TYPE_INDEX,
107 .get_key = cifs_server_get_key,
108};
109
110/*
111 * Auxiliary data attached to CIFS superblock within the cache
112 */
113struct cifs_fscache_super_auxdata {
114 u64 resource_id; /* unique server resource id */
115};
116
117static char *extract_sharename(const char *treename)
118{
119 const char *src;
120 char *delim, *dst;
121 int len;
122
 123 /* skip the leading double backslash */
124 src = treename + 2;
125
126 /* share name is always preceded by '\\' now */
127 delim = strchr(src, '\\');
128 if (!delim)
129 return ERR_PTR(-EINVAL);
130 delim++;
131 len = strlen(delim);
132
133 /* caller has to free the memory */
134 dst = kstrndup(delim, len, GFP_KERNEL);
135 if (!dst)
136 return ERR_PTR(-ENOMEM);
137
138 return dst;
139}
140
141/*
142 * Superblock object currently keyed by share name
143 */
144static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
145 uint16_t maxbuf)
146{
147 const struct cifsTconInfo *tcon = cookie_netfs_data;
148 char *sharename;
149 uint16_t len;
150
151 sharename = extract_sharename(tcon->treeName);
152 if (IS_ERR(sharename)) {
 153 cFYI(1, "CIFS: couldn't extract sharename");
154 sharename = NULL;
155 return 0;
156 }
157
158 len = strlen(sharename);
159 if (len > maxbuf)
160 return 0;
161
162 memcpy(buffer, sharename, len);
163
164 kfree(sharename);
165
166 return len;
167}
168
169static uint16_t
170cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
171 uint16_t maxbuf)
172{
173 struct cifs_fscache_super_auxdata auxdata;
174 const struct cifsTconInfo *tcon = cookie_netfs_data;
175
176 memset(&auxdata, 0, sizeof(auxdata));
177 auxdata.resource_id = tcon->resource_id;
178
179 if (maxbuf > sizeof(auxdata))
180 maxbuf = sizeof(auxdata);
181
182 memcpy(buffer, &auxdata, maxbuf);
183
184 return maxbuf;
185}
186
187static enum
188fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
189 const void *data,
190 uint16_t datalen)
191{
192 struct cifs_fscache_super_auxdata auxdata;
193 const struct cifsTconInfo *tcon = cookie_netfs_data;
194
195 if (datalen != sizeof(auxdata))
196 return FSCACHE_CHECKAUX_OBSOLETE;
197
198 memset(&auxdata, 0, sizeof(auxdata));
199 auxdata.resource_id = tcon->resource_id;
200
201 if (memcmp(data, &auxdata, datalen) != 0)
202 return FSCACHE_CHECKAUX_OBSOLETE;
203
204 return FSCACHE_CHECKAUX_OKAY;
205}
206
207/*
208 * Superblock object for FS-Cache
209 */
210const struct fscache_cookie_def cifs_fscache_super_index_def = {
211 .name = "CIFS.super",
212 .type = FSCACHE_COOKIE_TYPE_INDEX,
213 .get_key = cifs_super_get_key,
214 .get_aux = cifs_fscache_super_get_aux,
215 .check_aux = cifs_fscache_super_check_aux,
216};
217
218/*
219 * Auxiliary data attached to CIFS inode within the cache
220 */
221struct cifs_fscache_inode_auxdata {
222 struct timespec last_write_time;
223 struct timespec last_change_time;
224 u64 eof;
225};
226
227static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
228 void *buffer, uint16_t maxbuf)
229{
230 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
231 uint16_t keylen;
232
233 /* use the UniqueId as the key */
234 keylen = sizeof(cifsi->uniqueid);
235 if (keylen > maxbuf)
236 keylen = 0;
237 else
238 memcpy(buffer, &cifsi->uniqueid, keylen);
239
240 return keylen;
241}
242
243static void
244cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
245{
246 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
247
248 *size = cifsi->vfs_inode.i_size;
249}
250
251static uint16_t
252cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
253 uint16_t maxbuf)
254{
255 struct cifs_fscache_inode_auxdata auxdata;
256 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
257
258 memset(&auxdata, 0, sizeof(auxdata));
259 auxdata.eof = cifsi->server_eof;
260 auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
261 auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
262
263 if (maxbuf > sizeof(auxdata))
264 maxbuf = sizeof(auxdata);
265
266 memcpy(buffer, &auxdata, maxbuf);
267
268 return maxbuf;
269}
270
271static enum
272fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
273 const void *data,
274 uint16_t datalen)
275{
276 struct cifs_fscache_inode_auxdata auxdata;
277 struct cifsInodeInfo *cifsi = cookie_netfs_data;
278
279 if (datalen != sizeof(auxdata))
280 return FSCACHE_CHECKAUX_OBSOLETE;
281
282 memset(&auxdata, 0, sizeof(auxdata));
283 auxdata.eof = cifsi->server_eof;
284 auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
285 auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
286
287 if (memcmp(data, &auxdata, datalen) != 0)
288 return FSCACHE_CHECKAUX_OBSOLETE;
289
290 return FSCACHE_CHECKAUX_OKAY;
291}
292
293static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
294{
295 struct cifsInodeInfo *cifsi = cookie_netfs_data;
296 struct pagevec pvec;
297 pgoff_t first;
298 int loop, nr_pages;
299
300 pagevec_init(&pvec, 0);
301 first = 0;
302
303 cFYI(1, "cifs inode 0x%p now uncached", cifsi);
304
305 for (;;) {
306 nr_pages = pagevec_lookup(&pvec,
307 cifsi->vfs_inode.i_mapping, first,
308 PAGEVEC_SIZE - pagevec_count(&pvec));
309 if (!nr_pages)
310 break;
311
312 for (loop = 0; loop < nr_pages; loop++)
313 ClearPageFsCache(pvec.pages[loop]);
314
315 first = pvec.pages[nr_pages - 1]->index + 1;
316
317 pvec.nr = nr_pages;
318 pagevec_release(&pvec);
319 cond_resched();
320 }
321}
322
323const struct fscache_cookie_def cifs_fscache_inode_object_def = {
324 .name = "CIFS.uniqueid",
325 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
326 .get_key = cifs_fscache_inode_get_key,
327 .get_attr = cifs_fscache_inode_get_attr,
328 .get_aux = cifs_fscache_inode_get_aux,
329 .check_aux = cifs_fscache_inode_check_aux,
330 .now_uncached = cifs_fscache_inode_now_uncached,
331};
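The three cookie definitions form the FS-Cache index hierarchy for CIFS: server (keyed by address/port/family), superblock (keyed by share name, validated against resource_id), and inode (keyed by UniqueId, validated against mtime/ctime/eof). Cookies are acquired top-down along that hierarchy; a sketch of the assumed call sites elsewhere in the fscache glue:

    server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
                                             &cifs_fscache_server_index_def,
                                             server);
    tcon->fscache = fscache_acquire_cookie(server->fscache,
                                           &cifs_fscache_super_index_def,
                                           tcon);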
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 42cec2a7c0cf..eb1ba493489f 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -60,10 +60,10 @@ cifs_dump_mem(char *label, void *data, int length)
60#ifdef CONFIG_CIFS_DEBUG2 60#ifdef CONFIG_CIFS_DEBUG2
61void cifs_dump_detail(struct smb_hdr *smb) 61void cifs_dump_detail(struct smb_hdr *smb)
62{ 62{
63 cERROR(1, ("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", 63 cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
64 smb->Command, smb->Status.CifsError, 64 smb->Command, smb->Status.CifsError,
65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid)); 65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
66 cERROR(1, ("smb buf %p len %d", smb, smbCalcSize_LE(smb))); 66 cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
67} 67}
68 68
69 69
@@ -75,25 +75,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
75 if (server == NULL) 75 if (server == NULL)
76 return; 76 return;
77 77
78 cERROR(1, ("Dump pending requests:")); 78 cERROR(1, "Dump pending requests:");
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->tsk,
87 mid_entry->mid)); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
90 mid_entry->largeBuf, 90 mid_entry->largeBuf,
91 mid_entry->resp_buf, 91 mid_entry->resp_buf,
92 mid_entry->when_received, 92 mid_entry->when_received,
93 jiffies)); 93 jiffies);
94#endif /* STATS2 */ 94#endif /* STATS2 */
95 cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, 95 cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
96 mid_entry->multiEnd)); 96 mid_entry->multiEnd);
97 if (mid_entry->resp_buf) { 97 if (mid_entry->resp_buf) {
98 cifs_dump_detail(mid_entry->resp_buf); 98 cifs_dump_detail(mid_entry->resp_buf);
99 cifs_dump_mem("existing buf: ", 99 cifs_dump_mem("existing buf: ",
@@ -119,6 +119,31 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
119 "Display Internal CIFS Data Structures for Debugging\n" 119 "Display Internal CIFS Data Structures for Debugging\n"
120 "---------------------------------------------------\n"); 120 "---------------------------------------------------\n");
121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); 121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
122 seq_printf(m, "Features: ");
123#ifdef CONFIG_CIFS_DFS_UPCALL
124 seq_printf(m, "dfs");
125 seq_putc(m, ' ');
126#endif
127#ifdef CONFIG_CIFS_FSCACHE
128 seq_printf(m, "fscache");
129 seq_putc(m, ' ');
130#endif
131#ifdef CONFIG_CIFS_WEAK_PW_HASH
132 seq_printf(m, "lanman");
133 seq_putc(m, ' ');
134#endif
135#ifdef CONFIG_CIFS_POSIX
136 seq_printf(m, "posix");
137 seq_putc(m, ' ');
138#endif
139#ifdef CONFIG_CIFS_UPCALL
140 seq_printf(m, "spnego");
141 seq_putc(m, ' ');
142#endif
143#ifdef CONFIG_CIFS_XATTR
144 seq_printf(m, "xattr");
145#endif
146 seq_putc(m, '\n');
122 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); 147 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
123 seq_printf(m, "Servers:"); 148 seq_printf(m, "Servers:");
124 149
@@ -716,7 +741,7 @@ static const struct file_operations cifs_multiuser_mount_proc_fops = {
716 741
717static int cifs_security_flags_proc_show(struct seq_file *m, void *v) 742static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
718{ 743{
719 seq_printf(m, "0x%x\n", extended_security); 744 seq_printf(m, "0x%x\n", global_secflags);
720 return 0; 745 return 0;
721} 746}
722 747
@@ -744,13 +769,13 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
744 /* single char or single char followed by null */ 769 /* single char or single char followed by null */
745 c = flags_string[0]; 770 c = flags_string[0];
746 if (c == '0' || c == 'n' || c == 'N') { 771 if (c == '0' || c == 'n' || c == 'N') {
747 extended_security = CIFSSEC_DEF; /* default */ 772 global_secflags = CIFSSEC_DEF; /* default */
748 return count; 773 return count;
749 } else if (c == '1' || c == 'y' || c == 'Y') { 774 } else if (c == '1' || c == 'y' || c == 'Y') {
750 extended_security = CIFSSEC_MAX; 775 global_secflags = CIFSSEC_MAX;
751 return count; 776 return count;
752 } else if (!isdigit(c)) { 777 } else if (!isdigit(c)) {
753 cERROR(1, ("invalid flag %c", c)); 778 cERROR(1, "invalid flag %c", c);
754 return -EINVAL; 779 return -EINVAL;
755 } 780 }
756 } 781 }
@@ -758,26 +783,26 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
758 783
759 flags = simple_strtoul(flags_string, NULL, 0); 784 flags = simple_strtoul(flags_string, NULL, 0);
760 785
761 cFYI(1, ("sec flags 0x%x", flags)); 786 cFYI(1, "sec flags 0x%x", flags);
762 787
763 if (flags <= 0) { 788 if (flags <= 0) {
764 cERROR(1, ("invalid security flags %s", flags_string)); 789 cERROR(1, "invalid security flags %s", flags_string);
765 return -EINVAL; 790 return -EINVAL;
766 } 791 }
767 792
768 if (flags & ~CIFSSEC_MASK) { 793 if (flags & ~CIFSSEC_MASK) {
769 cERROR(1, ("attempt to set unsupported security flags 0x%x", 794 cERROR(1, "attempt to set unsupported security flags 0x%x",
770 flags & ~CIFSSEC_MASK)); 795 flags & ~CIFSSEC_MASK);
771 return -EINVAL; 796 return -EINVAL;
772 } 797 }
773 /* flags look ok - update the global security flags for cifs module */ 798 /* flags look ok - update the global security flags for cifs module */
774 extended_security = flags; 799 global_secflags = flags;
775 if (extended_security & CIFSSEC_MUST_SIGN) { 800 if (global_secflags & CIFSSEC_MUST_SIGN) {
776 /* requiring signing implies signing is allowed */ 801 /* requiring signing implies signing is allowed */
777 extended_security |= CIFSSEC_MAY_SIGN; 802 global_secflags |= CIFSSEC_MAY_SIGN;
778 cFYI(1, ("packet signing now required")); 803 cFYI(1, "packet signing now required");
779 } else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) { 804 } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
780 cFYI(1, ("packet signing disabled")); 805 cFYI(1, "packet signing disabled");
781 } 806 }
782 /* BB should we turn on MAY flags for other MUST options? */ 807 /* BB should we turn on MAY flags for other MUST options? */
783 return count; 808 return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5eb3b83bbfa7..aa316891ac0c 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -43,34 +43,54 @@ void dump_smb(struct smb_hdr *, int);
43 */ 43 */
44#ifdef CIFS_DEBUG 44#ifdef CIFS_DEBUG
45 45
46
47/* information message: e.g., configuration, major event */ 46/* information message: e.g., configuration, major event */
48extern int cifsFYI; 47extern int cifsFYI;
49#define cifsfyi(format,arg...) if (cifsFYI & CIFS_INFO) printk(KERN_DEBUG " " __FILE__ ": " format "\n" "" , ## arg) 48#define cifsfyi(fmt, arg...) \
49do { \
50 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
52} while (0)
50 53
51#define cFYI(button,prspec) if (button) cifsfyi prspec 54#define cFYI(set, fmt, arg...) \
55do { \
56 if (set) \
57 cifsfyi(fmt, ##arg); \
58} while (0)
52 59
53#define cifswarn(format, arg...) printk(KERN_WARNING ": " format "\n" , ## arg) 60#define cifswarn(fmt, arg...) \
61 printk(KERN_WARNING fmt "\n", ##arg)
54 62
55/* debug event message: */ 63/* debug event message: */
56extern int cifsERROR; 64extern int cifsERROR;
57 65
58#define cEVENT(format,arg...) if (cifsERROR) printk(KERN_EVENT __FILE__ ": " format "\n" , ## arg) 66#define cEVENT(fmt, arg...) \
67do { \
68 if (cifsERROR) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
70} while (0)
59 71
60/* error event message: e.g., i/o error */ 72/* error event message: e.g., i/o error */
61#define cifserror(format,arg...) if (cifsERROR) printk(KERN_ERR " CIFS VFS: " format "\n" "" , ## arg) 73#define cifserror(fmt, arg...) \
74do { \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0)
62 78
63#define cERROR(button, prspec) if (button) cifserror prspec 79#define cERROR(set, fmt, arg...) \
80do { \
81 if (set) \
82 cifserror(fmt, ##arg); \
83} while (0)
64 84
65/* 85/*
66 * debug OFF 86 * debug OFF
67 * --------- 87 * ---------
68 */ 88 */
69#else /* _CIFS_DEBUG */ 89#else /* _CIFS_DEBUG */
70#define cERROR(button, prspec) 90#define cERROR(set, fmt, arg...)
71#define cEVENT(format, arg...) 91#define cEVENT(fmt, arg...)
72#define cFYI(button, prspec) 92#define cFYI(set, fmt, arg...)
73#define cifserror(format, arg...) 93#define cifserror(fmt, arg...)
74#endif /* _CIFS_DEBUG */ 94#endif /* _CIFS_DEBUG */
75 95
76#endif /* _H_CIFS_DEBUG */ 96#endif /* _H_CIFS_DEBUG */
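
This header rewrite is what forces every cFYI/cERROR call site in the rest of the diff to drop its inner parentheses. The old macros received one pre-parenthesized argument pack (prspec) and pasted it after the printing helper, which is fragile: it cannot sit safely in an unbraced if/else and gives the compiler nothing to type-check. The new macros are GNU-style variadic macros (named arg... plus ##) wrapped in do { } while (0). A compilable toy (gcc) contrasting the two shapes:

#include <stdio.h>

/* old shape: the caller supplies its own parentheses */
#define oldfyi(prspec)  printf prspec

/* new shape: a real variadic macro, statement-safe via do/while(0) */
#define newfyi(fmt, arg...)                                     \
do {                                                            \
        printf("%s: " fmt "\n", __FILE__, ##arg);               \
} while (0)

int main(void)
{
        oldfyi(("old style: %d\n", 1));  /* note the double parentheses */

        if (1)
                newfyi("new style: %d", 2); /* expands to one statement, */
        else                                /* so this else still binds  */
                newfyi("unreachable");      /* where the author expects  */
        return 0;
}

The do/while(0) wrapper is what lets each macro stand anywhere a single statement can, including immediately before an else.
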
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 78e4d2a3a68b..d6ced7aa23cf 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -85,8 +85,8 @@ static char *cifs_get_share_name(const char *node_name)
85 /* find server name end */ 85 /* find server name end */
86 pSep = memchr(UNC+2, '\\', len-2); 86 pSep = memchr(UNC+2, '\\', len-2);
87 if (!pSep) { 87 if (!pSep) {
88 cERROR(1, ("%s: no server name end in node name: %s", 88 cERROR(1, "%s: no server name end in node name: %s",
89 __func__, node_name)); 89 __func__, node_name);
90 kfree(UNC); 90 kfree(UNC);
91 return ERR_PTR(-EINVAL); 91 return ERR_PTR(-EINVAL);
92 } 92 }
@@ -141,17 +141,16 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 } 141 }
142 142
143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
144 if (rc != 0) { 144 if (rc < 0) {
145 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 145 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
146 __func__, *devname, rc)); 146 __func__, *devname, rc);
147 goto compose_mount_options_err; 147 goto compose_mount_options_err;
148 } 148 }
149 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
150 * assuming that we have 'unc=' and 'ip=' in 150 * assuming that we have 'unc=' and 'ip=' in
151 * the original sb_mountdata 151 * the original sb_mountdata
152 */ 152 */
153 md_len = strlen(sb_mountdata) + strlen(srvIP) + 153 md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
154 strlen(ref->node_name) + 12;
155 mountdata = kzalloc(md_len+1, GFP_KERNEL); 154 mountdata = kzalloc(md_len+1, GFP_KERNEL);
156 if (mountdata == NULL) { 155 if (mountdata == NULL) {
157 rc = -ENOMEM; 156 rc = -ENOMEM;
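
The two changes in this hunk go together: as the hunk implies, dns_resolve_server_name_to_ip() now returns the length of the resolved address string on success and a negative error code on failure, so the caller switches from "rc != 0" to "rc < 0" and reuses rc instead of calling strlen(srvIP) a second time when sizing the buffer. A user-space sketch of that return convention, with a hypothetical resolver standing in for the real upcall:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* convention: return strlen(result) on success, -errno on failure */
static int resolve_server_name(const char *name, char **ip_out)
{
        const char *fake_result = "192.168.0.1"; /* stand-in for a lookup */

        if (name == NULL || *name == '\0')
                return -EINVAL;
        *ip_out = strdup(fake_result);
        if (*ip_out == NULL)
                return -ENOMEM;
        return strlen(*ip_out);    /* caller can size buffers from this */
}

int main(void)
{
        char *srvIP = NULL;
        int rc = resolve_server_name("server.example.com", &srvIP);

        if (rc < 0) {              /* mirrors the new rc < 0 check above */
                fprintf(stderr, "resolve failed: %d\n", rc);
                return 1;
        }
        printf("resolved to %s, length %d\n", srvIP, rc);
        free(srvIP);
        return 0;
}
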
@@ -217,8 +216,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
217 strcat(mountdata, fullpath + ref->path_consumed); 216 strcat(mountdata, fullpath + ref->path_consumed);
218 } 217 }
219 218
220 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/ 219 /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
221 /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/ 220 /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
222 221
223compose_mount_options_out: 222compose_mount_options_out:
224 kfree(srvIP); 223 kfree(srvIP);
@@ -230,28 +229,22 @@ compose_mount_options_err:
230 goto compose_mount_options_out; 229 goto compose_mount_options_out;
231} 230}
232 231
233 232/**
234static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent, 233 * cifs_dfs_do_refmount - mounts the specified path using the provided referral
235 struct dentry *dentry, const struct dfs_info3_param *ref) 234 * @cifs_sb: parent/root superblock
235 * @fullpath: full path in UNC format
236 * @ref: server's referral
237 */
238static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
239 const char *fullpath, const struct dfs_info3_param *ref)
236{ 240{
237 struct cifs_sb_info *cifs_sb;
238 struct vfsmount *mnt; 241 struct vfsmount *mnt;
239 char *mountdata; 242 char *mountdata;
240 char *devname = NULL; 243 char *devname = NULL;
241 char *fullpath;
242
243 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
244 /*
245 * this function gives us a path with a double backslash prefix. We
246 * require a single backslash for DFS.
247 */
248 fullpath = build_path_from_dentry(dentry);
249 if (!fullpath)
250 return ERR_PTR(-ENOMEM);
251 244
245 /* strip first '\' from fullpath */
252 mountdata = cifs_compose_mount_options(cifs_sb->mountdata, 246 mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
253 fullpath + 1, ref, &devname); 247 fullpath + 1, ref, &devname);
254 kfree(fullpath);
255 248
256 if (IS_ERR(mountdata)) 249 if (IS_ERR(mountdata))
257 return (struct vfsmount *)mountdata; 250 return (struct vfsmount *)mountdata;
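
The "return (struct vfsmount *)mountdata" above works because kernel error returns are encoded in the pointer value itself: ERR_PTR() stores a small negative errno at the very top of the address space, IS_ERR() recognizes it, and the encoding survives a cast to any other pointer type. A user-space re-implementation of the idiom, assuming the usual 4095-errno reservation:

#include <errno.h>
#include <stdio.h>

/* user-space stand-ins for the kernel's ERR_PTR/IS_ERR/PTR_ERR */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static char *compose_mount_options(int simulate_failure)
{
        if (simulate_failure)
                return ERR_PTR(-ENOMEM); /* errno travels inside the pointer */
        return "unc=\\\\server\\share";
}

int main(void)
{
        char *mountdata = compose_mount_options(1);

        if (IS_ERR(mountdata)) {
                /* the cast-and-return in the hunk above is exactly this */
                printf("propagating error %ld\n", PTR_ERR(mountdata));
                return 1;
        }
        printf("mountdata: %s\n", mountdata);
        return 0;
}
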
@@ -294,11 +287,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
294 287
295static void dump_referral(const struct dfs_info3_param *ref) 288static void dump_referral(const struct dfs_info3_param *ref)
296{ 289{
297 cFYI(1, ("DFS: ref path: %s", ref->path_name)); 290 cFYI(1, "DFS: ref path: %s", ref->path_name);
298 cFYI(1, ("DFS: node path: %s", ref->node_name)); 291 cFYI(1, "DFS: node path: %s", ref->node_name);
299 cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type)); 292 cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
300 cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, 293 cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
301 ref->path_consumed)); 294 ref->path_consumed);
302} 295}
303 296
304 297
@@ -314,7 +307,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
314 int rc = 0; 307 int rc = 0;
315 struct vfsmount *mnt = ERR_PTR(-ENOENT); 308 struct vfsmount *mnt = ERR_PTR(-ENOENT);
316 309
317 cFYI(1, ("in %s", __func__)); 310 cFYI(1, "in %s", __func__);
318 BUG_ON(IS_ROOT(dentry)); 311 BUG_ON(IS_ROOT(dentry));
319 312
320 xid = GetXid(); 313 xid = GetXid();
@@ -352,15 +345,15 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
352 /* connect to a node */ 345 /* connect to a node */
353 len = strlen(referrals[i].node_name); 346 len = strlen(referrals[i].node_name);
354 if (len < 2) { 347 if (len < 2) {
355 cERROR(1, ("%s: Net Address path too short: %s", 348 cERROR(1, "%s: Net Address path too short: %s",
356 __func__, referrals[i].node_name)); 349 __func__, referrals[i].node_name);
357 rc = -EINVAL; 350 rc = -EINVAL;
358 goto out_err; 351 goto out_err;
359 } 352 }
360 mnt = cifs_dfs_do_refmount(nd->path.mnt, 353 mnt = cifs_dfs_do_refmount(cifs_sb,
361 nd->path.dentry, referrals + i); 354 full_path, referrals + i);
362 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 355 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
363 referrals[i].node_name, mnt)); 356 referrals[i].node_name, mnt);
364 357
365 /* complete mount procedure if we acquired submount */ 358 /* complete mount procedure if we acquired submount */
366 if (!IS_ERR(mnt)) 359 if (!IS_ERR(mnt))
@@ -378,7 +371,7 @@ out:
378 FreeXid(xid); 371 FreeXid(xid);
379 free_dfs_info_array(referrals, num_referrals); 372 free_dfs_info_array(referrals, num_referrals);
380 kfree(full_path); 373 kfree(full_path);
381 cFYI(1, ("leaving %s" , __func__)); 374 cFYI(1, "leaving %s" , __func__);
382 return ERR_PTR(rc); 375 return ERR_PTR(rc);
383out_err: 376out_err:
384 path_put(&nd->path); 377 path_put(&nd->path);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 246a167cb913..9e771450c3b8 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -35,6 +35,7 @@
35#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ 35#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */
36#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ 36#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
37#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ 37#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/
38#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
38 39
39struct cifs_sb_info { 40struct cifs_sb_info {
40 struct cifsTconInfo *tcon; /* primary mount */ 41 struct cifsTconInfo *tcon; /* primary mount */
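
The fscache support added throughout this series claims the next free bit in mnt_cifs_flags, with the values shown in the hunk above. Each mount option is a single bit, so setting and testing stay one mask operation each; a trivial sketch:

#include <stdio.h>

#define CIFS_MOUNT_NOSSYNC 0x4000   /* existing neighbor, for context */
#define CIFS_MOUNT_FSCACHE 0x8000   /* the newly claimed bit */

int main(void)
{
        unsigned int mnt_cifs_flags = 0;

        mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;    /* set at option-parse time */
        if (mnt_cifs_flags & CIFS_MOUNT_FSCACHE) /* tested at use time */
                printf("local caching enabled\n");
        return 0;
}
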
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 310d12f69a92..87044906cd1f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = {
84/* strlen of ";uid=0x" */ 84/* strlen of ";uid=0x" */
85#define UID_KEY_LEN 7 85#define UID_KEY_LEN 7
86 86
87/* strlen of ";creduid=0x" */
88#define CREDUID_KEY_LEN 11
89
87/* strlen of ";user=" */ 90/* strlen of ";user=" */
88#define USER_KEY_LEN 6 91#define USER_KEY_LEN 6
89 92
@@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
107 IP_KEY_LEN + INET6_ADDRSTRLEN + 110 IP_KEY_LEN + INET6_ADDRSTRLEN +
108 MAX_MECH_STR_LEN + 111 MAX_MECH_STR_LEN +
109 UID_KEY_LEN + (sizeof(uid_t) * 2) + 112 UID_KEY_LEN + (sizeof(uid_t) * 2) +
113 CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
110 USER_KEY_LEN + strlen(sesInfo->userName) + 114 USER_KEY_LEN + strlen(sesInfo->userName) +
111 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; 115 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
112 116
@@ -133,9 +137,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
133 dp = description + strlen(description); 137 dp = description + strlen(description);
134 138
135 /* for now, only sec=krb5 and sec=mskrb5 are valid */ 139 /* for now, only sec=krb5 and sec=mskrb5 are valid */
136 if (server->secType == Kerberos) 140 if (server->sec_kerberos)
137 sprintf(dp, ";sec=krb5"); 141 sprintf(dp, ";sec=krb5");
138 else if (server->secType == MSKerberos) 142 else if (server->sec_mskerberos)
139 sprintf(dp, ";sec=mskrb5"); 143 sprintf(dp, ";sec=mskrb5");
140 else 144 else
141 goto out; 145 goto out;
@@ -144,12 +148,15 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
144 sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); 148 sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
145 149
146 dp = description + strlen(description); 150 dp = description + strlen(description);
151 sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
152
153 dp = description + strlen(description);
147 sprintf(dp, ";user=%s", sesInfo->userName); 154 sprintf(dp, ";user=%s", sesInfo->userName);
148 155
149 dp = description + strlen(description); 156 dp = description + strlen(description);
150 sprintf(dp, ";pid=0x%x", current->pid); 157 sprintf(dp, ";pid=0x%x", current->pid);
151 158
152 cFYI(1, ("key description = %s", description)); 159 cFYI(1, "key description = %s", description);
153 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 160 spnego_key = request_key(&cifs_spnego_key_type, description, "");
154 161
155#ifdef CONFIG_CIFS_DEBUG2 162#ifdef CONFIG_CIFS_DEBUG2
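
The key description above is assembled by repeated sprintf() at the running end of the buffer, so the up-front desc_len arithmetic must budget every segment exactly; that is why adding the ";creduid=0x..." segment also adds CREDUID_KEY_LEN plus two hex digits per byte of uid_t. A standalone sketch of one segment's budget:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CREDUID_KEY_LEN 11          /* strlen of ";creduid=0x" */

int main(void)
{
        unsigned int cred_uid = 1000;
        /* fixed prefix + 2 hex chars per byte of the value + trailing NUL */
        size_t desc_len = CREDUID_KEY_LEN + sizeof(cred_uid) * 2 + 1;
        char *description = malloc(desc_len);
        char *dp;

        if (description == NULL)
                return 1;
        dp = description;
        dp += sprintf(dp, ";creduid=0x%x", cred_uid);
        printf("%s (used %d of %zu bytes)\n",
               description, (int)(dp - description), desc_len);
        free(description);
        return 0;
}
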
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d07676bd76d2..430f510a1720 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -200,9 +200,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
200 /* works for 2.4.0 kernel or later */ 200 /* works for 2.4.0 kernel or later */
201 charlen = codepage->char2uni(from, len, &wchar_to[i]); 201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
202 if (charlen < 1) { 202 if (charlen < 1) {
203 cERROR(1, 203 cERROR(1, "strtoUCS: char2uni of %d returned %d",
204 ("strtoUCS: char2uni of %d returned %d", 204 (int)*from, charlen);
205 (int)*from, charlen));
206 /* A question mark */ 205 /* A question mark */
207 to[i] = cpu_to_le16(0x003f); 206 to[i] = cpu_to_le16(0x003f);
208 charlen = 1; 207 charlen = 1;
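
The conversion loop above leans on the NLS contract that char2uni() returns the number of bytes consumed, or less than 1 when the input is invalid for the codepage; on failure it substitutes U+003F ('?') and advances one byte so a single bad character cannot stall or abort the whole string. A sketch with a hypothetical ASCII-only converter:

#include <stdio.h>

/* hypothetical converter: bytes consumed on success, -1 on bad input */
static int char2uni(const char *from, unsigned short *uni)
{
        if ((unsigned char)*from < 0x80) {
                *uni = (unsigned char)*from;
                return 1;
        }
        return -1;                  /* not valid in this toy codepage */
}

int main(void)
{
        const char *from = "a\xffz"; /* 0xff will fail to convert */
        unsigned short to[8];
        int i = 0, charlen;

        for (const char *p = from; *p != '\0'; p += charlen, i++) {
                charlen = char2uni(p, &to[i]);
                if (charlen < 1) {
                        to[i] = 0x003f; /* substitute a question mark */
                        charlen = 1;    /* and skip the offending byte */
                }
        }
        for (int j = 0; j < i; j++)
                printf("U+%04X\n", to[j]);
        return 0;
}
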
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6f..7fe6b52df507 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
30 * This is a compressed table of upper and lower case conversion. 30 * This is a compressed table of upper and lower case conversion.
31 * 31 *
32 */ 32 */
33#ifndef _CIFS_UNICODE_H
34#define _CIFS_UNICODE_H
33 35
34#include <asm/byteorder.h> 36#include <asm/byteorder.h>
35#include <linux/types.h> 37#include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
67#endif /* UNIUPR_NOUPPER */ 69#endif /* UNIUPR_NOUPPER */
68 70
69#ifndef UNIUPR_NOLOWER 71#ifndef UNIUPR_NOLOWER
70extern signed char UniLowerTable[512]; 72extern signed char CifsUniLowerTable[512];
71extern struct UniCaseRange UniLowerRange[]; 73extern const struct UniCaseRange CifsUniLowerRange[];
72#endif /* UNIUPR_NOLOWER */ 74#endif /* UNIUPR_NOLOWER */
73 75
74#ifdef __KERNEL__ 76#ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
337 * UniTolower: Convert a unicode character to lower case 339 * UniTolower: Convert a unicode character to lower case
338 */ 340 */
339static inline wchar_t 341static inline wchar_t
340UniTolower(wchar_t uc) 342UniTolower(register wchar_t uc)
341{ 343{
342 register struct UniCaseRange *rp; 344 register const struct UniCaseRange *rp;
343 345
344 if (uc < sizeof(UniLowerTable)) { 346 if (uc < sizeof(CifsUniLowerTable)) {
345 /* Latin characters */ 347 /* Latin characters */
346 return uc + UniLowerTable[uc]; /* Use base tables */ 348 return uc + CifsUniLowerTable[uc]; /* Use base tables */
347 } else { 349 } else {
348 rp = UniLowerRange; /* Use range tables */ 350 rp = CifsUniLowerRange; /* Use range tables */
349 while (rp->start) { 351 while (rp->start) {
350 if (uc < rp->start) /* Before start of range */ 352 if (uc < rp->start) /* Before start of range */
351 return uc; /* Uppercase = input */ 353 return uc; /* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
374} 376}
375 377
376#endif 378#endif
379
380#endif /* _CIFS_UNICODE_H */
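
The only structural change to this header is the include guard (the same guard is added to cifsglob.h later in this diff). The pattern, for reference; a second #include of the file becomes a no-op because the macro is already defined on the second pass:

/* example.h -- minimal include-guard sketch */
#ifndef _EXAMPLE_H
#define _EXAMPLE_H

struct example {
        int value;
};

#endif /* _EXAMPLE_H */
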
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e519..0ac7c5a8633a 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
140/* 140/*
141 * Latin lower case 141 * Latin lower case
142 */ 142 */
143static signed char CifsUniLowerTable[512] = { 143signed char CifsUniLowerTable[512] = {
144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ 144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ 145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ 146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
242/* 242/*
243 * Lower Case Range 243 * Lower Case Range
244 */ 244 */
245static const struct UniCaseRange CifsUniLowerRange[] = { 245const struct UniCaseRange CifsUniLowerRange[] = {
246 0x0380, 0x03ab, UniCaseRangeL0380, 246 {0x0380, 0x03ab, UniCaseRangeL0380},
247 0x0400, 0x042f, UniCaseRangeL0400, 247 {0x0400, 0x042f, UniCaseRangeL0400},
248 0x0490, 0x04cb, UniCaseRangeL0490, 248 {0x0490, 0x04cb, UniCaseRangeL0490},
249 0x1e00, 0x1ff7, UniCaseRangeL1e00, 249 {0x1e00, 0x1ff7, UniCaseRangeL1e00},
250 0xff20, 0xff3a, UniCaseRangeLff20, 250 {0xff20, 0xff3a, UniCaseRangeLff20},
251 0, 0, 0 251 {0}
252}; 252};
253#endif 253#endif
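
Two fixes travel together in this table: each array element gets its own brace group (the flattened "0x0380, 0x03ab, UniCaseRangeL0380," form relied on lenient brace elision), and the terminator shrinks to {0}, a fully zeroed element whose start == 0 ends the while (rp->start) walk in UniTolower() above. A compilable miniature of the same table shape:

#include <stdio.h>

struct UniCaseRange {
        unsigned short start;
        unsigned short end;
        const signed char *table;
};

static const signed char dummy_table[1] = { 0 };

static const struct UniCaseRange lower_ranges[] = {
        {0x0380, 0x03ab, dummy_table},
        {0x0400, 0x042f, dummy_table},
        {0}                     /* zeroed sentinel terminates the walk */
};

int main(void)
{
        for (const struct UniCaseRange *rp = lower_ranges; rp->start; rp++)
                printf("range 0x%04x-0x%04x\n", rp->start, rp->end);
        return 0;
}
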
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 9b716d044bbd..85d7cf7ff2c8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -87,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
87 continue; /* all sub_auth values do not match */ 87 continue; /* all sub_auth values do not match */
88 } 88 }
89 89
90 cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); 90 cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
91 return 0; /* sids compare/match */ 91 return 0; /* sids compare/match */
92 } 92 }
93 93
94 cFYI(1, ("No matching sid")); 94 cFYI(1, "No matching sid");
95 return -1; 95 return -1;
96} 96}
97 97
@@ -208,14 +208,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
208 *pbits_to_set &= ~S_IXUGO; 208 *pbits_to_set &= ~S_IXUGO;
209 return; 209 return;
210 } else if (type != ACCESS_ALLOWED) { 210 } else if (type != ACCESS_ALLOWED) {
211 cERROR(1, ("unknown access control type %d", type)); 211 cERROR(1, "unknown access control type %d", type);
212 return; 212 return;
213 } 213 }
214 /* else ACCESS_ALLOWED type */ 214 /* else ACCESS_ALLOWED type */
215 215
216 if (flags & GENERIC_ALL) { 216 if (flags & GENERIC_ALL) {
217 *pmode |= (S_IRWXUGO & (*pbits_to_set)); 217 *pmode |= (S_IRWXUGO & (*pbits_to_set));
218 cFYI(DBG2, ("all perms")); 218 cFYI(DBG2, "all perms");
219 return; 219 return;
220 } 220 }
221 if ((flags & GENERIC_WRITE) || 221 if ((flags & GENERIC_WRITE) ||
@@ -228,7 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
228 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) 228 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
229 *pmode |= (S_IXUGO & (*pbits_to_set)); 229 *pmode |= (S_IXUGO & (*pbits_to_set));
230 230
231 cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode)); 231 cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
232 return; 232 return;
233} 233}
234 234
@@ -257,7 +257,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
257 if (mode & S_IXUGO) 257 if (mode & S_IXUGO)
258 *pace_flags |= SET_FILE_EXEC_RIGHTS; 258 *pace_flags |= SET_FILE_EXEC_RIGHTS;
259 259
260 cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags)); 260 cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
261 return; 261 return;
262} 262}
263 263
@@ -297,24 +297,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
297 /* validate that we do not go past end of acl */ 297 /* validate that we do not go past end of acl */
298 298
299 if (le16_to_cpu(pace->size) < 16) { 299 if (le16_to_cpu(pace->size) < 16) {
300 cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size))); 300 cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
301 return; 301 return;
302 } 302 }
303 303
304 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { 304 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
305 cERROR(1, ("ACL too small to parse ACE")); 305 cERROR(1, "ACL too small to parse ACE");
306 return; 306 return;
307 } 307 }
308 308
309 num_subauth = pace->sid.num_subauth; 309 num_subauth = pace->sid.num_subauth;
310 if (num_subauth) { 310 if (num_subauth) {
311 int i; 311 int i;
312 cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d", 312 cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
313 pace->sid.revision, pace->sid.num_subauth, pace->type, 313 pace->sid.revision, pace->sid.num_subauth, pace->type,
314 pace->flags, le16_to_cpu(pace->size))); 314 pace->flags, le16_to_cpu(pace->size));
315 for (i = 0; i < num_subauth; ++i) { 315 for (i = 0; i < num_subauth; ++i) {
316 cFYI(1, ("ACE sub_auth[%d]: 0x%x", i, 316 cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
317 le32_to_cpu(pace->sid.sub_auth[i]))); 317 le32_to_cpu(pace->sid.sub_auth[i]));
318 } 318 }
319 319
320 /* BB add length check to make sure that we do not have huge 320 /* BB add length check to make sure that we do not have huge
@@ -347,13 +347,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
347 347
348 /* validate that we do not go past end of acl */ 348 /* validate that we do not go past end of acl */
349 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { 349 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
350 cERROR(1, ("ACL too small to parse DACL")); 350 cERROR(1, "ACL too small to parse DACL");
351 return; 351 return;
352 } 352 }
353 353
354 cFYI(DBG2, ("DACL revision %d size %d num aces %d", 354 cFYI(DBG2, "DACL revision %d size %d num aces %d",
355 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), 355 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
356 le32_to_cpu(pdacl->num_aces))); 356 le32_to_cpu(pdacl->num_aces));
357 357
358 /* reset rwx permissions for user/group/other. 358 /* reset rwx permissions for user/group/other.
359 Also, if num_aces is 0 i.e. DACL has no ACEs, 359 Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -437,25 +437,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
437 /* validate that we do not go past end of ACL - sid must be at least 8 437 /* validate that we do not go past end of ACL - sid must be at least 8
438 bytes long (assuming no sub-auths - e.g. the null SID */ 438 bytes long (assuming no sub-auths - e.g. the null SID */
439 if (end_of_acl < (char *)psid + 8) { 439 if (end_of_acl < (char *)psid + 8) {
440 cERROR(1, ("ACL too small to parse SID %p", psid)); 440 cERROR(1, "ACL too small to parse SID %p", psid);
441 return -EINVAL; 441 return -EINVAL;
442 } 442 }
443 443
444 if (psid->num_subauth) { 444 if (psid->num_subauth) {
445#ifdef CONFIG_CIFS_DEBUG2 445#ifdef CONFIG_CIFS_DEBUG2
446 int i; 446 int i;
447 cFYI(1, ("SID revision %d num_auth %d", 447 cFYI(1, "SID revision %d num_auth %d",
448 psid->revision, psid->num_subauth)); 448 psid->revision, psid->num_subauth);
449 449
450 for (i = 0; i < psid->num_subauth; i++) { 450 for (i = 0; i < psid->num_subauth; i++) {
451 cFYI(1, ("SID sub_auth[%d]: 0x%x ", i, 451 cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
452 le32_to_cpu(psid->sub_auth[i]))); 452 le32_to_cpu(psid->sub_auth[i]));
453 } 453 }
454 454
455 /* BB add length check to make sure that we do not have huge 455 /* BB add length check to make sure that we do not have huge
456 num auths and therefore go off the end */ 456 num auths and therefore go off the end */
457 cFYI(1, ("RID 0x%x", 457 cFYI(1, "RID 0x%x",
458 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]))); 458 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
459#endif 459#endif
460 } 460 }
461 461
@@ -482,11 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
482 le32_to_cpu(pntsd->gsidoffset)); 482 le32_to_cpu(pntsd->gsidoffset));
483 dacloffset = le32_to_cpu(pntsd->dacloffset); 483 dacloffset = le32_to_cpu(pntsd->dacloffset);
484 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); 484 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
485 cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x " 485 cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
486 "sacloffset 0x%x dacloffset 0x%x", 486 "sacloffset 0x%x dacloffset 0x%x",
487 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), 487 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
488 le32_to_cpu(pntsd->gsidoffset), 488 le32_to_cpu(pntsd->gsidoffset),
489 le32_to_cpu(pntsd->sacloffset), dacloffset)); 489 le32_to_cpu(pntsd->sacloffset), dacloffset);
490/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ 490/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
491 rc = parse_sid(owner_sid_ptr, end_of_acl); 491 rc = parse_sid(owner_sid_ptr, end_of_acl);
492 if (rc) 492 if (rc)
@@ -500,7 +500,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
500 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 500 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
501 group_sid_ptr, fattr); 501 group_sid_ptr, fattr);
502 else 502 else
503 cFYI(1, ("no ACL")); /* BB grant all or default perms? */ 503 cFYI(1, "no ACL"); /* BB grant all or default perms? */
504 504
505/* cifscred->uid = owner_sid_ptr->rid; 505/* cifscred->uid = owner_sid_ptr->rid;
506 cifscred->gid = group_sid_ptr->rid; 506 cifscred->gid = group_sid_ptr->rid;
@@ -563,7 +563,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
563 FreeXid(xid); 563 FreeXid(xid);
564 564
565 565
566 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
567 return pntsd; 567 return pntsd;
568} 568}
569 569
@@ -581,12 +581,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
581 &fid, &oplock, NULL, cifs_sb->local_nls, 581 &fid, &oplock, NULL, cifs_sb->local_nls,
582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 if (rc) { 583 if (rc) {
584 cERROR(1, ("Unable to open file to get ACL")); 584 cERROR(1, "Unable to open file to get ACL");
585 goto out; 585 goto out;
586 } 586 }
587 587
588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
589 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
590 590
591 CIFSSMBClose(xid, cifs_sb->tcon, fid); 591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
592 out: 592 out:
@@ -621,7 +621,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
622 FreeXid(xid); 622 FreeXid(xid);
623 623
624 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 624 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
625 return rc; 625 return rc;
626} 626}
627 627
@@ -638,12 +638,12 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
638 &fid, &oplock, NULL, cifs_sb->local_nls, 638 &fid, &oplock, NULL, cifs_sb->local_nls,
639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
640 if (rc) { 640 if (rc) {
641 cERROR(1, ("Unable to open file to set ACL")); 641 cERROR(1, "Unable to open file to set ACL");
642 goto out; 642 goto out;
643 } 643 }
644 644
645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
646 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 646 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
647 647
648 CIFSSMBClose(xid, cifs_sb->tcon, fid); 648 CIFSSMBClose(xid, cifs_sb->tcon, fid);
649 out: 649 out:
@@ -659,7 +659,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
659 struct cifsFileInfo *open_file; 659 struct cifsFileInfo *open_file;
660 int rc; 660 int rc;
661 661
662 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
663 663
664 open_file = find_readable_file(CIFS_I(inode)); 664 open_file = find_readable_file(CIFS_I(inode));
665 if (!open_file) 665 if (!open_file)
@@ -679,7 +679,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
679 u32 acllen = 0; 679 u32 acllen = 0;
680 int rc = 0; 680 int rc = 0;
681 681
682 cFYI(DBG2, ("converting ACL to mode for %s", path)); 682 cFYI(DBG2, "converting ACL to mode for %s", path);
683 683
684 if (pfid) 684 if (pfid)
685 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); 685 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -690,7 +690,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
690 if (pntsd) 690 if (pntsd)
691 rc = parse_sec_desc(pntsd, acllen, fattr); 691 rc = parse_sec_desc(pntsd, acllen, fattr);
692 if (rc) 692 if (rc)
693 cFYI(1, ("parse sec desc failed rc = %d", rc)); 693 cFYI(1, "parse sec desc failed rc = %d", rc);
694 694
695 kfree(pntsd); 695 kfree(pntsd);
696 return; 696 return;
@@ -704,7 +704,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
704 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ 704 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
705 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ 705 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
706 706
707 cFYI(DBG2, ("set ACL from mode for %s", path)); 707 cFYI(DBG2, "set ACL from mode for %s", path);
708 708
709 /* Get the security descriptor */ 709 /* Get the security descriptor */
710 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 710 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
@@ -721,19 +721,19 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
721 DEFSECDESCLEN : secdesclen; 721 DEFSECDESCLEN : secdesclen;
722 pnntsd = kmalloc(secdesclen, GFP_KERNEL); 722 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
723 if (!pnntsd) { 723 if (!pnntsd) {
724 cERROR(1, ("Unable to allocate security descriptor")); 724 cERROR(1, "Unable to allocate security descriptor");
725 kfree(pntsd); 725 kfree(pntsd);
726 return -ENOMEM; 726 return -ENOMEM;
727 } 727 }
728 728
729 rc = build_sec_desc(pntsd, pnntsd, inode, nmode); 729 rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
730 730
731 cFYI(DBG2, ("build_sec_desc rc: %d", rc)); 731 cFYI(DBG2, "build_sec_desc rc: %d", rc);
732 732
733 if (!rc) { 733 if (!rc) {
734 /* Set the security descriptor */ 734 /* Set the security descriptor */
735 rc = set_cifs_acl(pnntsd, secdesclen, inode, path); 735 rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
736 cFYI(DBG2, ("set_cifs_acl rc: %d", rc)); 736 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
737 } 737 }
738 738
739 kfree(pnntsd); 739 kfree(pnntsd);
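
A recurring shape in cifsacl.c: every structure in the security descriptor carries a self-declared size, and since the descriptor arrives off the wire, each parser checks that size against end_of_acl before dereferencing further. A user-space distillation of the check (the real code also byte-swaps with le16_to_cpu(), omitted here):

#include <stdint.h>
#include <stdio.h>

struct ace {                    /* simplified wire format */
        uint16_t size;          /* self-declared, attacker-controlled */
        unsigned char body[];
};

static int parse_ace(const struct ace *pace, const char *end_of_acl)
{
        if (pace->size < sizeof(*pace))
                return -1;      /* too small to be a valid ACE */
        if (end_of_acl < (const char *)pace + pace->size)
                return -1;      /* declared size runs past the buffer */
        return 0;               /* body is safe to read */
}

int main(void)
{
        uint16_t buf[8] = { 0 };
        struct ace *pace = (struct ace *)buf;

        pace->size = 64;        /* lies: the buffer is only 16 bytes */
        printf("parse_ace: %d\n",
               parse_ace(pace, (const char *)buf + sizeof(buf)));
        return 0;
}
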
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index fbe986430d0c..35042d8f7338 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -103,7 +103,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
103 if (iov[i].iov_len == 0) 103 if (iov[i].iov_len == 0)
104 continue; 104 continue;
105 if (iov[i].iov_base == NULL) { 105 if (iov[i].iov_base == NULL) {
106 cERROR(1, ("null iovec entry")); 106 cERROR(1, "null iovec entry");
107 return -EIO; 107 return -EIO;
108 } 108 }
109 /* The first entry includes a length field (which does not get 109 /* The first entry includes a length field (which does not get
@@ -181,8 +181,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
181 181
182 /* Do not need to verify session setups with signature "BSRSPYL " */ 182 /* Do not need to verify session setups with signature "BSRSPYL " */
183 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) 183 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
184 cFYI(1, ("dummy signature received for smb command 0x%x", 184 cFYI(1, "dummy signature received for smb command 0x%x",
185 cifs_pdu->Command)); 185 cifs_pdu->Command);
186 186
187 /* save off the original signature so we can modify the smb and check 187 /* save off the original signature so we can modify the smb and check
188 its signature against what the server sent */ 188 its signature against what the server sent */
@@ -223,63 +223,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
223 return 0; 223 return 0;
224} 224}
225 225
226int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
227 const struct nls_table *nls_info)
228{
229 char temp_hash[16];
230 struct HMACMD5Context ctx;
231 char *ucase_buf;
232 __le16 *unicode_buf;
233 unsigned int i, user_name_len, dom_name_len;
234
235 if (ses == NULL)
236 return -EINVAL;
237
238 E_md4hash(ses->password, temp_hash);
239
240 hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
241 user_name_len = strlen(ses->userName);
242 if (user_name_len > MAX_USERNAME_SIZE)
243 return -EINVAL;
244 if (ses->domainName == NULL)
245 return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
246 dom_name_len = strlen(ses->domainName);
247 if (dom_name_len > MAX_USERNAME_SIZE)
248 return -EINVAL;
249
250 ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
251 if (ucase_buf == NULL)
252 return -ENOMEM;
253 unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
254 if (unicode_buf == NULL) {
255 kfree(ucase_buf);
256 return -ENOMEM;
257 }
258
259 for (i = 0; i < user_name_len; i++)
260 ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
261 ucase_buf[i] = 0;
262 user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
263 MAX_USERNAME_SIZE*2, nls_info);
264 unicode_buf[user_name_len] = 0;
265 user_name_len++;
266
267 for (i = 0; i < dom_name_len; i++)
268 ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
269 ucase_buf[i] = 0;
270 dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
271 MAX_USERNAME_SIZE*2, nls_info);
272
273 unicode_buf[user_name_len + dom_name_len] = 0;
274 hmac_md5_update((const unsigned char *) unicode_buf,
275 (user_name_len+dom_name_len)*2, &ctx);
276
277 hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
278 kfree(ucase_buf);
279 kfree(unicode_buf);
280 return 0;
281}
282
283#ifdef CONFIG_CIFS_WEAK_PW_HASH 226#ifdef CONFIG_CIFS_WEAK_PW_HASH
284void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, 227void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
285 char *lnm_session_key) 228 char *lnm_session_key)
@@ -291,7 +234,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
291 if (password) 234 if (password)
292 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 235 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
293 236
294 if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) { 237 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
295 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); 238 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
296 memcpy(lnm_session_key, password_with_pad, 239 memcpy(lnm_session_key, password_with_pad,
297 CIFS_ENCPWD_SIZE); 240 CIFS_ENCPWD_SIZE);
@@ -398,7 +341,7 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
398 /* calculate buf->ntlmv2_hash */ 341 /* calculate buf->ntlmv2_hash */
399 rc = calc_ntlmv2_hash(ses, nls_cp); 342 rc = calc_ntlmv2_hash(ses, nls_cp);
400 if (rc) 343 if (rc)
401 cERROR(1, ("could not get v2 hash rc %d", rc)); 344 cERROR(1, "could not get v2 hash rc %d", rc);
402 CalcNTLMv2_response(ses, resp_buf); 345 CalcNTLMv2_response(ses, resp_buf);
403 346
404 /* now calculate the MAC key for NTLMv2 */ 347 /* now calculate the MAC key for NTLMv2 */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ad235d604a0b..b7431afdd76d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -45,14 +45,10 @@
45#include "cifs_fs_sb.h" 45#include "cifs_fs_sb.h"
46#include <linux/mm.h> 46#include <linux/mm.h>
47#include <linux/key-type.h> 47#include <linux/key-type.h>
48#include "dns_resolve.h"
49#include "cifs_spnego.h" 48#include "cifs_spnego.h"
49#include "fscache.h"
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA
53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */
55
56int cifsFYI = 0; 52int cifsFYI = 0;
57int cifsERROR = 1; 53int cifsERROR = 1;
58int traceSMB = 0; 54int traceSMB = 0;
@@ -61,7 +57,7 @@ unsigned int experimEnabled = 0;
61unsigned int linuxExtEnabled = 1; 57unsigned int linuxExtEnabled = 1;
62unsigned int lookupCacheEnabled = 1; 58unsigned int lookupCacheEnabled = 1;
63unsigned int multiuser_mount = 0; 59unsigned int multiuser_mount = 0;
64unsigned int extended_security = CIFSSEC_DEF; 60unsigned int global_secflags = CIFSSEC_DEF;
65/* unsigned int ntlmv2_support = 0; */ 61/* unsigned int ntlmv2_support = 0; */
66unsigned int sign_CIFS_PDUs = 1; 62unsigned int sign_CIFS_PDUs = 1;
67static const struct super_operations cifs_super_ops; 63static const struct super_operations cifs_super_ops;
@@ -86,8 +82,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 82extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 83extern mempool_t *cifs_mid_poolp;
88 84
89extern struct kmem_cache *cifs_oplock_cachep;
90
91static int 85static int
92cifs_read_super(struct super_block *sb, void *data, 86cifs_read_super(struct super_block *sb, void *data,
93 const char *devname, int silent) 87 const char *devname, int silent)
@@ -135,8 +129,7 @@ cifs_read_super(struct super_block *sb, void *data,
135 129
136 if (rc) { 130 if (rc) {
137 if (!silent) 131 if (!silent)
138 cERROR(1, 132 cERROR(1, "cifs_mount failed w/return code = %d", rc);
139 ("cifs_mount failed w/return code = %d", rc));
140 goto out_mount_failed; 133 goto out_mount_failed;
141 } 134 }
142 135
@@ -146,9 +139,6 @@ cifs_read_super(struct super_block *sb, void *data,
146/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
147 sb->s_blocksize = 140 sb->s_blocksize =
148 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
149#ifdef CONFIG_CIFS_QUOTA
150 sb->s_qcop = &cifs_quotactl_ops;
151#endif
152 sb->s_blocksize = CIFS_MAX_MSGSIZE; 142 sb->s_blocksize = CIFS_MAX_MSGSIZE;
153 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
154 inode = cifs_root_iget(sb, ROOT_I); 144 inode = cifs_root_iget(sb, ROOT_I);
@@ -168,7 +158,7 @@ cifs_read_super(struct super_block *sb, void *data,
168 158
169#ifdef CONFIG_CIFS_EXPERIMENTAL 159#ifdef CONFIG_CIFS_EXPERIMENTAL
170 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
171 cFYI(1, ("export ops supported")); 161 cFYI(1, "export ops supported");
172 sb->s_export_op = &cifs_export_ops; 162 sb->s_export_op = &cifs_export_ops;
173 } 163 }
174#endif /* EXPERIMENTAL */ 164#endif /* EXPERIMENTAL */
@@ -176,7 +166,7 @@ cifs_read_super(struct super_block *sb, void *data,
176 return 0; 166 return 0;
177 167
178out_no_root: 168out_no_root:
179 cERROR(1, ("cifs_read_super: get root inode failed")); 169 cERROR(1, "cifs_read_super: get root inode failed");
180 if (inode) 170 if (inode)
181 iput(inode); 171 iput(inode);
182 172
@@ -203,10 +193,10 @@ cifs_put_super(struct super_block *sb)
203 int rc = 0; 193 int rc = 0;
204 struct cifs_sb_info *cifs_sb; 194 struct cifs_sb_info *cifs_sb;
205 195
206 cFYI(1, ("In cifs_put_super")); 196 cFYI(1, "In cifs_put_super");
207 cifs_sb = CIFS_SB(sb); 197 cifs_sb = CIFS_SB(sb);
208 if (cifs_sb == NULL) { 198 if (cifs_sb == NULL) {
209 cFYI(1, ("Empty cifs superblock info passed to unmount")); 199 cFYI(1, "Empty cifs superblock info passed to unmount");
210 return; 200 return;
211 } 201 }
212 202
@@ -214,7 +204,7 @@ cifs_put_super(struct super_block *sb)
214 204
215 rc = cifs_umount(sb, cifs_sb); 205 rc = cifs_umount(sb, cifs_sb);
216 if (rc) 206 if (rc)
217 cERROR(1, ("cifs_umount failed with return code %d", rc)); 207 cERROR(1, "cifs_umount failed with return code %d", rc);
218#ifdef CONFIG_CIFS_DFS_UPCALL 208#ifdef CONFIG_CIFS_DFS_UPCALL
219 if (cifs_sb->mountdata) { 209 if (cifs_sb->mountdata) {
220 kfree(cifs_sb->mountdata); 210 kfree(cifs_sb->mountdata);
@@ -300,7 +290,6 @@ static int cifs_permission(struct inode *inode, int mask)
300static struct kmem_cache *cifs_inode_cachep; 290static struct kmem_cache *cifs_inode_cachep;
301static struct kmem_cache *cifs_req_cachep; 291static struct kmem_cache *cifs_req_cachep;
302static struct kmem_cache *cifs_mid_cachep; 292static struct kmem_cache *cifs_mid_cachep;
303struct kmem_cache *cifs_oplock_cachep;
304static struct kmem_cache *cifs_sm_req_cachep; 293static struct kmem_cache *cifs_sm_req_cachep;
305mempool_t *cifs_sm_req_poolp; 294mempool_t *cifs_sm_req_poolp;
306mempool_t *cifs_req_poolp; 295mempool_t *cifs_req_poolp;
@@ -340,6 +329,14 @@ cifs_destroy_inode(struct inode *inode)
340} 329}
341 330
342static void 331static void
332cifs_evict_inode(struct inode *inode)
333{
334 truncate_inode_pages(&inode->i_data, 0);
335 end_writeback(inode);
336 cifs_fscache_release_inode_cookie(inode);
337}
338
339static void
343cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 340cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
344{ 341{
345 seq_printf(s, ",addr="); 342 seq_printf(s, ",addr=");
@@ -432,106 +429,6 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
432 return 0; 429 return 0;
433} 430}
434 431
435#ifdef CONFIG_CIFS_QUOTA
436int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
437 struct fs_disk_quota *pdquota)
438{
439 int xid;
440 int rc = 0;
441 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
442 struct cifsTconInfo *pTcon;
443
444 if (cifs_sb)
445 pTcon = cifs_sb->tcon;
446 else
447 return -EIO;
448
449
450 xid = GetXid();
451 if (pTcon) {
452 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
453 } else
454 rc = -EIO;
455
456 FreeXid(xid);
457 return rc;
458}
459
460int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
461 struct fs_disk_quota *pdquota)
462{
463 int xid;
464 int rc = 0;
465 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
466 struct cifsTconInfo *pTcon;
467
468 if (cifs_sb)
469 pTcon = cifs_sb->tcon;
470 else
471 return -EIO;
472
473 xid = GetXid();
474 if (pTcon) {
475 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
476 } else
477 rc = -EIO;
478
479 FreeXid(xid);
480 return rc;
481}
482
483int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
484{
485 int xid;
486 int rc = 0;
487 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
488 struct cifsTconInfo *pTcon;
489
490 if (cifs_sb)
491 pTcon = cifs_sb->tcon;
492 else
493 return -EIO;
494
495 xid = GetXid();
496 if (pTcon) {
497 cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
498 } else
499 rc = -EIO;
500
501 FreeXid(xid);
502 return rc;
503}
504
505int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
506{
507 int xid;
508 int rc = 0;
509 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
510 struct cifsTconInfo *pTcon;
511
512 if (cifs_sb)
513 pTcon = cifs_sb->tcon;
514 else
515 return -EIO;
516
517 xid = GetXid();
518 if (pTcon) {
519 cFYI(1, ("pqstats %p", qstats));
520 } else
521 rc = -EIO;
522
523 FreeXid(xid);
524 return rc;
525}
526
527static const struct quotactl_ops cifs_quotactl_ops = {
528 .set_xquota = cifs_xquota_set,
529 .get_xquota = cifs_xquota_get,
530 .set_xstate = cifs_xstate_set,
531 .get_xstate = cifs_xstate_get,
532};
533#endif
534
535static void cifs_umount_begin(struct super_block *sb) 432static void cifs_umount_begin(struct super_block *sb)
536{ 433{
537 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 434 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -558,7 +455,7 @@ static void cifs_umount_begin(struct super_block *sb)
558 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 455 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
559 /* cancel_notify_requests(tcon); */ 456 /* cancel_notify_requests(tcon); */
560 if (tcon->ses && tcon->ses->server) { 457 if (tcon->ses && tcon->ses->server) {
561 cFYI(1, ("wake up tasks now - umount begin not complete")); 458 cFYI(1, "wake up tasks now - umount begin not complete");
562 wake_up_all(&tcon->ses->server->request_q); 459 wake_up_all(&tcon->ses->server->request_q);
563 wake_up_all(&tcon->ses->server->response_q); 460 wake_up_all(&tcon->ses->server->response_q);
564 msleep(1); /* yield */ 461 msleep(1); /* yield */
@@ -584,14 +481,24 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data)
584 return 0; 481 return 0;
585} 482}
586 483
484static int cifs_drop_inode(struct inode *inode)
485{
486 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
487
488 /* no serverino => unconditional eviction */
489 return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
490 generic_drop_inode(inode);
491}
492
587static const struct super_operations cifs_super_ops = { 493static const struct super_operations cifs_super_ops = {
588 .put_super = cifs_put_super, 494 .put_super = cifs_put_super,
589 .statfs = cifs_statfs, 495 .statfs = cifs_statfs,
590 .alloc_inode = cifs_alloc_inode, 496 .alloc_inode = cifs_alloc_inode,
591 .destroy_inode = cifs_destroy_inode, 497 .destroy_inode = cifs_destroy_inode,
592/* .drop_inode = generic_delete_inode, 498 .drop_inode = cifs_drop_inode,
593 .delete_inode = cifs_delete_inode, */ /* Do not need above two 499 .evict_inode = cifs_evict_inode,
594 functions unless later we add lazy close of inodes or unless the 500/* .delete_inode = cifs_delete_inode, */ /* Do not need above
501 function unless later we add lazy close of inodes or unless the
595 kernel forgets to call us with the same number of releases (closes) 502 kernel forgets to call us with the same number of releases (closes)
596 as opens */ 503 as opens */
597 .show_options = cifs_show_options, 504 .show_options = cifs_show_options,
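
Two new superblock hooks replace the long-commented-out pair: .evict_inode is the teardown path (truncate cached pages, end writeback, release the fscache cookie), while .drop_inode decides, once the last reference goes away, whether the inode is worth keeping cached at all. Without server-assigned inode numbers a cached inode cannot be trusted to match the server object later, so cifs_drop_inode evicts unconditionally in that case. A sketch of the decision, with an illustrative flag value:

#include <stdio.h>

#define CIFS_MOUNT_SERVER_INUM 0x1  /* illustrative bit, not the real one */

/* nonzero: evict now; zero: generic policy may keep the inode cached */
static int cifs_drop_inode_sketch(unsigned int mnt_cifs_flags,
                                  int generic_says_drop)
{
        if (!(mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
                return 1;             /* no stable inode numbers: drop */
        return generic_says_drop;     /* otherwise defer to the generic rule */
}

int main(void)
{
        printf("no serverino: %d\n", cifs_drop_inode_sketch(0, 0));
        printf("serverino:    %d\n",
               cifs_drop_inode_sketch(CIFS_MOUNT_SERVER_INUM, 0));
        return 0;
}
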
@@ -609,7 +516,7 @@ cifs_get_sb(struct file_system_type *fs_type,
609 int rc; 516 int rc;
610 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 517 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
611 518
612 cFYI(1, ("Devname: %s flags: %d ", dev_name, flags)); 519 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
613 520
614 if (IS_ERR(sb)) 521 if (IS_ERR(sb))
615 return PTR_ERR(sb); 522 return PTR_ERR(sb);
@@ -656,7 +563,6 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
656 return generic_file_llseek_unlocked(file, offset, origin); 563 return generic_file_llseek_unlocked(file, offset, origin);
657} 564}
658 565
659#ifdef CONFIG_CIFS_EXPERIMENTAL
660static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 566static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
661{ 567{
662 /* note that this is called by vfs setlease with the BKL held 568 /* note that this is called by vfs setlease with the BKL held
@@ -685,7 +591,6 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
685 else 591 else
686 return -EAGAIN; 592 return -EAGAIN;
687} 593}
688#endif
689 594
690struct file_system_type cifs_fs_type = { 595struct file_system_type cifs_fs_type = {
691 .owner = THIS_MODULE, 596 .owner = THIS_MODULE,
@@ -762,10 +667,7 @@ const struct file_operations cifs_file_ops = {
762#ifdef CONFIG_CIFS_POSIX 667#ifdef CONFIG_CIFS_POSIX
763 .unlocked_ioctl = cifs_ioctl, 668 .unlocked_ioctl = cifs_ioctl,
764#endif /* CONFIG_CIFS_POSIX */ 669#endif /* CONFIG_CIFS_POSIX */
765
766#ifdef CONFIG_CIFS_EXPERIMENTAL
767 .setlease = cifs_setlease, 670 .setlease = cifs_setlease,
768#endif /* CONFIG_CIFS_EXPERIMENTAL */
769}; 671};
770 672
771const struct file_operations cifs_file_direct_ops = { 673const struct file_operations cifs_file_direct_ops = {
@@ -784,9 +686,7 @@ const struct file_operations cifs_file_direct_ops = {
784 .unlocked_ioctl = cifs_ioctl, 686 .unlocked_ioctl = cifs_ioctl,
785#endif /* CONFIG_CIFS_POSIX */ 687#endif /* CONFIG_CIFS_POSIX */
786 .llseek = cifs_llseek, 688 .llseek = cifs_llseek,
787#ifdef CONFIG_CIFS_EXPERIMENTAL
788 .setlease = cifs_setlease, 689 .setlease = cifs_setlease,
789#endif /* CONFIG_CIFS_EXPERIMENTAL */
790}; 690};
791const struct file_operations cifs_file_nobrl_ops = { 691const struct file_operations cifs_file_nobrl_ops = {
792 .read = do_sync_read, 692 .read = do_sync_read,
@@ -803,10 +703,7 @@ const struct file_operations cifs_file_nobrl_ops = {
803#ifdef CONFIG_CIFS_POSIX 703#ifdef CONFIG_CIFS_POSIX
804 .unlocked_ioctl = cifs_ioctl, 704 .unlocked_ioctl = cifs_ioctl,
805#endif /* CONFIG_CIFS_POSIX */ 705#endif /* CONFIG_CIFS_POSIX */
806
807#ifdef CONFIG_CIFS_EXPERIMENTAL
808 .setlease = cifs_setlease, 706 .setlease = cifs_setlease,
809#endif /* CONFIG_CIFS_EXPERIMENTAL */
810}; 707};
811 708
812const struct file_operations cifs_file_direct_nobrl_ops = { 709const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -824,9 +721,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
824 .unlocked_ioctl = cifs_ioctl, 721 .unlocked_ioctl = cifs_ioctl,
825#endif /* CONFIG_CIFS_POSIX */ 722#endif /* CONFIG_CIFS_POSIX */
826 .llseek = cifs_llseek, 723 .llseek = cifs_llseek,
827#ifdef CONFIG_CIFS_EXPERIMENTAL
828 .setlease = cifs_setlease, 724 .setlease = cifs_setlease,
829#endif /* CONFIG_CIFS_EXPERIMENTAL */
830}; 725};
831 726
832const struct file_operations cifs_dir_ops = { 727const struct file_operations cifs_dir_ops = {
@@ -878,7 +773,7 @@ cifs_init_request_bufs(void)
878 } else { 773 } else {
879 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ 774 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
880 } 775 }
881/* cERROR(1,("CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize)); */ 776/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
882 cifs_req_cachep = kmem_cache_create("cifs_request", 777 cifs_req_cachep = kmem_cache_create("cifs_request",
883 CIFSMaxBufSize + 778 CIFSMaxBufSize +
884 MAX_CIFS_HDR_SIZE, 0, 779 MAX_CIFS_HDR_SIZE, 0,
@@ -890,7 +785,7 @@ cifs_init_request_bufs(void)
890 cifs_min_rcv = 1; 785 cifs_min_rcv = 1;
891 else if (cifs_min_rcv > 64) { 786 else if (cifs_min_rcv > 64) {
892 cifs_min_rcv = 64; 787 cifs_min_rcv = 64;
893 cERROR(1, ("cifs_min_rcv set to maximum (64)")); 788 cERROR(1, "cifs_min_rcv set to maximum (64)");
894 } 789 }
895 790
896 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 791 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -921,7 +816,7 @@ cifs_init_request_bufs(void)
921 cifs_min_small = 2; 816 cifs_min_small = 2;
922 else if (cifs_min_small > 256) { 817 else if (cifs_min_small > 256) {
923 cifs_min_small = 256; 818 cifs_min_small = 256;
924 cFYI(1, ("cifs_min_small set to maximum (256)")); 819 cFYI(1, "cifs_min_small set to maximum (256)");
925 } 820 }
926 821
927 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 822 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -962,15 +857,6 @@ cifs_init_mids(void)
962 return -ENOMEM; 857 return -ENOMEM;
963 } 858 }
964 859
965 cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
966 sizeof(struct oplock_q_entry), 0,
967 SLAB_HWCACHE_ALIGN, NULL);
968 if (cifs_oplock_cachep == NULL) {
969 mempool_destroy(cifs_mid_poolp);
970 kmem_cache_destroy(cifs_mid_cachep);
971 return -ENOMEM;
972 }
973
974 return 0; 860 return 0;
975} 861}
976 862
@@ -979,7 +865,6 @@ cifs_destroy_mids(void)
979{ 865{
980 mempool_destroy(cifs_mid_poolp); 866 mempool_destroy(cifs_mid_poolp);
981 kmem_cache_destroy(cifs_mid_cachep); 867 kmem_cache_destroy(cifs_mid_cachep);
982 kmem_cache_destroy(cifs_oplock_cachep);
983} 868}
984 869
985static int __init 870static int __init
@@ -1019,12 +904,16 @@ init_cifs(void)
1019 904
1020 if (cifs_max_pending < 2) { 905 if (cifs_max_pending < 2) {
1021 cifs_max_pending = 2; 906 cifs_max_pending = 2;
1022 cFYI(1, ("cifs_max_pending set to min of 2")); 907 cFYI(1, "cifs_max_pending set to min of 2");
1023 } else if (cifs_max_pending > 256) { 908 } else if (cifs_max_pending > 256) {
1024 cifs_max_pending = 256; 909 cifs_max_pending = 256;
1025 cFYI(1, ("cifs_max_pending set to max of 256")); 910 cFYI(1, "cifs_max_pending set to max of 256");
1026 } 911 }
1027 912
913 rc = cifs_fscache_register();
914 if (rc)
915 goto out;
916
1028 rc = cifs_init_inodecache(); 917 rc = cifs_init_inodecache();
1029 if (rc) 918 if (rc)
1030 goto out_clean_proc; 919 goto out_clean_proc;
@@ -1045,27 +934,13 @@ init_cifs(void)
1045 if (rc) 934 if (rc)
1046 goto out_unregister_filesystem; 935 goto out_unregister_filesystem;
1047#endif 936#endif
1048#ifdef CONFIG_CIFS_DFS_UPCALL
1049 rc = register_key_type(&key_type_dns_resolver);
1050 if (rc)
1051 goto out_unregister_key_type;
1052#endif
1053 rc = slow_work_register_user(THIS_MODULE);
1054 if (rc)
1055 goto out_unregister_resolver_key;
1056 937
1057 return 0; 938 return 0;
1058 939
1059 out_unregister_resolver_key:
1060#ifdef CONFIG_CIFS_DFS_UPCALL
1061 unregister_key_type(&key_type_dns_resolver);
1062 out_unregister_key_type:
1063#endif
1064#ifdef CONFIG_CIFS_UPCALL 940#ifdef CONFIG_CIFS_UPCALL
1065 unregister_key_type(&cifs_spnego_key_type);
1066 out_unregister_filesystem: 941 out_unregister_filesystem:
1067#endif
1068 unregister_filesystem(&cifs_fs_type); 942 unregister_filesystem(&cifs_fs_type);
943#endif
1069 out_destroy_request_bufs: 944 out_destroy_request_bufs:
1070 cifs_destroy_request_bufs(); 945 cifs_destroy_request_bufs();
1071 out_destroy_mids: 946 out_destroy_mids:
@@ -1074,17 +949,19 @@ init_cifs(void)
1074 cifs_destroy_inodecache(); 949 cifs_destroy_inodecache();
1075 out_clean_proc: 950 out_clean_proc:
1076 cifs_proc_clean(); 951 cifs_proc_clean();
952 cifs_fscache_unregister();
953 out:
1077 return rc; 954 return rc;
1078} 955}
1079 956
1080static void __exit 957static void __exit
1081exit_cifs(void) 958exit_cifs(void)
1082{ 959{
1083 cFYI(DBG2, ("exit_cifs")); 960 cFYI(DBG2, "exit_cifs");
1084 cifs_proc_clean(); 961 cifs_proc_clean();
962 cifs_fscache_unregister();
1085#ifdef CONFIG_CIFS_DFS_UPCALL 963#ifdef CONFIG_CIFS_DFS_UPCALL
1086 cifs_dfs_release_automount_timer(); 964 cifs_dfs_release_automount_timer();
1087 unregister_key_type(&key_type_dns_resolver);
1088#endif 965#endif
1089#ifdef CONFIG_CIFS_UPCALL 966#ifdef CONFIG_CIFS_UPCALL
1090 unregister_key_type(&cifs_spnego_key_type); 967 unregister_key_type(&cifs_spnego_key_type);
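
init_cifs() unwinds with the usual goto ladder: labels sit in reverse order of initialization, so a failure at step N jumps to the label that tears down steps N-1 and earlier. Deleting the dns-resolver and slow_work registrations therefore also means deleting their labels, and the new fscache registration adds a plain "out:" at the bottom. The idiom in miniature:

#include <stdio.h>

static int  init_cache(void)  { return 0; }
static void exit_cache(void)  { printf("cache torn down\n"); }
static int  init_proto(void)  { return -1; }  /* pretend this step fails */

static int init_module_sketch(void)
{
        int rc;

        rc = init_cache();
        if (rc)
                goto out;               /* nothing set up yet */
        rc = init_proto();
        if (rc)
                goto out_destroy_cache; /* undo exactly what succeeded */
        return 0;

out_destroy_cache:
        exit_cache();
out:
        return rc;
}

int main(void)
{
        printf("init rc = %d\n", init_module_sketch());
        return 0;
}
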
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7aa57ecdc437..d82f5fb4761e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 85 size_t write_size, loff_t *poffset);
86extern int cifs_lock(struct file *, int, struct file_lock *); 86extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, struct dentry *, int); 87extern int cifs_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 88extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 89extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 90extern const struct file_operations cifs_dir_ops;
@@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
116 116
117#define CIFS_VERSION "1.62" 117#define CIFS_VERSION "1.65"
118#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0c2fd17439c8..0cdfb8c32ac6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,10 +16,13 @@
  *   the GNU Lesser General Public License for more details.
  *
  */
+#ifndef _CIFS_GLOB_H
+#define _CIFS_GLOB_H
+
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/slab.h>
-#include <linux/slow-work.h>
+#include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
 /*
@@ -34,7 +37,7 @@
 #define MAX_SHARE_SIZE 64	/* used to be 20, this should still be enough */
 #define MAX_USERNAME_SIZE 32	/* 32 is to allow for 15 char names + null
				   termination then *2 for unicode versions */
-#define MAX_PASSWORD_SIZE 16
+#define MAX_PASSWORD_SIZE 512	/* max for windows seems to be 256 wide chars */
 
 #define CIFS_MIN_RCV_POOL 4
 
@@ -80,14 +83,12 @@ enum statusEnum {
 };
 
 enum securityEnum {
-	PLAINTXT = 0,	/* Legacy with Plaintext passwords */
-	LANMAN,		/* Legacy LANMAN auth */
+	LANMAN = 0,	/* Legacy LANMAN auth */
 	NTLM,		/* Legacy NTLM012 auth with NTLM hash */
 	NTLMv2,		/* Legacy NTLM auth with NTLMv2 hash */
 	RawNTLMSSP,	/* NTLMSSP without SPNEGO, NTLMv2 hash */
 /* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
 	Kerberos,	/* Kerberos via SPNEGO */
-	MSKerberos,	/* MS Kerberos via SPNEGO */
 };
 
 enum protocolEnum {
@@ -143,7 +144,6 @@ struct TCP_Server_Info {
 	struct list_head pending_mid_q;
 	void *Server_NlsInfo;	/* BB - placeholder for future NLS info */
 	unsigned short server_codepage;	/* codepage for the server */
-	unsigned long ip_address;	/* IP addr for the server if known */
 	enum protocolEnum protocolType;
 	char versionMajor;
 	char versionMinor;
@@ -185,19 +185,15 @@ struct TCP_Server_Info {
 	struct mac_key mac_signing_key;
 	char ntlmv2_hash[16];
 	unsigned long lstrp; /* when we got last response from this server */
-};
-
-/*
- * The following is our shortcut to user information.  We surface the uid,
- * and name. We always get the password on the fly in case it
- * has changed. We also hang a list of sessions owned by this user off here.
- */
-struct cifsUidInfo {
-	struct list_head userList;
-	struct list_head sessionList; /* SMB sessions for this user */
-	uid_t linux_uid;
-	char user[MAX_USERNAME_SIZE + 1];	/* ascii name of user */
-	/* BB may need ptr or callback for PAM or WinBind info */
+	u16 dialect; /* dialect index that server chose */
+	/* extended security flavors that server supports */
+	bool sec_kerberos;	/* supports plain Kerberos */
+	bool sec_mskerberos;	/* supports legacy MS Kerberos */
+	bool sec_kerberosu2u;	/* supports U2U Kerberos */
+	bool sec_ntlmssp;	/* supports NTLMSSP */
+#ifdef CONFIG_CIFS_FSCACHE
+	struct fscache_cookie *fscache; /* client index cache cookie */
+#endif
 };
 
 /*
@@ -207,9 +203,6 @@ struct cifsSesInfo {
 	struct list_head smb_ses_list;
 	struct list_head tcon_list;
 	struct mutex session_mutex;
-#if 0
-	struct cifsUidInfo *uidInfo;	/* pointer to user info */
-#endif
 	struct TCP_Server_Info *server;	/* pointer to server info */
 	int ses_count;		/* reference counter */
 	enum statusEnum status;
@@ -221,7 +214,8 @@ struct cifsSesInfo {
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
 	int Suid;		/* remote smb uid */
-	uid_t linux_uid;	/* local Linux uid */
+	uid_t linux_uid;	/* overriding owner of files on the mount */
+	uid_t cred_uid;		/* owner of credentials */
 	int capabilities;
 	char serverName[SERVER_NAME_LEN_WITH_NULL * 2];	/* BB make bigger for
				TCP names - will ipv6 and sctp addresses fit? */
@@ -306,6 +300,10 @@ struct cifsTconInfo {
 	bool local_lease:1; /* check leases (only) on local system not remote */
 	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool need_reconnect:1; /* connection reset, tid now invalid */
+#ifdef CONFIG_CIFS_FSCACHE
+	u64 resource_id;		/* server resource id */
+	struct fscache_cookie *fscache;	/* cookie for share */
+#endif
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
@@ -358,7 +356,7 @@ struct cifsFileInfo {
 	atomic_t count;		/* reference count */
 	struct mutex fh_mutex;	/* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
-	struct slow_work oplock_break; /* slow_work job for oplock breaks */
+	struct work_struct oplock_break; /* work for oplock breaks */
 };
 
 /* Take a reference on the file private data */
@@ -393,6 +391,9 @@ struct cifsInodeInfo {
 	bool invalid_mapping:1;	/* pagecache is invalid */
 	u64 server_eof;		/* current file size on server */
 	u64 uniqueid;		/* server inode number */
+#ifdef CONFIG_CIFS_FSCACHE
+	struct fscache_cookie *fscache;
+#endif
 	struct inode vfs_inode;
 };
 
@@ -718,7 +719,7 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
 GLOBAL_EXTERN unsigned int oplockEnabled;
 GLOBAL_EXTERN unsigned int experimEnabled;
 GLOBAL_EXTERN unsigned int lookupCacheEnabled;
-GLOBAL_EXTERN unsigned int extended_security;	/* if on, session setup sent
+GLOBAL_EXTERN unsigned int global_secflags;	/* if on, session setup sent
				with more secure ntlmssp2 challenge/resp */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;	/* enable smb packet signing */
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
@@ -727,4 +728,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;	/* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending;	/* MAX requests at once to server*/
 
+void cifs_oplock_break(struct work_struct *work);
+void cifs_oplock_break_get(struct cifsFileInfo *cfile);
+void cifs_oplock_break_put(struct cifsFileInfo *cfile);
+
 extern const struct slow_work_ops cifs_oplock_break_ops;
+
+#endif /* _CIFS_GLOB_H */
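
The cifsglob.h hunks above make two structural moves: the oplock-break job in cifsFileInfo switches from the removed slow-work API to a plain work_struct, and TCP_Server_Info now records the negotiated dialect plus the security flavors the server advertises. A minimal sketch of the slow-work-to-workqueue conversion pattern follows; it is illustrative only, and the struct and function names are made up, not from this tree:

#include <linux/workqueue.h>

/* sketch: an object whose oplock break runs from the shared workqueue */
struct example_file {
    struct work_struct oplock_break;    /* was: struct slow_work */
};

static void example_oplock_break(struct work_struct *work)
{
    struct example_file *file =
        container_of(work, struct example_file, oplock_break);
    /* handle the break for "file" here, then drop the reference taken
       when the work was queued (cf. cifs_oplock_break_put() above) */
}

static void example_setup(struct example_file *file)
{
    /* was: slow_work_init(&file->oplock_break, &example_ops); */
    INIT_WORK(&file->oplock_break, example_oplock_break);
}

static void example_queue_break(struct example_file *file)
{
    /* was: slow_work_enqueue(&file->oplock_break); */
    schedule_work(&file->oplock_break);
}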
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 39e47f46dea5..1d60c655e3e0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,20 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
 			unsigned int /* length */);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
-#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
-#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
+#define GetXid()						\
+({								\
+	int __xid = (int)_GetXid();				\
+	cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d",	\
+	     __func__, __xid, current_fsuid());			\
+	__xid;							\
+})
+
+#define FreeXid(curr_xid)					\
+do {								\
+	_FreeXid(curr_xid);					\
+	cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d",	\
+	     __func__, curr_xid, (int)rc);			\
+} while (0)
 extern char *build_path_from_dentry(struct dentry *);
 extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
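
The GetXid()/FreeXid() rework above replaces an unhygienic multi-statement macro with a GNU statement expression that actually yields the xid value, and wraps the teardown in do { } while (0) so it behaves as a single statement. (FreeXid() still reads rc from the caller's scope, which is why it remains a macro at all.) A small userspace sketch of the two idioms, using stand-in functions rather than the kernel's _GetXid()/_FreeXid(); compiles with gcc, since statement expressions are a GNU extension:

#include <stdio.h>

static unsigned int _get_xid(void) { static unsigned int x; return ++x; }
static void _free_xid(unsigned int xid) { (void)xid; }

/* statement expression: the whole ({ ... }) evaluates to __xid, so the
   macro can sit on the right-hand side of an assignment */
#define get_xid()						\
({								\
    int __xid = (int)_get_xid();				\
    printf("enter %s as xid %d\n", __func__, __xid);		\
    __xid;							\
})

/* do/while(0): expands to exactly one statement, so it is safe in an
   unbraced if/else */
#define free_xid(curr_xid)					\
do {								\
    _free_xid(curr_xid);					\
    printf("leave %s (xid = %d)\n", __func__, curr_xid);	\
} while (0)

int main(void)
{
    int xid = get_xid();    /* the old macro could not be assigned */
    if (xid)
        free_xid(xid);      /* safe even without braces */
    return 0;
}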
@@ -73,8 +85,11 @@ extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
-			enum securityEnum *secType);
-extern int cifs_convert_address(char *src, void *dst);
+			struct TCP_Server_Info *server);
+extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
+extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
+extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
+				const unsigned short int port);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
 			const struct cifsTconInfo *, int /* length of
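
cifs_convert_address() above now takes the destination sockaddr first along with an explicit source length, and gains cifs_set_port() and cifs_fill_sockaddr() (convert plus set-port in one call). A rough userspace analog of that convert-then-set-port split is sketched below, using inet_pton() where the kernel code has its own parsers; fill_sockaddr here is a hypothetical stand-in, not the kernel helper:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>

/* dst must point at storage large enough for sockaddr_in6,
   e.g. a struct sockaddr_storage */
static int fill_sockaddr(struct sockaddr *dst, const char *src,
                         unsigned short port)
{
    struct sockaddr_in *s4 = (struct sockaddr_in *)dst;
    struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)dst;

    memset(dst, 0, sizeof(struct sockaddr_in6));
    /* convert step: try IPv4 first, then IPv6 */
    if (inet_pton(AF_INET, src, &s4->sin_addr) == 1) {
        s4->sin_family = AF_INET;
        s4->sin_port = htons(port);     /* set-port step */
        return 1;
    }
    if (inet_pton(AF_INET6, src, &s6->sin6_addr) == 1) {
        s6->sin6_family = AF_INET6;
        s6->sin6_port = htons(port);
        return 1;
    }
    return 0;                           /* unrecognized address string */
}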
@@ -83,7 +98,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void **request_buf);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
-			const int stage,
 			const struct nls_table *nls_cp);
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -95,8 +109,10 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 			__u16 fileHandle, struct file *file,
 			struct vfsmount *mnt, unsigned int oflags);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
-			struct vfsmount *mnt, int mode, int oflags,
-			__u32 *poplock, __u16 *pnetfid, int xid);
+			struct super_block *sb,
+			int mode, int oflags,
+			__u32 *poplock, __u16 *pnetfid, int xid);
+void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
 			FILE_UNIX_BASIC_INFO *info,
 			struct cifs_sb_info *cifs_sb);
@@ -125,7 +141,9 @@ extern void cifs_dfs_release_automount_timer(void);
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
 
-extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
+extern int cifs_negotiate_protocol(unsigned int xid,
+			struct cifsSesInfo *ses);
+extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 			struct nls_table *nls_info);
 extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
 
@@ -348,8 +366,6 @@ extern int cifs_verify_signature(struct smb_hdr *,
 			__u32 expected_sequence_number);
 extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
 			const char *pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *,
-			const struct nls_table *);
 extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
 extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
 			const struct nls_table *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5d3f29fef532..7e83b356cc9e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifssmb.c
  *
- *   Copyright (C) International Business Machines Corp., 2002,2009
+ *   Copyright (C) International Business Machines Corp., 2002,2010
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   Contains the routines for constructing the SMB PDUs themselves
@@ -130,8 +130,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		if (smb_command != SMB_COM_WRITE_ANDX &&
 		    smb_command != SMB_COM_OPEN_ANDX &&
 		    smb_command != SMB_COM_TREE_DISCONNECT) {
-			cFYI(1, ("can not send cmd %d while umounting",
-				smb_command));
+			cFYI(1, "can not send cmd %d while umounting",
+				smb_command);
 			return -ENODEV;
 		}
 	}
@@ -157,7 +157,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * back on-line
 	 */
 	if (!tcon->retry || ses->status == CifsExiting) {
-		cFYI(1, ("gave up waiting on reconnect in smb_init"));
+		cFYI(1, "gave up waiting on reconnect in smb_init");
 		return -EHOSTDOWN;
 	}
 }
@@ -172,7 +172,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * reconnect the same SMB session
 	 */
 	mutex_lock(&ses->session_mutex);
-	if (ses->need_reconnect)
+	rc = cifs_negotiate_protocol(0, ses);
+	if (rc == 0 && ses->need_reconnect)
 		rc = cifs_setup_session(0, ses, nls_codepage);
 
 	/* do we need to reconnect tcon? */
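
The hunk above changes session recovery from "set up the session if flagged" to "always renegotiate the protocol first, then set up the session only if still needed"; together with the tree-reconnect code in the next hunk this gives a fixed three-step order under ses->session_mutex. A stub-function model of that control flow, with placeholder names rather than the kernel functions:

#include <stdbool.h>

static bool need_session = true, need_tcon = true;

static int negotiate(void)     { return 0; } /* cifs_negotiate_protocol() */
static int setup_session(void) { need_session = false; return 0; }
static int tree_connect(void)  { need_tcon = false; return 0; }

static int reconnect(void)
{
    int rc;

    /* the real code holds ses->session_mutex across all three steps */
    rc = negotiate();                  /* 1: renegotiate the dialect    */
    if (rc == 0 && need_session)
        rc = setup_session();          /* 2: session setup if it died   */
    if (rc == 0 && need_tcon)
        rc = tree_connect();           /* 3: tree connect if tid is bad */
    return rc;
}

int main(void) { return reconnect(); }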
@@ -184,7 +185,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	mark_open_files_invalid(tcon);
 	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
 	mutex_unlock(&ses->session_mutex);
-	cFYI(1, ("reconnect tcon rc = %d", rc));
+	cFYI(1, "reconnect tcon rc = %d", rc);
 
 	if (rc)
 		goto out;
@@ -231,7 +232,7 @@ static int
 small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		void **request_buf)
 {
-	int rc = 0;
+	int rc;
 
 	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
@@ -249,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	if (tcon != NULL)
 		cifs_stats_inc(&tcon->num_smbs_sent);
 
-	return rc;
+	return 0;
 }
 
 int
@@ -280,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
 
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
-smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-	 void **request_buf /* returned */ ,
-	 void **response_buf /* returned */ )
+__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+	   void **request_buf, void **response_buf)
 {
-	int rc = 0;
-
-	rc = cifs_reconnect_tcon(tcon, smb_command);
-	if (rc)
-		return rc;
-
 	*request_buf = cifs_buf_get();
 	if (*request_buf == NULL) {
 		/* BB should we add a retry in here if not a writepage? */
@@ -308,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	if (tcon != NULL)
 		cifs_stats_inc(&tcon->num_smbs_sent);
 
-	return rc;
+	return 0;
+}
+
+/* If the return code is zero, this function must fill in request_buf pointer */
+static int
+smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+	 void **request_buf, void **response_buf)
+{
+	int rc;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
+	if (rc)
+		return rc;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
+}
+
+static int
+smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
+			void **request_buf, void **response_buf)
+{
+	if (tcon->ses->need_reconnect || tcon->need_reconnect)
+		return -EHOSTDOWN;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
 }
 
 static int validate_t2(struct smb_t2_rsp *pSMB)
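
The refactor above leaves buffer allocation in __smb_init() and gives cifssmb.c two entry points: smb_init(), which first tries to reconnect a dead session/tcon, and smb_init_no_reconnect(), which fails fast with -EHOSTDOWN instead. Roughly, a caller in this file that must not block on reconnection would look like the sketch below; example_query() and its error handling are hypothetical, not code from this tree:

static int example_query(struct cifsTconInfo *tcon)
{
    void *request_buf, *response_buf;
    int rc;

    rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
                               &request_buf, &response_buf);
    if (rc)
        return rc;  /* -EHOSTDOWN if the session or tcon is down:
                       report immediately instead of retrying */

    /* fill in and send the request as usual from here */
    return rc;
}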
@@ -355,7 +373,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	struct TCP_Server_Info *server;
 	u16 count;
 	unsigned int secFlags;
-	u16 dialect;
 
 	if (ses->server)
 		server = ses->server;
@@ -372,9 +389,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
 		secFlags = ses->overrideSecFlg;	/* BB FIXME fix sign flags? */
 	else /* if override flags set only sign/seal OR them with global auth */
-		secFlags = extended_security | ses->overrideSecFlg;
+		secFlags = global_secflags | ses->overrideSecFlg;
 
-	cFYI(1, ("secFlags 0x%x", secFlags));
+	cFYI(1, "secFlags 0x%x", secFlags);
 
 	pSMB->hdr.Mid = GetNextMid(server);
 	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
@@ -382,14 +399,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
-		cFYI(1, ("Kerberos only mechanism, enable extended security"));
+		cFYI(1, "Kerberos only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
-		cFYI(1, ("NTLMSSP only mechanism, enable extended security"));
+		cFYI(1, "NTLMSSP only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 #endif
@@ -408,10 +425,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (rc != 0)
 		goto neg_err_exit;
 
-	dialect = le16_to_cpu(pSMBr->DialectIndex);
-	cFYI(1, ("Dialect: %d", dialect));
+	server->dialect = le16_to_cpu(pSMBr->DialectIndex);
+	cFYI(1, "Dialect: %d", server->dialect);
 	/* Check wct = 1 error case */
-	if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
+	if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
 		small wct just comes when dialect index is -1 indicating we
 		could not negotiate a common dialect */
@@ -419,8 +436,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		goto neg_err_exit;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	} else if ((pSMBr->hdr.WordCount == 13)
-			&& ((dialect == LANMAN_PROT)
-			   || (dialect == LANMAN2_PROT))) {
+			&& ((server->dialect == LANMAN_PROT)
+			   || (server->dialect == LANMAN2_PROT))) {
 		__s16 tmp;
 		struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
 
@@ -428,8 +445,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		    (secFlags & CIFSSEC_MAY_PLNTXT))
 			server->secType = LANMAN;
 		else {
-			cERROR(1, ("mount failed weak security disabled"
-				   " in /proc/fs/cifs/SecurityFlags"));
+			cERROR(1, "mount failed weak security disabled"
+				   " in /proc/fs/cifs/SecurityFlags");
 			rc = -EOPNOTSUPP;
 			goto neg_err_exit;
 		}
@@ -462,9 +479,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		utc = CURRENT_TIME;
 		ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
 				    rsp->SrvTime.Time, 0);
-		cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
+		cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
 			(int)ts.tv_sec, (int)utc.tv_sec,
-			(int)(utc.tv_sec - ts.tv_sec)));
+			(int)(utc.tv_sec - ts.tv_sec));
 		val = (int)(utc.tv_sec - ts.tv_sec);
 		seconds = abs(val);
 		result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
@@ -478,7 +495,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->timeAdj = (int)tmp;
 			server->timeAdj *= 60; /* also in seconds */
 		}
-		cFYI(1, ("server->timeAdj: %d seconds", server->timeAdj));
+		cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
 
 
 		/* BB get server time for time conversions and add
@@ -493,14 +510,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			goto neg_err_exit;
 		}
 
-		cFYI(1, ("LANMAN negotiated"));
+		cFYI(1, "LANMAN negotiated");
 		/* we will not end up setting signing flags - as no signing
 		was in LANMAN and server did not return the flags on */
 		goto signing_check;
 #else /* weak security disabled */
 	} else if (pSMBr->hdr.WordCount == 13) {
-		cERROR(1, ("mount failed, cifs module not built "
-			  "with CIFS_WEAK_PW_HASH support"));
+		cERROR(1, "mount failed, cifs module not built "
+			  "with CIFS_WEAK_PW_HASH support");
 		rc = -EOPNOTSUPP;
 #endif /* WEAK_PW_HASH */
 		goto neg_err_exit;
@@ -512,14 +529,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	/* else wct == 17 NTLM */
 	server->secMode = pSMBr->SecurityMode;
 	if ((server->secMode & SECMODE_USER) == 0)
-		cFYI(1, ("share mode security"));
+		cFYI(1, "share mode security");
 
 	if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
 #endif /* CIFS_WEAK_PW_HASH */
-			cERROR(1, ("Server requests plain text password"
-				  " but client support disabled"));
+			cERROR(1, "Server requests plain text password"
+				  " but client support disabled");
 
 	if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
 		server->secType = NTLMv2;
@@ -539,7 +556,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 #endif */
 	else {
 		rc = -EOPNOTSUPP;
-		cERROR(1, ("Invalid security type"));
+		cERROR(1, "Invalid security type");
 		goto neg_err_exit;
 	}
 	/* else ... any others ...? */
@@ -551,7 +568,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
-	cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
+	cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -582,7 +599,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
-				cFYI(1, ("server UID changed"));
+				cFYI(1, "server UID changed");
 				memcpy(server->server_GUID,
 				       pSMBr->u.extended_response.GUID,
 				       16);
@@ -597,13 +614,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->secType = RawNTLMSSP;
 		} else {
 			rc = decode_negTokenInit(pSMBr->u.extended_response.
-						 SecurityBlob,
-						 count - 16,
-						 &server->secType);
+						 SecurityBlob, count - 16,
+						 server);
 			if (rc == 1)
 				rc = 0;
 			else
 				rc = -EINVAL;
+
+			if (server->sec_kerberos || server->sec_mskerberos)
+				server->secType = Kerberos;
+			else if (server->sec_ntlmssp)
+				server->secType = RawNTLMSSP;
+			else
+				rc = -EOPNOTSUPP;
 		}
 	} else
 		server->capabilities &= ~CAP_EXTENDED_SECURITY;
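
With the change above, decode_negTokenInit() no longer picks the secType itself; it records which SPNEGO mechanisms the server's negTokenInit advertised in the new TCP_Server_Info booleans, and CIFSSMBNegotiate() chooses afterwards, preferring Kerberos (either OID) over raw NTLMSSP. That selection, lifted into a standalone sketch whose field names mirror the cifsglob.h hunk earlier in this diff (the enum and struct here are simplified stand-ins):

#include <stdbool.h>
#include <errno.h>

enum sectype { KERBEROS, RAW_NTLMSSP };

struct flavors {
    bool sec_kerberos;      /* plain Kerberos OID seen       */
    bool sec_mskerberos;    /* legacy MS Kerberos OID seen   */
    bool sec_ntlmssp;       /* NTLMSSP OID seen              */
};

static int pick_sectype(const struct flavors *f, enum sectype *out)
{
    if (f->sec_kerberos || f->sec_mskerberos) {
        *out = KERBEROS;        /* either OID selects Kerberos */
        return 0;
    }
    if (f->sec_ntlmssp) {
        *out = RAW_NTLMSSP;
        return 0;
    }
    return -EOPNOTSUPP;         /* no mechanism we can use */
}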
@@ -614,22 +637,21 @@ signing_check:
 	if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
 		/* MUST_SIGN already includes the MAY_SIGN FLAG
 		   so if this is zero it means that signing is disabled */
-		cFYI(1, ("Signing disabled"));
+		cFYI(1, "Signing disabled");
 		if (server->secMode & SECMODE_SIGN_REQUIRED) {
-			cERROR(1, ("Server requires "
+			cERROR(1, "Server requires "
 				   "packet signing to be enabled in "
-				   "/proc/fs/cifs/SecurityFlags."));
+				   "/proc/fs/cifs/SecurityFlags.");
 			rc = -EOPNOTSUPP;
 		}
 		server->secMode &=
 			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
 	} else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
 		/* signing required */
-		cFYI(1, ("Must sign - secFlags 0x%x", secFlags));
+		cFYI(1, "Must sign - secFlags 0x%x", secFlags);
 		if ((server->secMode &
 			(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
-			cERROR(1,
-				("signing required but server lacks support"));
+			cERROR(1, "signing required but server lacks support");
 			rc = -EOPNOTSUPP;
 		} else
 			server->secMode |= SECMODE_SIGN_REQUIRED;
@@ -643,7 +665,7 @@ signing_check:
 neg_err_exit:
 	cifs_buf_release(pSMB);
 
-	cFYI(1, ("negprot rc %d", rc));
+	cFYI(1, "negprot rc %d", rc);
 	return rc;
 }
 
@@ -653,7 +675,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	struct smb_hdr *smb_buffer;
 	int rc = 0;
 
-	cFYI(1, ("In tree disconnect"));
+	cFYI(1, "In tree disconnect");
 
 	/* BB: do we need to check this? These should never be NULL. */
 	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -675,7 +697,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
 	if (rc)
-		cFYI(1, ("Tree disconnect failed %d", rc));
+		cFYI(1, "Tree disconnect failed %d", rc);
 
 	/* No need to return error on this operation if tid invalidated and
 	   closed on server already e.g. due to tcp session crashing */
@@ -691,7 +713,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	LOGOFF_ANDX_REQ *pSMB;
 	int rc = 0;
 
-	cFYI(1, ("In SMBLogoff for session disconnect"));
+	cFYI(1, "In SMBLogoff for session disconnect");
 
 	/*
	 * BB: do we need to check validity of ses and server? They should
@@ -744,7 +766,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In POSIX delete"));
+	cFYI(1, "In POSIX delete");
 PsxDelete:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -796,7 +818,7 @@ PsxDelete:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Posix delete returned %d", rc));
+		cFYI(1, "Posix delete returned %d", rc);
 	cifs_buf_release(pSMB);
 
 	cifs_stats_inc(&tcon->num_deletes);
@@ -843,7 +865,7 @@ DelFileRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_deletes);
 	if (rc)
-		cFYI(1, ("Error in RMFile = %d", rc));
+		cFYI(1, "Error in RMFile = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -862,7 +884,7 @@ CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In CIFSSMBRmDir"));
+	cFYI(1, "In CIFSSMBRmDir");
 RmDirRetry:
 	rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -887,7 +909,7 @@ RmDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_rmdirs);
 	if (rc)
-		cFYI(1, ("Error in RMDir = %d", rc));
+		cFYI(1, "Error in RMDir = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -905,7 +927,7 @@ CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In CIFSSMBMkDir"));
+	cFYI(1, "In CIFSSMBMkDir");
MkDirRetry:
 	rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -930,7 +952,7 @@ MkDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_mkdirs);
 	if (rc)
-		cFYI(1, ("Error in Mkdir = %d", rc));
+		cFYI(1, "Error in Mkdir = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -953,7 +975,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
 	OPEN_PSX_REQ *pdata;
 	OPEN_PSX_RSP *psx_rsp;
 
-	cFYI(1, ("In POSIX Create"));
+	cFYI(1, "In POSIX Create");
PsxCreat:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1007,11 +1029,11 @@ PsxCreat:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Posix create returned %d", rc));
+		cFYI(1, "Posix create returned %d", rc);
 		goto psx_create_err;
 	}
 
-	cFYI(1, ("copying inode info"));
+	cFYI(1, "copying inode info");
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 	if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
@@ -1033,11 +1055,11 @@ PsxCreat:
 	/* check to make sure response data is there */
 	if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
 		pRetData->Type = cpu_to_le32(-1); /* unknown */
-		cFYI(DBG2, ("unknown type"));
+		cFYI(DBG2, "unknown type");
 	} else {
 		if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
 					+ sizeof(FILE_UNIX_BASIC_INFO)) {
-			cERROR(1, ("Open response data too small"));
+			cERROR(1, "Open response data too small");
 			pRetData->Type = cpu_to_le32(-1);
 			goto psx_create_err;
 		}
@@ -1084,7 +1106,7 @@ static __u16 convert_disposition(int disposition)
 		ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
 		break;
 	default:
-		cFYI(1, ("unknown disposition %d", disposition));
+		cFYI(1, "unknown disposition %d", disposition);
 		ofun = SMBOPEN_OAPPEND; /* regular open */
 	}
 	return ofun;
@@ -1175,7 +1197,7 @@ OldOpenRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
-		cFYI(1, ("Error in Open = %d", rc));
+		cFYI(1, "Error in Open = %d", rc);
 	} else {
 	/* BB verify if wct == 15 */
 
@@ -1288,7 +1310,7 @@ openRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
-		cFYI(1, ("Error in Open = %d", rc));
+		cFYI(1, "Error in Open = %d", rc);
 	} else {
 		*pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
 		*netfid = pSMBr->Fid; /* cifs fid stays in le */
@@ -1326,7 +1348,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	int resp_buf_type = 0;
 	struct kvec iov[1];
 
-	cFYI(1, ("Reading %d bytes on fid %d", count, netfid));
+	cFYI(1, "Reading %d bytes on fid %d", count, netfid);
 	if (tcon->ses->capabilities & CAP_LARGE_FILES)
 		wct = 12;
 	else {
@@ -1371,7 +1393,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	cifs_stats_inc(&tcon->num_reads);
 	pSMBr = (READ_RSP *)iov[0].iov_base;
 	if (rc) {
-		cERROR(1, ("Send error in read = %d", rc));
+		cERROR(1, "Send error in read = %d", rc);
 	} else {
 		int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
 		data_length = data_length << 16;
@@ -1381,15 +1403,15 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 		/*check that DataLength would not go beyond end of SMB */
 		if ((data_length > CIFSMaxBufSize)
 				|| (data_length > count)) {
-			cFYI(1, ("bad length %d for count %d",
-				 data_length, count));
+			cFYI(1, "bad length %d for count %d",
+				 data_length, count);
 			rc = -EIO;
 			*nbytes = 0;
 		} else {
 			pReadData = (char *) (&pSMBr->hdr.Protocol) +
 					le16_to_cpu(pSMBr->DataOffset);
 /*			if (rc = copy_to_user(buf, pReadData, data_length)) {
-				cERROR(1,("Faulting on read rc = %d",rc));
+				cERROR(1, "Faulting on read rc = %d",rc);
 				rc = -EFAULT;
 			}*/ /* can not use copy_to_user when using page cache*/
 			if (*buf)
@@ -1433,7 +1455,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 
 	*nbytes = 0;
 
-	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
+	/* cFYI(1, "write at %lld %d bytes", offset, count);*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
 
@@ -1514,7 +1536,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
-		cFYI(1, ("Send error in write = %d", rc));
+		cFYI(1, "Send error in write = %d", rc);
 	} else {
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
@@ -1551,7 +1573,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 
 	*nbytes = 0;
 
-	cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
+	cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
 
 	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
 		wct = 14;
@@ -1606,7 +1628,7 @@
 			  long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
-		cFYI(1, ("Send error Write2 = %d", rc));
+		cFYI(1, "Send error Write2 = %d", rc);
 	} else if (resp_buf_type == 0) {
 		/* presumably this can not happen, but best to be safe */
 		rc = -EIO;
@@ -1651,7 +1673,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	int timeout = 0;
 	__u16 count;
 
-	cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock));
+	cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
 	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -1699,7 +1721,7 @@
 	}
 	cifs_stats_inc(&tcon->num_locks);
 	if (rc)
-		cFYI(1, ("Send error in Lock = %d", rc));
+		cFYI(1, "Send error in Lock = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	since file handle passed in no longer valid */
@@ -1722,7 +1744,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, param_offset, offset, byte_count, count;
 	struct kvec iov[1];
 
-	cFYI(1, ("Posix Lock"));
+	cFYI(1, "Posix Lock");
 
 	if (pLockData == NULL)
 		return -EINVAL;
@@ -1792,7 +1814,7 @@
 	}
 
 	if (rc) {
-		cFYI(1, ("Send error in Posix Lock = %d", rc));
+		cFYI(1, "Send error in Posix Lock = %d", rc);
 	} else if (get_flag) {
 		/* lock structure can be returned on get */
 		__u16 data_offset;
@@ -1849,7 +1871,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	CLOSE_REQ *pSMB = NULL;
-	cFYI(1, ("In CIFSSMBClose"));
+	cFYI(1, "In CIFSSMBClose");
 
 /* do not retry on dead session on close */
 	rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1866,7 +1888,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	if (rc) {
 		if (rc != -EINTR) {
 			/* EINTR is expected when user ctl-c to kill app */
-			cERROR(1, ("Send error in Close = %d", rc));
+			cERROR(1, "Send error in Close = %d", rc);
 		}
 	}
 
@@ -1882,7 +1904,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	FLUSH_REQ *pSMB = NULL;
-	cFYI(1, ("In CIFSSMBFlush"));
+	cFYI(1, "In CIFSSMBFlush");
 
 	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
 	if (rc)
@@ -1893,7 +1915,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	cifs_stats_inc(&tcon->num_flushes);
 	if (rc)
-		cERROR(1, ("Send error in Flush = %d", rc));
+		cERROR(1, "Send error in Flush = %d", rc);
 
 	return rc;
 }
@@ -1910,7 +1932,7 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBRename"));
+	cFYI(1, "In CIFSSMBRename");
renameRetry:
 	rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1956,7 +1978,7 @@ renameRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_renames);
 	if (rc)
-		cFYI(1, ("Send error in rename = %d", rc));
+		cFYI(1, "Send error in rename = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -1980,7 +2002,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 	int len_of_str;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, ("Rename to File by handle"));
+	cFYI(1, "Rename to File by handle");
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
 			(void **) &pSMBr);
 	if (rc)
@@ -2035,7 +2057,7 @@
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&pTcon->num_t2renames);
 	if (rc)
-		cFYI(1, ("Send error in Rename (by file handle) = %d", rc));
+		cFYI(1, "Send error in Rename (by file handle) = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2057,7 +2079,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBCopy"));
+	cFYI(1, "In CIFSSMBCopy");
copyRetry:
 	rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2102,8 +2124,8 @@ copyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in copy = %d with %d files copied",
-			rc, le16_to_cpu(pSMBr->CopyCount)));
+		cFYI(1, "Send error in copy = %d with %d files copied",
+			rc, le16_to_cpu(pSMBr->CopyCount));
 	}
 	cifs_buf_release(pSMB);
 
@@ -2127,7 +2149,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In Symlink Unix style"));
+	cFYI(1, "In Symlink Unix style");
createSymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2192,7 +2214,7 @@ createSymLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_symlinks);
 	if (rc)
-		cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc));
+		cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2216,7 +2238,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In Create Hard link Unix style"));
+	cFYI(1, "In Create Hard link Unix style");
createHardLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2278,7 +2300,7 @@ createHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_hardlinks);
 	if (rc)
-		cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc));
+		cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2299,7 +2321,7 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSCreateHardLink"));
+	cFYI(1, "In CIFSCreateHardLink");
winCreateHardLinkRetry:
 
 	rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2350,7 +2372,7 @@ winCreateHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_hardlinks);
 	if (rc)
-		cFYI(1, ("Send error in hard link (NT rename) = %d", rc));
+		cFYI(1, "Send error in hard link (NT rename) = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2373,7 +2395,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, byte_count;
 	char *data_start;
 
-	cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName));
+	cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
 
querySymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2420,7 +2442,7 @@ querySymLinkRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QuerySymLinkInfo = %d", rc));
+		cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
 	} else {
 		/* decode response */
 
@@ -2521,21 +2543,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 
 	/* should we also check that parm and data areas do not overlap? */
 	if (*ppparm > end_of_smb) {
-		cFYI(1, ("parms start after end of smb"));
+		cFYI(1, "parms start after end of smb");
 		return -EINVAL;
 	} else if (parm_count + *ppparm > end_of_smb) {
-		cFYI(1, ("parm end after end of smb"));
+		cFYI(1, "parm end after end of smb");
 		return -EINVAL;
 	} else if (*ppdata > end_of_smb) {
-		cFYI(1, ("data starts after end of smb"));
+		cFYI(1, "data starts after end of smb");
 		return -EINVAL;
 	} else if (data_count + *ppdata > end_of_smb) {
-		cFYI(1, ("data %p + count %d (%p) ends after end of smb %p start %p",
+		cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
 			*ppdata, data_count, (data_count + *ppdata),
-			end_of_smb, pSMBr));
+			end_of_smb, pSMBr);
 		return -EINVAL;
 	} else if (parm_count + data_count > pSMBr->ByteCount) {
-		cFYI(1, ("parm count and data count larger than SMB"));
+		cFYI(1, "parm count and data count larger than SMB");
 		return -EINVAL;
 	}
 	*pdatalen = data_count;
@@ -2554,7 +2576,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 	struct smb_com_transaction_ioctl_req *pSMB;
 	struct smb_com_transaction_ioctl_rsp *pSMBr;
 
-	cFYI(1, ("In Windows reparse style QueryLink for path %s", searchName));
+	cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -2583,7 +2605,7 @@
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryReparseLinkInfo = %d", rc));
+		cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
 	} else {		/* decode response */
 		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
 		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -2607,7 +2629,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			if ((reparse_buf->LinkNamesBuf +
 				reparse_buf->TargetNameOffset +
 				reparse_buf->TargetNameLen) > end_of_smb) {
-				cFYI(1, ("reparse buf beyond SMB"));
+				cFYI(1, "reparse buf beyond SMB");
 				rc = -EIO;
 				goto qreparse_out;
 			}
@@ -2628,12 +2650,12 @@
 			}
 		} else {
 			rc = -EIO;
-			cFYI(1, ("Invalid return data count on "
-				 "get reparse info ioctl"));
+			cFYI(1, "Invalid return data count on "
+				 "get reparse info ioctl");
 		}
 		symlinkinfo[buflen] = 0; /* just in case so the caller
 					does not go off the end of the buffer */
-		cFYI(1, ("readlink result - %s", symlinkinfo));
+		cFYI(1, "readlink result - %s", symlinkinfo);
 	}
 
 qreparse_out:
@@ -2656,7 +2678,7 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
 	ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
 	ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
 	ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
-/*	cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */
+/*	cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
 
 	return;
 }
@@ -2682,8 +2704,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
 		size += sizeof(struct cifs_posix_ace) * count;
 		/* check if we would go beyond end of SMB */
 		if (size_of_data_area < size) {
-			cFYI(1, ("bad CIFS POSIX ACL size %d vs. %d",
-				size_of_data_area, size));
+			cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
+				size_of_data_area, size);
 			return -EINVAL;
 		}
 	} else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -2730,7 +2752,7 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
 		cifs_ace->cifs_uid = cpu_to_le64(-1);
 	} else
 		cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
-	/*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/
+	/*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
 	return rc;
 }
 
@@ -2748,12 +2770,12 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 		return 0;
 
 	count = posix_acl_xattr_count((size_t)buflen);
-	cFYI(1, ("setting acl with %d entries from buf of length %d and "
-		"version of %d",
-		count, buflen, le32_to_cpu(local_acl->a_version)));
+	cFYI(1, "setting acl with %d entries from buf of length %d and "
+		"version of %d",
+		count, buflen, le32_to_cpu(local_acl->a_version));
 	if (le32_to_cpu(local_acl->a_version) != 2) {
-		cFYI(1, ("unknown POSIX ACL version %d",
-			le32_to_cpu(local_acl->a_version)));
+		cFYI(1, "unknown POSIX ACL version %d",
+			le32_to_cpu(local_acl->a_version));
 		return 0;
2758 } 2780 }
2759 cifs_acl->version = cpu_to_le16(1); 2781 cifs_acl->version = cpu_to_le16(1);
@@ -2762,7 +2784,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2762 else if (acl_type == ACL_TYPE_DEFAULT) 2784 else if (acl_type == ACL_TYPE_DEFAULT)
2763 cifs_acl->default_entry_count = cpu_to_le16(count); 2785 cifs_acl->default_entry_count = cpu_to_le16(count);
2764 else { 2786 else {
2765 cFYI(1, ("unknown ACL type %d", acl_type)); 2787 cFYI(1, "unknown ACL type %d", acl_type);
2766 return 0; 2788 return 0;
2767 } 2789 }
2768 for (i = 0; i < count; i++) { 2790 for (i = 0; i < count; i++) {
@@ -2795,7 +2817,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
2795 int name_len; 2817 int name_len;
2796 __u16 params, byte_count; 2818 __u16 params, byte_count;
2797 2819
2798 cFYI(1, ("In GetPosixACL (Unix) for path %s", searchName)); 2820 cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
2799 2821
2800queryAclRetry: 2822queryAclRetry:
2801 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2823 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2847,7 +2869,7 @@ queryAclRetry:
2847 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2869 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2848 cifs_stats_inc(&tcon->num_acl_get); 2870 cifs_stats_inc(&tcon->num_acl_get);
2849 if (rc) { 2871 if (rc) {
2850 cFYI(1, ("Send error in Query POSIX ACL = %d", rc)); 2872 cFYI(1, "Send error in Query POSIX ACL = %d", rc);
2851 } else { 2873 } else {
2852 /* decode response */ 2874 /* decode response */
2853 2875
@@ -2884,7 +2906,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
2884 int bytes_returned = 0; 2906 int bytes_returned = 0;
2885 __u16 params, byte_count, data_count, param_offset, offset; 2907 __u16 params, byte_count, data_count, param_offset, offset;
2886 2908
2887 cFYI(1, ("In SetPosixACL (Unix) for path %s", fileName)); 2909 cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
2888setAclRetry: 2910setAclRetry:
2889 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2911 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2890 (void **) &pSMBr); 2912 (void **) &pSMBr);
@@ -2939,7 +2961,7 @@ setAclRetry:
2939 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2961 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2940 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2962 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2941 if (rc) 2963 if (rc)
2942 cFYI(1, ("Set POSIX ACL returned %d", rc)); 2964 cFYI(1, "Set POSIX ACL returned %d", rc);
2943 2965
2944setACLerrorExit: 2966setACLerrorExit:
2945 cifs_buf_release(pSMB); 2967 cifs_buf_release(pSMB);
@@ -2959,7 +2981,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
2959 int bytes_returned; 2981 int bytes_returned;
2960 __u16 params, byte_count; 2982 __u16 params, byte_count;
2961 2983
2962 cFYI(1, ("In GetExtAttr")); 2984 cFYI(1, "In GetExtAttr");
2963 if (tcon == NULL) 2985 if (tcon == NULL)
2964 return -ENODEV; 2986 return -ENODEV;
2965 2987
@@ -2998,7 +3020,7 @@ GetExtAttrRetry:
2998 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3020 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2999 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3021 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3000 if (rc) { 3022 if (rc) {
3001 cFYI(1, ("error %d in GetExtAttr", rc)); 3023 cFYI(1, "error %d in GetExtAttr", rc);
3002 } else { 3024 } else {
3003 /* decode response */ 3025 /* decode response */
3004 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3026 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3013,7 +3035,7 @@ GetExtAttrRetry:
3013 struct file_chattr_info *pfinfo; 3035 struct file_chattr_info *pfinfo;
3014 /* BB Do we need a cast or hash here ? */ 3036 /* BB Do we need a cast or hash here ? */
3015 if (count != 16) { 3037 if (count != 16) {
3016 cFYI(1, ("Illegal size ret in GetExtAttr")); 3038 cFYI(1, "Illegal size ret in GetExtAttr");
3017 rc = -EIO; 3039 rc = -EIO;
3018 goto GetExtAttrOut; 3040 goto GetExtAttrOut;
3019 } 3041 }
@@ -3043,7 +3065,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3043 QUERY_SEC_DESC_REQ *pSMB; 3065 QUERY_SEC_DESC_REQ *pSMB;
3044 struct kvec iov[1]; 3066 struct kvec iov[1];
3045 3067
3046 cFYI(1, ("GetCifsACL")); 3068 cFYI(1, "GetCifsACL");
3047 3069
3048 *pbuflen = 0; 3070 *pbuflen = 0;
3049 *acl_inf = NULL; 3071 *acl_inf = NULL;
@@ -3068,7 +3090,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3068 CIFS_STD_OP); 3090 CIFS_STD_OP);
3069 cifs_stats_inc(&tcon->num_acl_get); 3091 cifs_stats_inc(&tcon->num_acl_get);
3070 if (rc) { 3092 if (rc) {
3071 cFYI(1, ("Send error in QuerySecDesc = %d", rc)); 3093 cFYI(1, "Send error in QuerySecDesc = %d", rc);
3072 } else { /* decode response */ 3094 } else { /* decode response */
3073 __le32 *parm; 3095 __le32 *parm;
3074 __u32 parm_len; 3096 __u32 parm_len;
@@ -3083,7 +3105,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3083 goto qsec_out; 3105 goto qsec_out;
3084 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; 3106 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
3085 3107
3086 cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf)); 3108 cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
3087 3109
3088 if (le32_to_cpu(pSMBr->ParameterCount) != 4) { 3110 if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
3089 rc = -EIO; /* bad smb */ 3111 rc = -EIO; /* bad smb */
@@ -3095,8 +3117,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3095 3117
3096 acl_len = le32_to_cpu(*parm); 3118 acl_len = le32_to_cpu(*parm);
3097 if (acl_len != *pbuflen) { 3119 if (acl_len != *pbuflen) {
3098 cERROR(1, ("acl length %d does not match %d", 3120 cERROR(1, "acl length %d does not match %d",
3099 acl_len, *pbuflen)); 3121 acl_len, *pbuflen);
3100 if (*pbuflen > acl_len) 3122 if (*pbuflen > acl_len)
3101 *pbuflen = acl_len; 3123 *pbuflen = acl_len;
3102 } 3124 }
@@ -3105,7 +3127,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3105 header followed by the smallest SID */ 3127 header followed by the smallest SID */
3106 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || 3128 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
3107 (*pbuflen >= 64 * 1024)) { 3129 (*pbuflen >= 64 * 1024)) {
3108 cERROR(1, ("bad acl length %d", *pbuflen)); 3130 cERROR(1, "bad acl length %d", *pbuflen);
3109 rc = -EINVAL; 3131 rc = -EINVAL;
3110 *pbuflen = 0; 3132 *pbuflen = 0;
3111 } else { 3133 } else {
@@ -3179,9 +3201,9 @@ setCifsAclRetry:
3179 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3201 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3180 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3202 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3181 3203
3182 cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc)); 3204 cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
3183 if (rc) 3205 if (rc)
3184 cFYI(1, ("Set CIFS ACL returned %d", rc)); 3206 cFYI(1, "Set CIFS ACL returned %d", rc);
3185 cifs_buf_release(pSMB); 3207 cifs_buf_release(pSMB);
3186 3208
3187 if (rc == -EAGAIN) 3209 if (rc == -EAGAIN)
@@ -3205,7 +3227,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
3205 int bytes_returned; 3227 int bytes_returned;
3206 int name_len; 3228 int name_len;
3207 3229
3208 cFYI(1, ("In SMBQPath path %s", searchName)); 3230 cFYI(1, "In SMBQPath path %s", searchName);
3209QInfRetry: 3231QInfRetry:
3210 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, 3232 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
3211 (void **) &pSMBr); 3233 (void **) &pSMBr);
@@ -3231,7 +3253,7 @@ QInfRetry:
3231 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3253 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3232 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3254 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3233 if (rc) { 3255 if (rc) {
3234 cFYI(1, ("Send error in QueryInfo = %d", rc)); 3256 cFYI(1, "Send error in QueryInfo = %d", rc);
3235 } else if (pFinfo) { 3257 } else if (pFinfo) {
3236 struct timespec ts; 3258 struct timespec ts;
3237 __u32 time = le32_to_cpu(pSMBr->last_write_time); 3259 __u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3305,7 +3327,7 @@ QFileInfoRetry:
3305 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3327 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3306 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3328 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3307 if (rc) { 3329 if (rc) {
3308 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3330 cFYI(1, "Send error in QPathInfo = %d", rc);
3309 } else { /* decode response */ 3331 } else { /* decode response */
3310 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3332 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3311 3333
@@ -3343,7 +3365,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
3343 int name_len; 3365 int name_len;
3344 __u16 params, byte_count; 3366 __u16 params, byte_count;
3345 3367
3346/* cFYI(1, ("In QPathInfo path %s", searchName)); */ 3368/* cFYI(1, "In QPathInfo path %s", searchName); */
3347QPathInfoRetry: 3369QPathInfoRetry:
3348 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3370 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3349 (void **) &pSMBr); 3371 (void **) &pSMBr);
@@ -3393,7 +3415,7 @@ QPathInfoRetry:
3393 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3415 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3394 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3416 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3395 if (rc) { 3417 if (rc) {
3396 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3418 cFYI(1, "Send error in QPathInfo = %d", rc);
3397 } else { /* decode response */ 3419 } else { /* decode response */
3398 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3420 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3399 3421
@@ -3473,14 +3495,14 @@ UnixQFileInfoRetry:
3473 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3495 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3474 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3496 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3475 if (rc) { 3497 if (rc) {
3476 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3498 cFYI(1, "Send error in QPathInfo = %d", rc);
3477 } else { /* decode response */ 3499 } else { /* decode response */
3478 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3500 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3479 3501
3480 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { 3502 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3481 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n" 3503 cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
3482 "Unix Extensions can be disabled on mount " 3504 "Unix Extensions can be disabled on mount "
3483 "by specifying the nosfu mount option.")); 3505 "by specifying the nosfu mount option.");
3484 rc = -EIO; /* bad smb */ 3506 rc = -EIO; /* bad smb */
3485 } else { 3507 } else {
3486 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 3508 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3512,7 +3534,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3512 int name_len; 3534 int name_len;
3513 __u16 params, byte_count; 3535 __u16 params, byte_count;
3514 3536
3515 cFYI(1, ("In QPathInfo (Unix) the path %s", searchName)); 3537 cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
3516UnixQPathInfoRetry: 3538UnixQPathInfoRetry:
3517 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3539 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3518 (void **) &pSMBr); 3540 (void **) &pSMBr);
@@ -3559,14 +3581,14 @@ UnixQPathInfoRetry:
3559 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3581 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3560 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3582 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3561 if (rc) { 3583 if (rc) {
3562 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3584 cFYI(1, "Send error in QPathInfo = %d", rc);
3563 } else { /* decode response */ 3585 } else { /* decode response */
3564 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3586 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3565 3587
3566 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { 3588 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3567 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n" 3589 cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
3568 "Unix Extensions can be disabled on mount " 3590 "Unix Extensions can be disabled on mount "
3569 "by specifying the nosfu mount option.")); 3591 "by specifying the nosfu mount option.");
3570 rc = -EIO; /* bad smb */ 3592 rc = -EIO; /* bad smb */
3571 } else { 3593 } else {
3572 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 3594 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3600,7 +3622,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
3600 int name_len; 3622 int name_len;
3601 __u16 params, byte_count; 3623 __u16 params, byte_count;
3602 3624
3603 cFYI(1, ("In FindFirst for %s", searchName)); 3625 cFYI(1, "In FindFirst for %s", searchName);
3604 3626
3605findFirstRetry: 3627findFirstRetry:
3606 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3628 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3677,7 +3699,7 @@ findFirstRetry:
3677 if (rc) {/* BB add logic to retry regular search if Unix search 3699 if (rc) {/* BB add logic to retry regular search if Unix search
3678 rejected unexpectedly by server */ 3700 rejected unexpectedly by server */
3679 /* BB Add code to handle unsupported level rc */ 3701 /* BB Add code to handle unsupported level rc */
3680 cFYI(1, ("Error in FindFirst = %d", rc)); 3702 cFYI(1, "Error in FindFirst = %d", rc);
3681 3703
3682 cifs_buf_release(pSMB); 3704 cifs_buf_release(pSMB);
3683 3705
@@ -3716,7 +3738,7 @@ findFirstRetry:
3716 lnoff = le16_to_cpu(parms->LastNameOffset); 3738 lnoff = le16_to_cpu(parms->LastNameOffset);
3717 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 3739 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3718 lnoff) { 3740 lnoff) {
3719 cERROR(1, ("ignoring corrupt resume name")); 3741 cERROR(1, "ignoring corrupt resume name");
3720 psrch_inf->last_entry = NULL; 3742 psrch_inf->last_entry = NULL;
3721 return rc; 3743 return rc;
3722 } 3744 }
@@ -3744,7 +3766,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3744 int bytes_returned, name_len; 3766 int bytes_returned, name_len;
3745 __u16 params, byte_count; 3767 __u16 params, byte_count;
3746 3768
3747 cFYI(1, ("In FindNext")); 3769 cFYI(1, "In FindNext");
3748 3770
3749 if (psrch_inf->endOfSearch) 3771 if (psrch_inf->endOfSearch)
3750 return -ENOENT; 3772 return -ENOENT;
@@ -3808,7 +3830,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3808 cifs_buf_release(pSMB); 3830 cifs_buf_release(pSMB);
3809 rc = 0; /* search probably was closed at end of search*/ 3831 rc = 0; /* search probably was closed at end of search*/
3810 } else 3832 } else
3811 cFYI(1, ("FindNext returned = %d", rc)); 3833 cFYI(1, "FindNext returned = %d", rc);
3812 } else { /* decode response */ 3834 } else { /* decode response */
3813 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3835 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3814 3836
@@ -3844,15 +3866,15 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3844 lnoff = le16_to_cpu(parms->LastNameOffset); 3866 lnoff = le16_to_cpu(parms->LastNameOffset);
3845 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 3867 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3846 lnoff) { 3868 lnoff) {
3847 cERROR(1, ("ignoring corrupt resume name")); 3869 cERROR(1, "ignoring corrupt resume name");
3848 psrch_inf->last_entry = NULL; 3870 psrch_inf->last_entry = NULL;
3849 return rc; 3871 return rc;
3850 } else 3872 } else
3851 psrch_inf->last_entry = 3873 psrch_inf->last_entry =
3852 psrch_inf->srch_entries_start + lnoff; 3874 psrch_inf->srch_entries_start + lnoff;
3853 3875
3854/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", 3876/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
3855 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ 3877 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
3856 3878
3857 /* BB fixme add unlock here */ 3879 /* BB fixme add unlock here */
3858 } 3880 }
@@ -3877,7 +3899,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
3877 int rc = 0; 3899 int rc = 0;
3878 FINDCLOSE_REQ *pSMB = NULL; 3900 FINDCLOSE_REQ *pSMB = NULL;
3879 3901
3880 cFYI(1, ("In CIFSSMBFindClose")); 3902 cFYI(1, "In CIFSSMBFindClose");
3881 rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); 3903 rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
3882 3904
3883 /* no sense returning error if session restarted 3905 /* no sense returning error if session restarted
@@ -3891,7 +3913,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
3891 pSMB->ByteCount = 0; 3913 pSMB->ByteCount = 0;
3892 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 3914 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
3893 if (rc) 3915 if (rc)
3894 cERROR(1, ("Send error in FindClose = %d", rc)); 3916 cERROR(1, "Send error in FindClose = %d", rc);
3895 3917
3896 cifs_stats_inc(&tcon->num_fclose); 3918 cifs_stats_inc(&tcon->num_fclose);
3897 3919
@@ -3914,7 +3936,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
3914 int name_len, bytes_returned; 3936 int name_len, bytes_returned;
3915 __u16 params, byte_count; 3937 __u16 params, byte_count;
3916 3938
3917 cFYI(1, ("In GetSrvInodeNum for %s", searchName)); 3939 cFYI(1, "In GetSrvInodeNum for %s", searchName);
3918 if (tcon == NULL) 3940 if (tcon == NULL)
3919 return -ENODEV; 3941 return -ENODEV;
3920 3942
@@ -3964,7 +3986,7 @@ GetInodeNumberRetry:
3964 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3986 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3965 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3987 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3966 if (rc) { 3988 if (rc) {
3967 cFYI(1, ("error %d in QueryInternalInfo", rc)); 3989 cFYI(1, "error %d in QueryInternalInfo", rc);
3968 } else { 3990 } else {
3969 /* decode response */ 3991 /* decode response */
3970 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3992 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3979,7 +4001,7 @@ GetInodeNumberRetry:
3979 struct file_internal_info *pfinfo; 4001 struct file_internal_info *pfinfo;
3980 /* BB Do we need a cast or hash here ? */ 4002 /* BB Do we need a cast or hash here ? */
3981 if (count < 8) { 4003 if (count < 8) {
3982 cFYI(1, ("Illegal size ret in QryIntrnlInf")); 4004 cFYI(1, "Illegal size ret in QryIntrnlInf");
3983 rc = -EIO; 4005 rc = -EIO;
3984 goto GetInodeNumOut; 4006 goto GetInodeNumOut;
3985 } 4007 }
@@ -4020,16 +4042,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4020 *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); 4042 *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
4021 4043
4022 if (*num_of_nodes < 1) { 4044 if (*num_of_nodes < 1) {
4023 cERROR(1, ("num_referrals: must be at least > 0," 4045 cERROR(1, "num_referrals: must be at least > 0,"
4024 "but we get num_referrals = %d\n", *num_of_nodes)); 4046 "but we get num_referrals = %d\n", *num_of_nodes);
4025 rc = -EINVAL; 4047 rc = -EINVAL;
4026 goto parse_DFS_referrals_exit; 4048 goto parse_DFS_referrals_exit;
4027 } 4049 }
4028 4050
4029 ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); 4051 ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
4030 if (ref->VersionNumber != cpu_to_le16(3)) { 4052 if (ref->VersionNumber != cpu_to_le16(3)) {
4031 cERROR(1, ("Referrals of V%d version are not supported," 4053 cERROR(1, "Referrals of V%d version are not supported,"
4032 "should be V3", le16_to_cpu(ref->VersionNumber))); 4054 "should be V3", le16_to_cpu(ref->VersionNumber));
4033 rc = -EINVAL; 4055 rc = -EINVAL;
4034 goto parse_DFS_referrals_exit; 4056 goto parse_DFS_referrals_exit;
4035 } 4057 }
@@ -4038,14 +4060,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4038 data_end = (char *)(&(pSMBr->PathConsumed)) + 4060 data_end = (char *)(&(pSMBr->PathConsumed)) +
4039 le16_to_cpu(pSMBr->t2.DataCount); 4061 le16_to_cpu(pSMBr->t2.DataCount);
4040 4062
4041 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n", 4063 cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
4042 *num_of_nodes, 4064 *num_of_nodes,
4043 le32_to_cpu(pSMBr->DFSFlags))); 4065 le32_to_cpu(pSMBr->DFSFlags));
4044 4066
4045 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * 4067 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
4046 *num_of_nodes, GFP_KERNEL); 4068 *num_of_nodes, GFP_KERNEL);
4047 if (*target_nodes == NULL) { 4069 if (*target_nodes == NULL) {
4048 cERROR(1, ("Failed to allocate buffer for target_nodes\n")); 4070 cERROR(1, "Failed to allocate buffer for target_nodes\n");
4049 rc = -ENOMEM; 4071 rc = -ENOMEM;
4050 goto parse_DFS_referrals_exit; 4072 goto parse_DFS_referrals_exit;
4051 } 4073 }
@@ -4121,7 +4143,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
4121 *num_of_nodes = 0; 4143 *num_of_nodes = 0;
4122 *target_nodes = NULL; 4144 *target_nodes = NULL;
4123 4145
4124 cFYI(1, ("In GetDFSRefer the path %s", searchName)); 4146 cFYI(1, "In GetDFSRefer the path %s", searchName);
4125 if (ses == NULL) 4147 if (ses == NULL)
4126 return -ENODEV; 4148 return -ENODEV;
4127getDFSRetry: 4149getDFSRetry:
@@ -4188,7 +4210,7 @@ getDFSRetry:
4188 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, 4210 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
4189 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4211 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4190 if (rc) { 4212 if (rc) {
4191 cFYI(1, ("Send error in GetDFSRefer = %d", rc)); 4213 cFYI(1, "Send error in GetDFSRefer = %d", rc);
4192 goto GetDFSRefExit; 4214 goto GetDFSRefExit;
4193 } 4215 }
4194 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4216 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4199,9 +4221,9 @@ getDFSRetry:
4199 goto GetDFSRefExit; 4221 goto GetDFSRefExit;
4200 } 4222 }
4201 4223
4202 cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d", 4224 cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
4203 pSMBr->ByteCount, 4225 pSMBr->ByteCount,
4204 le16_to_cpu(pSMBr->t2.DataOffset))); 4226 le16_to_cpu(pSMBr->t2.DataOffset));
4205 4227
4206 /* parse returned result into more usable form */ 4228 /* parse returned result into more usable form */
4207 rc = parse_DFS_referrals(pSMBr, num_of_nodes, 4229 rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4229,7 +4251,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
4229 int bytes_returned = 0; 4251 int bytes_returned = 0;
4230 __u16 params, byte_count; 4252 __u16 params, byte_count;
4231 4253
4232 cFYI(1, ("OldQFSInfo")); 4254 cFYI(1, "OldQFSInfo");
4233oldQFSInfoRetry: 4255oldQFSInfoRetry:
4234 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4256 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4235 (void **) &pSMBr); 4257 (void **) &pSMBr);
@@ -4262,7 +4284,7 @@ oldQFSInfoRetry:
4262 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4284 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4263 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4285 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4264 if (rc) { 4286 if (rc) {
4265 cFYI(1, ("Send error in QFSInfo = %d", rc)); 4287 cFYI(1, "Send error in QFSInfo = %d", rc);
4266 } else { /* decode response */ 4288 } else { /* decode response */
4267 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4289 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4268 4290
@@ -4270,8 +4292,8 @@ oldQFSInfoRetry:
4270 rc = -EIO; /* bad smb */ 4292 rc = -EIO; /* bad smb */
4271 else { 4293 else {
4272 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 4294 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
4273 cFYI(1, ("qfsinf resp BCC: %d Offset %d", 4295 cFYI(1, "qfsinf resp BCC: %d Offset %d",
4274 pSMBr->ByteCount, data_offset)); 4296 pSMBr->ByteCount, data_offset);
4275 4297
4276 response_data = (FILE_SYSTEM_ALLOC_INFO *) 4298 response_data = (FILE_SYSTEM_ALLOC_INFO *)
4277 (((char *) &pSMBr->hdr.Protocol) + data_offset); 4299 (((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4283,11 +4305,10 @@ oldQFSInfoRetry:
4283 le32_to_cpu(response_data->TotalAllocationUnits); 4305 le32_to_cpu(response_data->TotalAllocationUnits);
4284 FSData->f_bfree = FSData->f_bavail = 4306 FSData->f_bfree = FSData->f_bavail =
4285 le32_to_cpu(response_data->FreeAllocationUnits); 4307 le32_to_cpu(response_data->FreeAllocationUnits);
4286 cFYI(1, 4308 cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
4287 ("Blocks: %lld Free: %lld Block size %ld", 4309 (unsigned long long)FSData->f_blocks,
4288 (unsigned long long)FSData->f_blocks, 4310 (unsigned long long)FSData->f_bfree,
4289 (unsigned long long)FSData->f_bfree, 4311 FSData->f_bsize);
4290 FSData->f_bsize));
4291 } 4312 }
4292 } 4313 }
4293 cifs_buf_release(pSMB); 4314 cifs_buf_release(pSMB);
@@ -4309,7 +4330,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
4309 int bytes_returned = 0; 4330 int bytes_returned = 0;
4310 __u16 params, byte_count; 4331 __u16 params, byte_count;
4311 4332
4312 cFYI(1, ("In QFSInfo")); 4333 cFYI(1, "In QFSInfo");
4313QFSInfoRetry: 4334QFSInfoRetry:
4314 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4335 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4315 (void **) &pSMBr); 4336 (void **) &pSMBr);
@@ -4342,7 +4363,7 @@ QFSInfoRetry:
4342 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4363 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4343 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4364 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4344 if (rc) { 4365 if (rc) {
4345 cFYI(1, ("Send error in QFSInfo = %d", rc)); 4366 cFYI(1, "Send error in QFSInfo = %d", rc);
4346 } else { /* decode response */ 4367 } else { /* decode response */
4347 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4368 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4348 4369
@@ -4363,11 +4384,10 @@ QFSInfoRetry:
4363 le64_to_cpu(response_data->TotalAllocationUnits); 4384 le64_to_cpu(response_data->TotalAllocationUnits);
4364 FSData->f_bfree = FSData->f_bavail = 4385 FSData->f_bfree = FSData->f_bavail =
4365 le64_to_cpu(response_data->FreeAllocationUnits); 4386 le64_to_cpu(response_data->FreeAllocationUnits);
4366 cFYI(1, 4387 cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
4367 ("Blocks: %lld Free: %lld Block size %ld", 4388 (unsigned long long)FSData->f_blocks,
4368 (unsigned long long)FSData->f_blocks, 4389 (unsigned long long)FSData->f_bfree,
4369 (unsigned long long)FSData->f_bfree, 4390 FSData->f_bsize);
4370 FSData->f_bsize));
4371 } 4391 }
4372 } 4392 }
4373 cifs_buf_release(pSMB); 4393 cifs_buf_release(pSMB);
@@ -4389,7 +4409,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
4389 int bytes_returned = 0; 4409 int bytes_returned = 0;
4390 __u16 params, byte_count; 4410 __u16 params, byte_count;
4391 4411
4392 cFYI(1, ("In QFSAttributeInfo")); 4412 cFYI(1, "In QFSAttributeInfo");
4393QFSAttributeRetry: 4413QFSAttributeRetry:
4394 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4414 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4395 (void **) &pSMBr); 4415 (void **) &pSMBr);
@@ -4423,7 +4443,7 @@ QFSAttributeRetry:
4423 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4443 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4424 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4444 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4425 if (rc) { 4445 if (rc) {
4426 cERROR(1, ("Send error in QFSAttributeInfo = %d", rc)); 4446 cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
4427 } else { /* decode response */ 4447 } else { /* decode response */
4428 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4448 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4429 4449
@@ -4459,7 +4479,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
4459 int bytes_returned = 0; 4479 int bytes_returned = 0;
4460 __u16 params, byte_count; 4480 __u16 params, byte_count;
4461 4481
4462 cFYI(1, ("In QFSDeviceInfo")); 4482 cFYI(1, "In QFSDeviceInfo");
4463QFSDeviceRetry: 4483QFSDeviceRetry:
4464 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4484 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4465 (void **) &pSMBr); 4485 (void **) &pSMBr);
@@ -4494,7 +4514,7 @@ QFSDeviceRetry:
4494 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4514 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4495 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4515 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4496 if (rc) { 4516 if (rc) {
4497 cFYI(1, ("Send error in QFSDeviceInfo = %d", rc)); 4517 cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
4498 } else { /* decode response */ 4518 } else { /* decode response */
4499 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4519 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4500 4520
@@ -4529,10 +4549,10 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
4529 int bytes_returned = 0; 4549 int bytes_returned = 0;
4530 __u16 params, byte_count; 4550 __u16 params, byte_count;
4531 4551
4532 cFYI(1, ("In QFSUnixInfo")); 4552 cFYI(1, "In QFSUnixInfo");
4533QFSUnixRetry: 4553QFSUnixRetry:
4534 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4554 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4535 (void **) &pSMBr); 4555 (void **) &pSMB, (void **) &pSMBr);
4536 if (rc) 4556 if (rc)
4537 return rc; 4557 return rc;
4538 4558
@@ -4563,7 +4583,7 @@ QFSUnixRetry:
4563 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4583 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4564 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4584 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4565 if (rc) { 4585 if (rc) {
4566 cERROR(1, ("Send error in QFSUnixInfo = %d", rc)); 4586 cERROR(1, "Send error in QFSUnixInfo = %d", rc);
4567 } else { /* decode response */ 4587 } else { /* decode response */
4568 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4588 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4569 4589
@@ -4598,11 +4618,11 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
4598 int bytes_returned = 0; 4618 int bytes_returned = 0;
4599 __u16 params, param_offset, offset, byte_count; 4619 __u16 params, param_offset, offset, byte_count;
4600 4620
4601 cFYI(1, ("In SETFSUnixInfo")); 4621 cFYI(1, "In SETFSUnixInfo");
4602SETFSUnixRetry: 4622SETFSUnixRetry:
4603 /* BB switch to small buf init to save memory */ 4623 /* BB switch to small buf init to save memory */
4604 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4624 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4605 (void **) &pSMBr); 4625 (void **) &pSMB, (void **) &pSMBr);
4606 if (rc) 4626 if (rc)
4607 return rc; 4627 return rc;
4608 4628
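Two of the hunks above make a change that is easy to miss among the message conversions: CIFSSMBQFSUnixInfo() and CIFSSMBSetFSUnixInfo() now build their requests with smb_init_no_reconnect() instead of smb_init(). Judging purely from the call sites, the variant shares smb_init()'s signature but skips the implicit reconnect attempt, plausibly so that these calls, issued while a tcon is being set up or re-established, cannot recurse back into the reconnect path. Assumed declaration, inferred only from the arguments visible here:

/* Inferred from the call sites above; the real prototype lives elsewhere
 * in the cifs headers and may differ. */
int smb_init_no_reconnect(int smb_command, int wct,
			  struct cifsTconInfo *tcon,
			  void **request_buf, void **response_buf);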
@@ -4646,7 +4666,7 @@ SETFSUnixRetry:
4646 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4666 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4647 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4667 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4648 if (rc) { 4668 if (rc) {
4649 cERROR(1, ("Send error in SETFSUnixInfo = %d", rc)); 4669 cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
4650 } else { /* decode response */ 4670 } else { /* decode response */
4651 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4671 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4652 if (rc) 4672 if (rc)
@@ -4674,7 +4694,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
4674 int bytes_returned = 0; 4694 int bytes_returned = 0;
4675 __u16 params, byte_count; 4695 __u16 params, byte_count;
4676 4696
4677 cFYI(1, ("In QFSPosixInfo")); 4697 cFYI(1, "In QFSPosixInfo");
4678QFSPosixRetry: 4698QFSPosixRetry:
4679 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4699 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4680 (void **) &pSMBr); 4700 (void **) &pSMBr);
@@ -4708,7 +4728,7 @@ QFSPosixRetry:
4708 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4728 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4709 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4729 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4710 if (rc) { 4730 if (rc) {
4711 cFYI(1, ("Send error in QFSUnixInfo = %d", rc)); 4731 cFYI(1, "Send error in QFSUnixInfo = %d", rc);
4712 } else { /* decode response */ 4732 } else { /* decode response */
4713 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4733 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4714 4734
@@ -4768,7 +4788,7 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
4768 int bytes_returned = 0; 4788 int bytes_returned = 0;
4769 __u16 params, byte_count, data_count, param_offset, offset; 4789 __u16 params, byte_count, data_count, param_offset, offset;
4770 4790
4771 cFYI(1, ("In SetEOF")); 4791 cFYI(1, "In SetEOF");
4772SetEOFRetry: 4792SetEOFRetry:
4773 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4793 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4774 (void **) &pSMBr); 4794 (void **) &pSMBr);
@@ -4834,7 +4854,7 @@ SetEOFRetry:
4834 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4854 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4835 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4855 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4836 if (rc) 4856 if (rc)
4837 cFYI(1, ("SetPathInfo (file size) returned %d", rc)); 4857 cFYI(1, "SetPathInfo (file size) returned %d", rc);
4838 4858
4839 cifs_buf_release(pSMB); 4859 cifs_buf_release(pSMB);
4840 4860
@@ -4854,8 +4874,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4854 int rc = 0; 4874 int rc = 0;
4855 __u16 params, param_offset, offset, byte_count, count; 4875 __u16 params, param_offset, offset, byte_count, count;
4856 4876
4857 cFYI(1, ("SetFileSize (via SetFileInfo) %lld", 4877 cFYI(1, "SetFileSize (via SetFileInfo) %lld",
4858 (long long)size)); 4878 (long long)size);
4859 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 4879 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4860 4880
4861 if (rc) 4881 if (rc)
@@ -4914,9 +4934,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4914 pSMB->ByteCount = cpu_to_le16(byte_count); 4934 pSMB->ByteCount = cpu_to_le16(byte_count);
4915 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 4935 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4916 if (rc) { 4936 if (rc) {
4917 cFYI(1, 4937 cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
4918 ("Send error in SetFileInfo (SetFileSize) = %d",
4919 rc));
4920 } 4938 }
4921 4939
4922 /* Note: On -EAGAIN error only caller can retry on handle based calls 4940 /* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4940,7 +4958,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4940 int rc = 0; 4958 int rc = 0;
4941 __u16 params, param_offset, offset, byte_count, count; 4959 __u16 params, param_offset, offset, byte_count, count;
4942 4960
4943 cFYI(1, ("Set Times (via SetFileInfo)")); 4961 cFYI(1, "Set Times (via SetFileInfo)");
4944 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 4962 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4945 4963
4946 if (rc) 4964 if (rc)
@@ -4985,7 +5003,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4985 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); 5003 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
4986 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5004 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4987 if (rc) 5005 if (rc)
4988 cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc)); 5006 cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
4989 5007
4990 /* Note: On -EAGAIN error only caller can retry on handle based calls 5008 /* Note: On -EAGAIN error only caller can retry on handle based calls
4991 since file handle passed in no longer valid */ 5009 since file handle passed in no longer valid */
@@ -5002,7 +5020,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
5002 int rc = 0; 5020 int rc = 0;
5003 __u16 params, param_offset, offset, byte_count, count; 5021 __u16 params, param_offset, offset, byte_count, count;
5004 5022
5005 cFYI(1, ("Set File Disposition (via SetFileInfo)")); 5023 cFYI(1, "Set File Disposition (via SetFileInfo)");
5006 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 5024 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
5007 5025
5008 if (rc) 5026 if (rc)
@@ -5044,7 +5062,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
5044 *data_offset = delete_file ? 1 : 0; 5062 *data_offset = delete_file ? 1 : 0;
5045 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5063 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
5046 if (rc) 5064 if (rc)
5047 cFYI(1, ("Send error in SetFileDisposition = %d", rc)); 5065 cFYI(1, "Send error in SetFileDisposition = %d", rc);
5048 5066
5049 return rc; 5067 return rc;
5050} 5068}
@@ -5062,7 +5080,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
5062 char *data_offset; 5080 char *data_offset;
5063 __u16 params, param_offset, offset, byte_count, count; 5081 __u16 params, param_offset, offset, byte_count, count;
5064 5082
5065 cFYI(1, ("In SetTimes")); 5083 cFYI(1, "In SetTimes");
5066 5084
5067SetTimesRetry: 5085SetTimesRetry:
5068 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5086 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -5118,7 +5136,7 @@ SetTimesRetry:
5118 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5136 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5119 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5137 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5120 if (rc) 5138 if (rc)
5121 cFYI(1, ("SetPathInfo (times) returned %d", rc)); 5139 cFYI(1, "SetPathInfo (times) returned %d", rc);
5122 5140
5123 cifs_buf_release(pSMB); 5141 cifs_buf_release(pSMB);
5124 5142
@@ -5143,7 +5161,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
5143 int bytes_returned; 5161 int bytes_returned;
5144 int name_len; 5162 int name_len;
5145 5163
5146 cFYI(1, ("In SetAttrLegacy")); 5164 cFYI(1, "In SetAttrLegacy");
5147 5165
5148SetAttrLgcyRetry: 5166SetAttrLgcyRetry:
5149 rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, 5167 rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5169,7 +5187,7 @@ SetAttrLgcyRetry:
5169 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5187 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5170 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5188 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5171 if (rc) 5189 if (rc)
5172 cFYI(1, ("Error in LegacySetAttr = %d", rc)); 5190 cFYI(1, "Error in LegacySetAttr = %d", rc);
5173 5191
5174 cifs_buf_release(pSMB); 5192 cifs_buf_release(pSMB);
5175 5193
@@ -5231,7 +5249,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5231 int rc = 0; 5249 int rc = 0;
5232 u16 params, param_offset, offset, byte_count, count; 5250 u16 params, param_offset, offset, byte_count, count;
5233 5251
5234 cFYI(1, ("Set Unix Info (via SetFileInfo)")); 5252 cFYI(1, "Set Unix Info (via SetFileInfo)");
5235 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 5253 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
5236 5254
5237 if (rc) 5255 if (rc)
@@ -5276,7 +5294,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5276 5294
5277 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5295 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
5278 if (rc) 5296 if (rc)
5279 cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc)); 5297 cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
5280 5298
5281 /* Note: On -EAGAIN error only caller can retry on handle based calls 5299 /* Note: On -EAGAIN error only caller can retry on handle based calls
5282 since file handle passed in no longer valid */ 5300 since file handle passed in no longer valid */
@@ -5297,7 +5315,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
5297 FILE_UNIX_BASIC_INFO *data_offset; 5315 FILE_UNIX_BASIC_INFO *data_offset;
5298 __u16 params, param_offset, offset, count, byte_count; 5316 __u16 params, param_offset, offset, count, byte_count;
5299 5317
5300 cFYI(1, ("In SetUID/GID/Mode")); 5318 cFYI(1, "In SetUID/GID/Mode");
5301setPermsRetry: 5319setPermsRetry:
5302 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5320 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5303 (void **) &pSMBr); 5321 (void **) &pSMBr);
@@ -5353,7 +5371,7 @@ setPermsRetry:
5353 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5371 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5354 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5372 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5355 if (rc) 5373 if (rc)
5356 cFYI(1, ("SetPathInfo (perms) returned %d", rc)); 5374 cFYI(1, "SetPathInfo (perms) returned %d", rc);
5357 5375
5358 cifs_buf_release(pSMB); 5376 cifs_buf_release(pSMB);
5359 if (rc == -EAGAIN) 5377 if (rc == -EAGAIN)
@@ -5372,7 +5390,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5372 struct dir_notify_req *dnotify_req; 5390 struct dir_notify_req *dnotify_req;
5373 int bytes_returned; 5391 int bytes_returned;
5374 5392
5375 cFYI(1, ("In CIFSSMBNotify for file handle %d", (int)netfid)); 5393 cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
5376 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, 5394 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
5377 (void **) &pSMBr); 5395 (void **) &pSMBr);
5378 if (rc) 5396 if (rc)
@@ -5406,7 +5424,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5406 (struct smb_hdr *)pSMBr, &bytes_returned, 5424 (struct smb_hdr *)pSMBr, &bytes_returned,
5407 CIFS_ASYNC_OP); 5425 CIFS_ASYNC_OP);
5408 if (rc) { 5426 if (rc) {
5409 cFYI(1, ("Error in Notify = %d", rc)); 5427 cFYI(1, "Error in Notify = %d", rc);
5410 } else { 5428 } else {
5411 /* Add file to outstanding requests */ 5429 /* Add file to outstanding requests */
5412 /* BB change to kmem cache alloc */ 5430 /* BB change to kmem cache alloc */
@@ -5462,7 +5480,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
5462 char *end_of_smb; 5480 char *end_of_smb;
5463 __u16 params, byte_count, data_offset; 5481 __u16 params, byte_count, data_offset;
5464 5482
5465 cFYI(1, ("In Query All EAs path %s", searchName)); 5483 cFYI(1, "In Query All EAs path %s", searchName);
5466QAllEAsRetry: 5484QAllEAsRetry:
5467 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5485 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5468 (void **) &pSMBr); 5486 (void **) &pSMBr);
@@ -5509,7 +5527,7 @@ QAllEAsRetry:
5509 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5527 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5510 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5528 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5511 if (rc) { 5529 if (rc) {
5512 cFYI(1, ("Send error in QueryAllEAs = %d", rc)); 5530 cFYI(1, "Send error in QueryAllEAs = %d", rc);
5513 goto QAllEAsOut; 5531 goto QAllEAsOut;
5514 } 5532 }
5515 5533
@@ -5537,16 +5555,16 @@ QAllEAsRetry:
5537 (((char *) &pSMBr->hdr.Protocol) + data_offset); 5555 (((char *) &pSMBr->hdr.Protocol) + data_offset);
5538 5556
5539 list_len = le32_to_cpu(ea_response_data->list_len); 5557 list_len = le32_to_cpu(ea_response_data->list_len);
5540 cFYI(1, ("ea length %d", list_len)); 5558 cFYI(1, "ea length %d", list_len);
5541 if (list_len <= 8) { 5559 if (list_len <= 8) {
5542 cFYI(1, ("empty EA list returned from server")); 5560 cFYI(1, "empty EA list returned from server");
5543 goto QAllEAsOut; 5561 goto QAllEAsOut;
5544 } 5562 }
5545 5563
5546 /* make sure list_len doesn't go past end of SMB */ 5564 /* make sure list_len doesn't go past end of SMB */
5547 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); 5565 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5548 if ((char *)ea_response_data + list_len > end_of_smb) { 5566 if ((char *)ea_response_data + list_len > end_of_smb) {
5549 cFYI(1, ("EA list appears to go beyond SMB")); 5567 cFYI(1, "EA list appears to go beyond SMB");
5550 rc = -EIO; 5568 rc = -EIO;
5551 goto QAllEAsOut; 5569 goto QAllEAsOut;
5552 } 5570 }
@@ -5563,7 +5581,7 @@ QAllEAsRetry:
5563 temp_ptr += 4; 5581 temp_ptr += 4;
5564 /* make sure we can read name_len and value_len */ 5582 /* make sure we can read name_len and value_len */
5565 if (list_len < 0) { 5583 if (list_len < 0) {
5566 cFYI(1, ("EA entry goes beyond length of list")); 5584 cFYI(1, "EA entry goes beyond length of list");
5567 rc = -EIO; 5585 rc = -EIO;
5568 goto QAllEAsOut; 5586 goto QAllEAsOut;
5569 } 5587 }
@@ -5572,7 +5590,7 @@ QAllEAsRetry:
5572 value_len = le16_to_cpu(temp_fea->value_len); 5590 value_len = le16_to_cpu(temp_fea->value_len);
5573 list_len -= name_len + 1 + value_len; 5591 list_len -= name_len + 1 + value_len;
5574 if (list_len < 0) { 5592 if (list_len < 0) {
5575 cFYI(1, ("EA entry goes beyond length of list")); 5593 cFYI(1, "EA entry goes beyond length of list");
5576 rc = -EIO; 5594 rc = -EIO;
5577 goto QAllEAsOut; 5595 goto QAllEAsOut;
5578 } 5596 }
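The QAllEAsRetry hunks above guard a walk over the server's EA list: list_len is decremented as each entry is consumed, and a negative intermediate value means an entry claims more bytes than the SMB actually carries. A compact restatement of that walk; the wire layout assumed here (a 4-byte fixed header holding flags, name_len, and a little-endian 16-bit value_len, followed by the name, a NUL, and the value) is inferred from the decrements visible in the diff, not taken from the real structures:

/* Returns 0 if every EA entry fits inside list_len, -1 otherwise. */
static int ea_list_fits(const unsigned char *p, int list_len)
{
	while (list_len > 0) {
		unsigned char name_len;
		unsigned short value_len;

		list_len -= 4;			/* fixed per-entry header */
		if (list_len < 0)
			return -1;		/* header runs past the list */
		name_len = p[1];
		value_len = (unsigned short)(p[2] | (p[3] << 8));
		p += 4;

		list_len -= name_len + 1 + value_len;	/* name, NUL, value */
		if (list_len < 0)
			return -1;		/* entry body runs past the list */
		p += name_len + 1 + value_len;
	}
	return 0;
}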
@@ -5639,7 +5657,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
5639 int bytes_returned = 0; 5657 int bytes_returned = 0;
5640 __u16 params, param_offset, byte_count, offset, count; 5658 __u16 params, param_offset, byte_count, offset, count;
5641 5659
5642 cFYI(1, ("In SetEA")); 5660 cFYI(1, "In SetEA");
5643SetEARetry: 5661SetEARetry:
5644 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5662 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5645 (void **) &pSMBr); 5663 (void **) &pSMBr);
@@ -5721,7 +5739,7 @@ SetEARetry:
5721 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5739 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5722 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5740 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5723 if (rc) 5741 if (rc)
5724 cFYI(1, ("SetPathInfo (EA) returned %d", rc)); 5742 cFYI(1, "SetPathInfo (EA) returned %d", rc);
5725 5743
5726 cifs_buf_release(pSMB); 5744 cifs_buf_release(pSMB);
5727 5745
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d9566bf8f917..88c84a38bccb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -48,6 +48,7 @@
48#include "nterr.h" 48#include "nterr.h"
49#include "rfc1002pdu.h" 49#include "rfc1002pdu.h"
50#include "cn_cifs.h" 50#include "cn_cifs.h"
51#include "fscache.h"
51 52
52#define CIFS_PORT 445 53#define CIFS_PORT 445
53#define RFC1001_PORT 139 54#define RFC1001_PORT 139
@@ -66,6 +67,7 @@ struct smb_vol {
66 char *iocharset; /* local code page for mapping to and from Unicode */ 67 char *iocharset; /* local code page for mapping to and from Unicode */
67 char source_rfc1001_name[16]; /* netbios name of client */ 68 char source_rfc1001_name[16]; /* netbios name of client */
68 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ 69 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
70 uid_t cred_uid;
69 uid_t linux_uid; 71 uid_t linux_uid;
70 gid_t linux_gid; 72 gid_t linux_gid;
71 mode_t file_mode; 73 mode_t file_mode;
@@ -97,11 +99,13 @@ struct smb_vol {
97 bool noblocksnd:1; 99 bool noblocksnd:1;
98 bool noautotune:1; 100 bool noautotune:1;
99 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 101 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
102 bool fsc:1; /* enable fscache */
100 unsigned int rsize; 103 unsigned int rsize;
101 unsigned int wsize; 104 unsigned int wsize;
102 bool sockopt_tcp_nodelay:1; 105 bool sockopt_tcp_nodelay:1;
103 unsigned short int port; 106 unsigned short int port;
104 char *prepath; 107 char *prepath;
108 struct nls_table *local_nls;
105}; 109};
106 110
107static int ipv4_connect(struct TCP_Server_Info *server); 111static int ipv4_connect(struct TCP_Server_Info *server);
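Alongside the message conversions, the struct smb_vol hunk above grows three fields: a cred_uid, an fsc bit that pairs with the new #include "fscache.h" at the top of the file, and a struct nls_table *local_nls pointer. A small stand-alone sketch of how a mount-option walker would set the new flag; the struct, helper name, and the "fsc" option spelling are assumptions for illustration, not the actual cifs_parse_mount_options() code:

#include <stdbool.h>
#include <string.h>

struct smb_vol_sketch {
	bool fsc;		/* enable fscache, mirroring the new bitfield */
};

/* Flip vol->fsc when the token "fsc" appears among the mount options. */
static void parse_one_option(struct smb_vol_sketch *vol, const char *opt)
{
	if (strcmp(opt, "fsc") == 0)
		vol->fsc = true;
}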
@@ -135,7 +139,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
135 spin_unlock(&GlobalMid_Lock); 139 spin_unlock(&GlobalMid_Lock);
136 server->maxBuf = 0; 140 server->maxBuf = 0;
137 141
138 cFYI(1, ("Reconnecting tcp session")); 142 cFYI(1, "Reconnecting tcp session");
139 143
140 /* before reconnecting the tcp session, mark the smb session (uid) 144 /* before reconnecting the tcp session, mark the smb session (uid)
141 and the tid bad so they are not used until reconnected */ 145 and the tid bad so they are not used until reconnected */
@@ -153,12 +157,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
153 /* do not want to be sending data on a socket we are freeing */ 157 /* do not want to be sending data on a socket we are freeing */
154 mutex_lock(&server->srv_mutex); 158 mutex_lock(&server->srv_mutex);
155 if (server->ssocket) { 159 if (server->ssocket) {
156 cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state, 160 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
157 server->ssocket->flags)); 161 server->ssocket->flags);
158 kernel_sock_shutdown(server->ssocket, SHUT_WR); 162 kernel_sock_shutdown(server->ssocket, SHUT_WR);
159 cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx", 163 cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
160 server->ssocket->state, 164 server->ssocket->state,
161 server->ssocket->flags)); 165 server->ssocket->flags);
162 sock_release(server->ssocket); 166 sock_release(server->ssocket);
163 server->ssocket = NULL; 167 server->ssocket = NULL;
164 } 168 }
@@ -187,7 +191,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
187 else 191 else
188 rc = ipv4_connect(server); 192 rc = ipv4_connect(server);
189 if (rc) { 193 if (rc) {
190 cFYI(1, ("reconnect error %d", rc)); 194 cFYI(1, "reconnect error %d", rc);
191 msleep(3000); 195 msleep(3000);
192 } else { 196 } else {
193 atomic_inc(&tcpSesReconnectCount); 197 atomic_inc(&tcpSesReconnectCount);
@@ -223,7 +227,7 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
223 /* check for plausible wct, bcc and t2 data and parm sizes */ 227 /* check for plausible wct, bcc and t2 data and parm sizes */
224 /* check for parm and data offset going beyond end of smb */ 228 /* check for parm and data offset going beyond end of smb */
225 if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ 229 if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
226 cFYI(1, ("invalid transact2 word count")); 230 cFYI(1, "invalid transact2 word count");
227 return -EINVAL; 231 return -EINVAL;
228 } 232 }
229 233
@@ -237,15 +241,15 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
237 if (remaining == 0) 241 if (remaining == 0)
238 return 0; 242 return 0;
239 else if (remaining < 0) { 243 else if (remaining < 0) {
240 cFYI(1, ("total data %d smaller than data in frame %d", 244 cFYI(1, "total data %d smaller than data in frame %d",
241 total_data_size, data_in_this_rsp)); 245 total_data_size, data_in_this_rsp);
242 return -EINVAL; 246 return -EINVAL;
243 } else { 247 } else {
244 cFYI(1, ("missing %d bytes from transact2, check next response", 248 cFYI(1, "missing %d bytes from transact2, check next response",
245 remaining)); 249 remaining);
246 if (total_data_size > maxBufSize) { 250 if (total_data_size > maxBufSize) {
247 cERROR(1, ("TotalDataSize %d is over maximum buffer %d", 251 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
248 total_data_size, maxBufSize)); 252 total_data_size, maxBufSize);
249 return -EINVAL; 253 return -EINVAL;
250 } 254 }
251 return remaining; 255 return remaining;
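The check2ndT2() hunk above only converts messages, but it is a good place to restate the arithmetic those messages report on: for a multi-part transact2 response, the data still owed is TotalDataCount minus what this frame carries; zero means the response is complete, a negative value means the frame holds more than the advertised total, and a total beyond the negotiated buffer size can never be satisfied. The same checks as a self-contained function (the name and the user-space errno.h are illustrative):

#include <errno.h>

static int t2_remaining(int total_data_size, int data_in_this_rsp,
			int max_buf_size)
{
	int remaining = total_data_size - data_in_this_rsp;

	if (remaining == 0)
		return 0;		/* response is complete */
	if (remaining < 0)
		return -EINVAL;		/* frame exceeds advertised total */
	if (total_data_size > max_buf_size)
		return -EINVAL;		/* total would overflow our buffer */
	return remaining;		/* expect more secondary responses */
}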
@@ -267,7 +271,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
267 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 271 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
268 272
269 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { 273 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
270 cFYI(1, ("total data size of primary and secondary t2 differ")); 274 cFYI(1, "total data size of primary and secondary t2 differ");
271 } 275 }
272 276
273 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); 277 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
@@ -282,7 +286,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
282 286
283 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); 287 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
284 if (remaining < total_in_buf2) { 288 if (remaining < total_in_buf2) {
285 cFYI(1, ("transact2 2nd response contains too much data")); 289 cFYI(1, "transact2 2nd response contains too much data");
286 } 290 }
287 291
288 /* find end of first SMB data area */ 292 /* find end of first SMB data area */
@@ -311,7 +315,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
311 pTargetSMB->smb_buf_length = byte_count; 315 pTargetSMB->smb_buf_length = byte_count;
312 316
313 if (remaining == total_in_buf2) { 317 if (remaining == total_in_buf2) {
314 cFYI(1, ("found the last secondary response")); 318 cFYI(1, "found the last secondary response");
315 return 0; /* we are done */ 319 return 0; /* we are done */
316 } else /* more responses to go */ 320 } else /* more responses to go */
317 return 1; 321 return 1;
@@ -339,7 +343,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
339 int reconnect; 343 int reconnect;
340 344
341 current->flags |= PF_MEMALLOC; 345 current->flags |= PF_MEMALLOC;
342 cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); 346 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
343 347
344 length = atomic_inc_return(&tcpSesAllocCount); 348 length = atomic_inc_return(&tcpSesAllocCount);
345 if (length > 1) 349 if (length > 1)
@@ -353,7 +357,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
353 if (bigbuf == NULL) { 357 if (bigbuf == NULL) {
354 bigbuf = cifs_buf_get(); 358 bigbuf = cifs_buf_get();
355 if (!bigbuf) { 359 if (!bigbuf) {
356 cERROR(1, ("No memory for large SMB response")); 360 cERROR(1, "No memory for large SMB response");
357 msleep(3000); 361 msleep(3000);
358 /* retry will check if exiting */ 362 /* retry will check if exiting */
359 continue; 363 continue;
@@ -366,7 +370,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
366 if (smallbuf == NULL) { 370 if (smallbuf == NULL) {
367 smallbuf = cifs_small_buf_get(); 371 smallbuf = cifs_small_buf_get();
368 if (!smallbuf) { 372 if (!smallbuf) {
369 cERROR(1, ("No memory for SMB response")); 373 cERROR(1, "No memory for SMB response");
370 msleep(1000); 374 msleep(1000);
371 /* retry will check if exiting */ 375 /* retry will check if exiting */
372 continue; 376 continue;
@@ -391,12 +395,14 @@ incomplete_rcv:
391 if (server->tcpStatus == CifsExiting) { 395 if (server->tcpStatus == CifsExiting) {
392 break; 396 break;
393 } else if (server->tcpStatus == CifsNeedReconnect) { 397 } else if (server->tcpStatus == CifsNeedReconnect) {
394 cFYI(1, ("Reconnect after server stopped responding")); 398 cFYI(1, "Reconnect after server stopped responding");
395 cifs_reconnect(server); 399 cifs_reconnect(server);
396 cFYI(1, ("call to reconnect done")); 400 cFYI(1, "call to reconnect done");
397 csocket = server->ssocket; 401 csocket = server->ssocket;
398 continue; 402 continue;
399 } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { 403 } else if (length == -ERESTARTSYS ||
404 length == -EAGAIN ||
405 length == -EINTR) {
400 msleep(1); /* minimum sleep to prevent looping 406 msleep(1); /* minimum sleep to prevent looping
401 allowing socket to clear and app threads to set 407 allowing socket to clear and app threads to set
402 tcpStatus CifsNeedReconnect if server hung */ 408 tcpStatus CifsNeedReconnect if server hung */
@@ -410,27 +416,15 @@ incomplete_rcv:
410 } else 416 } else
411 continue; 417 continue;
412 } else if (length <= 0) { 418 } else if (length <= 0) {
413 if (server->tcpStatus == CifsNew) { 419 cFYI(1, "Reconnect after unexpected peek error %d",
414 cFYI(1, ("tcp session abend after SMBnegprot")); 420 length);
415 /* some servers kill the TCP session rather than
416 returning an SMB negprot error, in which
417 case reconnecting here is not going to help,
418 and so simply return error to mount */
419 break;
420 }
421 if (!try_to_freeze() && (length == -EINTR)) {
422 cFYI(1, ("cifsd thread killed"));
423 break;
424 }
425 cFYI(1, ("Reconnect after unexpected peek error %d",
426 length));
427 cifs_reconnect(server); 421 cifs_reconnect(server);
428 csocket = server->ssocket; 422 csocket = server->ssocket;
429 wake_up(&server->response_q); 423 wake_up(&server->response_q);
430 continue; 424 continue;
431 } else if (length < pdu_length) { 425 } else if (length < pdu_length) {
432 cFYI(1, ("requested %d bytes but only got %d bytes", 426 cFYI(1, "requested %d bytes but only got %d bytes",
433 pdu_length, length)); 427 pdu_length, length);
434 pdu_length -= length; 428 pdu_length -= length;
435 msleep(1); 429 msleep(1);
436 goto incomplete_rcv; 430 goto incomplete_rcv;
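These hunks change the error policy of the receive loop: -EINTR is now grouped with -ERESTARTSYS and -EAGAIN as a transient condition that just sleeps and retries, and the old special cases (bailing out of a CifsNew session after a failed negprot, and exiting the thread on an unfrozen -EINTR) are dropped. Expressed as a predicate, the retry rule the loop converges on looks like this; the helper name is invented for illustration, since connect.c open-codes the test:

    /* Illustrative only: connect.c open-codes this classification. */
    static bool cifs_recv_retryable(int rc)
    {
            /* signal-interrupted or would-block reads are transient;
             * sleep briefly so other threads can mark the session
             * CifsNeedReconnect, then peek the socket again */
            return rc == -ERESTARTSYS || rc == -EAGAIN || rc == -EINTR;
    }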
@@ -450,41 +444,33 @@ incomplete_rcv:
 pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
 smb_buffer->smb_buf_length = pdu_length;

- cFYI(1, ("rfc1002 length 0x%x", pdu_length+4));
+ cFYI(1, "rfc1002 length 0x%x", pdu_length+4);

 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
 continue;
 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
- cFYI(1, ("Good RFC 1002 session rsp"));
+ cFYI(1, "Good RFC 1002 session rsp");
 continue;
 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
 /* we get this from Windows 98 instead of
 an error on SMB negprot response */
- cFYI(1, ("Negative RFC1002 Session Response Error 0x%x)",
- pdu_length));
+ cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
+ pdu_length);
- if (server->tcpStatus == CifsNew) {
- /* if nack on negprot (rather than
- ret of smb negprot error) reconnecting
- not going to help, ret error to mount */
- break;
- } else {
- /* give server a second to
- clean up before reconnect attempt */
- msleep(1000);
- /* always try 445 first on reconnect
- since we get NACK on some if we ever
- connected to port 139 (the NACK is
- since we do not begin with RFC1001
- session initialize frame) */
- server->addr.sockAddr.sin_port =
- htons(CIFS_PORT);
- cifs_reconnect(server);
- csocket = server->ssocket;
- wake_up(&server->response_q);
- continue;
- }
+ /* give server a second to clean up */
+ msleep(1000);
+ /* always try 445 first on reconnect since we get NACK
+ * on some if we ever connected to port 139 (the NACK
+ * is since we do not begin with RFC1001 session
+ * initialize frame)
+ */
+ cifs_set_port((struct sockaddr *)
+ &server->addr.sockAddr, CIFS_PORT);
+ cifs_reconnect(server);
+ csocket = server->ssocket;
+ wake_up(&server->response_q);
+ continue;
 } else if (temp != (char) 0) {
- cERROR(1, ("Unknown RFC 1002 frame"));
+ cERROR(1, "Unknown RFC 1002 frame");
 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
 length);
 cifs_reconnect(server);
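The open-coded sin_port assignment is replaced here by cifs_set_port(), whose definition is not part of this diff. Assuming it simply generalizes the line it replaces to both address families, its shape would be roughly:

    /* Sketch under that assumption; the real helper is defined
     * elsewhere in this patch series. */
    static void cifs_set_port(struct sockaddr *addr, unsigned short port)
    {
            switch (addr->sa_family) {
            case AF_INET:
                    ((struct sockaddr_in *)addr)->sin_port = htons(port);
                    break;
            case AF_INET6:
                    ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
                    break;
            }
    }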
@@ -495,8 +481,8 @@ incomplete_rcv:
 /* else we have an SMB response */
 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
- cERROR(1, ("Invalid size SMB length %d pdu_length %d",
- length, pdu_length+4));
+ cERROR(1, "Invalid size SMB length %d pdu_length %d",
+ length, pdu_length+4);
 cifs_reconnect(server);
 csocket = server->ssocket;
 wake_up(&server->response_q);
@@ -518,8 +504,7 @@ incomplete_rcv:
 total_read += length) {
 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
 pdu_length - total_read, 0);
- if ((server->tcpStatus == CifsExiting) ||
- (length == -EINTR)) {
+ if (server->tcpStatus == CifsExiting) {
 /* then will exit */
 reconnect = 2;
 break;
@@ -530,8 +515,9 @@ incomplete_rcv:
 /* Now we will reread sock */
 reconnect = 1;
 break;
- } else if ((length == -ERESTARTSYS) ||
- (length == -EAGAIN)) {
+ } else if (length == -ERESTARTSYS ||
+ length == -EAGAIN ||
+ length == -EINTR) {
 msleep(1); /* minimum sleep to prevent looping,
 allowing socket to clear and app
 threads to set tcpStatus
@@ -539,8 +525,8 @@ incomplete_rcv:
 length = 0;
 continue;
 } else if (length <= 0) {
- cERROR(1, ("Received no data, expecting %d",
- pdu_length - total_read));
+ cERROR(1, "Received no data, expecting %d",
+ pdu_length - total_read);
 cifs_reconnect(server);
 csocket = server->ssocket;
 reconnect = 1;
@@ -588,7 +574,7 @@ incomplete_rcv:
 }
 } else {
 if (!isLargeBuf) {
- cERROR(1,("1st trans2 resp needs bigbuf"));
+ cERROR(1, "1st trans2 resp needs bigbuf");
 /* BB maybe we can fix this up, switch
 to already allocated large buffer? */
 } else {
@@ -630,8 +616,8 @@ multi_t2_fnd:
 wake_up_process(task_to_wake);
 } else if (!is_valid_oplock_break(smb_buffer, server) &&
 !isMultiRsp) {
- cERROR(1, ("No task to wake, unknown frame received! "
- "NumMids %d", midCount.counter));
+ cERROR(1, "No task to wake, unknown frame received! "
+ "NumMids %d", midCount.counter);
 cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
 sizeof(struct smb_hdr));
 #ifdef CONFIG_CIFS_DEBUG2
@@ -708,8 +694,8 @@ multi_t2_fnd:
 list_for_each(tmp, &server->pending_mid_q) {
 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
- cFYI(1, ("Clearing Mid 0x%x - waking up ",
- mid_entry->mid));
+ cFYI(1, "Clearing Mid 0x%x - waking up ",
+ mid_entry->mid);
 task_to_wake = mid_entry->tsk;
 if (task_to_wake)
 wake_up_process(task_to_wake);
@@ -728,7 +714,7 @@ multi_t2_fnd:
 to wait at least 45 seconds before giving up
 on a request getting a response and going ahead
 and killing cifsd */
- cFYI(1, ("Wait for exit from demultiplex thread"));
+ cFYI(1, "Wait for exit from demultiplex thread");
 msleep(46000);
 /* if threads still have not exited they are probably never
 coming home not much else we can do but free the memory */
@@ -829,7 +815,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 /* null target name indicates to use *SMBSERVR default called name
 if we end up sending RFC1001 session initialize */
 vol->target_rfc1001_name[0] = 0;
- vol->linux_uid = current_uid(); /* use current_euid() instead? */
+ vol->cred_uid = current_uid();
+ vol->linux_uid = current_uid();
 vol->linux_gid = current_gid();

 /* default to only allowing write access to owner of the mount */
@@ -849,7 +836,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 separator[0] = options[4];
 options += 5;
 } else {
- cFYI(1, ("Null separator not allowed"));
+ cFYI(1, "Null separator not allowed");
 }
 }

@@ -974,7 +961,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 } else if (strnicmp(data, "sec", 3) == 0) {
 if (!value || !*value) {
- cERROR(1, ("no security value specified"));
+ cERROR(1, "no security value specified");
 continue;
 } else if (strnicmp(value, "krb5i", 5) == 0) {
 vol->secFlg |= CIFSSEC_MAY_KRB5 |
@@ -982,7 +969,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 } else if (strnicmp(value, "krb5p", 5) == 0) {
 /* vol->secFlg |= CIFSSEC_MUST_SEAL |
 CIFSSEC_MAY_KRB5; */
- cERROR(1, ("Krb5 cifs privacy not supported"));
+ cERROR(1, "Krb5 cifs privacy not supported");
 return 1;
 } else if (strnicmp(value, "krb5", 4) == 0) {
 vol->secFlg |= CIFSSEC_MAY_KRB5;
@@ -1014,7 +1001,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 } else if (strnicmp(value, "none", 4) == 0) {
 vol->nullauth = 1;
 } else {
- cERROR(1, ("bad security option: %s", value));
+ cERROR(1, "bad security option: %s", value);
 return 1;
 }
 } else if ((strnicmp(data, "unc", 3) == 0)
@@ -1053,7 +1040,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 a domain name and need special handling? */
 if (strnlen(value, 256) < 256) {
 vol->domainname = value;
- cFYI(1, ("Domain name set"));
+ cFYI(1, "Domain name set");
 } else {
 printk(KERN_WARNING "CIFS: domain name too "
 "long\n");
@@ -1076,7 +1063,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 strcpy(vol->prepath+1, value);
 } else
 strcpy(vol->prepath, value);
- cFYI(1, ("prefix path %s", vol->prepath));
+ cFYI(1, "prefix path %s", vol->prepath);
 } else {
 printk(KERN_WARNING "CIFS: prefix too long\n");
 return 1;
@@ -1092,7 +1079,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 vol->iocharset = value;
 /* if iocharset not set then load_nls_default
 is used by caller */
- cFYI(1, ("iocharset set to %s", value));
+ cFYI(1, "iocharset set to %s", value);
 } else {
 printk(KERN_WARNING "CIFS: iocharset name "
 "too long.\n");
@@ -1144,14 +1131,14 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 } else if (strnicmp(data, "sockopt", 5) == 0) {
 if (!value || !*value) {
- cERROR(1, ("no socket option specified"));
+ cERROR(1, "no socket option specified");
 continue;
 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
 vol->sockopt_tcp_nodelay = 1;
 }
 } else if (strnicmp(data, "netbiosname", 4) == 0) {
 if (!value || !*value || (*value == ' ')) {
- cFYI(1, ("invalid (empty) netbiosname"));
+ cFYI(1, "invalid (empty) netbiosname");
 } else {
 memset(vol->source_rfc1001_name, 0x20, 15);
 for (i = 0; i < 15; i++) {
@@ -1175,7 +1162,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 } else if (strnicmp(data, "servern", 7) == 0) {
 /* servernetbiosname specified override *SMBSERVER */
 if (!value || !*value || (*value == ' ')) {
- cFYI(1, ("empty server netbiosname specified"));
+ cFYI(1, "empty server netbiosname specified");
 } else {
 /* last byte, type, is 0x20 for servr type */
 memset(vol->target_rfc1001_name, 0x20, 16);
@@ -1256,6 +1243,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 } else if ((strnicmp(data, "nocase", 6) == 0) ||
 (strnicmp(data, "ignorecase", 10) == 0)) {
 vol->nocase = 1;
+ } else if (strnicmp(data, "mand", 4) == 0) {
+ /* ignore */
+ } else if (strnicmp(data, "nomand", 6) == 0) {
+ /* ignore */
+ } else if (strnicmp(data, "_netdev", 7) == 0) {
+ /* ignore */
 } else if (strnicmp(data, "brl", 3) == 0) {
 vol->nobrl = 0;
 } else if ((strnicmp(data, "nobrl", 5) == 0) ||
@@ -1330,6 +1323,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 printk(KERN_WARNING "CIFS: Mount option noac not "
 "supported. Instead set "
 "/proc/fs/cifs/LookupCacheEnabled to 0\n");
+ } else if (strnicmp(data, "fsc", 3) == 0) {
+ vol->fsc = true;
 } else
 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
 data);
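Two kinds of option-table additions here: mand, nomand and _netdev are recognized purely so the final else branch stops warning about flags that mount(8) and fstab processing pass through, while fsc actually records state (vol->fsc) that a later hunk in setup_cifs_sb() translates into CIFS_MOUNT_FSCACHE. A stand-alone restatement of the recognize-and-ignore pattern, using the userspace strncasecmp in place of the kernel's strnicmp:

    #include <stdbool.h>
    #include <strings.h>

    /* Illustration only: a token must be consumed by some branch or
     * the parser logs "Unknown mount option" for it. */
    static bool consume_ignored_option(const char *data)
    {
            return strncasecmp(data, "mand", 4) == 0 ||
                   strncasecmp(data, "nomand", 6) == 0 ||
                   strncasecmp(data, "_netdev", 7) == 0;
    }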
@@ -1379,18 +1374,92 @@ cifs_parse_mount_options(char *options, const char *devname,
 return 0;
 }

+static bool
+match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
+{
+ struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ if (addr4->sin_addr.s_addr !=
+ server->addr.sockAddr.sin_addr.s_addr)
+ return false;
+ if (addr4->sin_port &&
+ addr4->sin_port != server->addr.sockAddr.sin_port)
+ return false;
+ break;
+ case AF_INET6:
+ if (!ipv6_addr_equal(&addr6->sin6_addr,
+ &server->addr.sockAddr6.sin6_addr))
+ return false;
+ if (addr6->sin6_scope_id !=
+ server->addr.sockAddr6.sin6_scope_id)
+ return false;
+ if (addr6->sin6_port &&
+ addr6->sin6_port != server->addr.sockAddr6.sin6_port)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+static bool
+match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+ unsigned int secFlags;
+
+ if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
+ secFlags = vol->secFlg;
+ else
+ secFlags = global_secflags | vol->secFlg;
+
+ switch (server->secType) {
+ case LANMAN:
+ if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
+ return false;
+ break;
+ case NTLMv2:
+ if (!(secFlags & CIFSSEC_MAY_NTLMV2))
+ return false;
+ break;
+ case NTLM:
+ if (!(secFlags & CIFSSEC_MAY_NTLM))
+ return false;
+ break;
+ case Kerberos:
+ if (!(secFlags & CIFSSEC_MAY_KRB5))
+ return false;
+ break;
+ case RawNTLMSSP:
+ if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
+ return false;
+ break;
+ default:
+ /* shouldn't happen */
+ return false;
+ }
+
+ /* now check if signing mode is acceptible */
+ if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
+ (server->secMode & SECMODE_SIGN_REQUIRED))
+ return false;
+ else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
+ (server->secMode &
+ (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
+ return false;
+
+ return true;
+}
+
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
+cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 {
- struct list_head *tmp;
 struct TCP_Server_Info *server;
- struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
- struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;

 write_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &cifs_tcp_ses_list) {
- server = list_entry(tmp, struct TCP_Server_Info,
- tcp_ses_list);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
 /*
 * the demux thread can exit on its own while still in CifsNew
 * so don't accept any sockets in that state. Since the
@@ -1400,41 +1469,15 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 if (server->tcpStatus == CifsNew)
 continue;

- switch (addr->ss_family) {
- case AF_INET:
- if (addr4->sin_addr.s_addr ==
- server->addr.sockAddr.sin_addr.s_addr) {
- addr4->sin_port = htons(port);
- /* user overrode default port? */
- if (addr4->sin_port) {
- if (addr4->sin_port !=
- server->addr.sockAddr.sin_port)
- continue;
- }
- break;
- } else
- continue;
+ if (!match_address(server, addr))
+ continue;

- case AF_INET6:
- if (ipv6_addr_equal(&addr6->sin6_addr,
- &server->addr.sockAddr6.sin6_addr) &&
- (addr6->sin6_scope_id ==
- server->addr.sockAddr6.sin6_scope_id)) {
- addr6->sin6_port = htons(port);
- /* user overrode default port? */
- if (addr6->sin6_port) {
- if (addr6->sin6_port !=
- server->addr.sockAddr6.sin6_port)
- continue;
- }
- break;
- } else
- continue;
- }
+ if (!match_security(server, vol))
+ continue;

 ++server->srv_count;
 write_unlock(&cifs_tcp_ses_lock);
- cFYI(1, ("Existing tcp session with server found"));
+ cFYI(1, "Existing tcp session with server found");
 return server;
 }
 write_unlock(&cifs_tcp_ses_lock);
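With the port folded into the sockaddr by cifs_fill_sockaddr() (see the cifs_get_tcp_session() hunk below), the lookup reduces to two predicates per candidate server. A condensed sketch of how the pieces now compose, using only signatures visible in this diff and not compilable outside fs/cifs/connect.c:

    /* Sketch: find an existing server for a parsed mount request. */
    static struct TCP_Server_Info *
    find_existing_server(struct smb_vol *vol)
    {
            struct sockaddr_storage addr;

            memset(&addr, 0, sizeof(addr));
            /* treated as returning nonzero on success, matching the
             * caller below; the port lands in the sockaddr itself */
            if (!cifs_fill_sockaddr((struct sockaddr *)&addr, vol->UNCip,
                                    strlen(vol->UNCip), vol->port))
                    return NULL;

            /* match_address() and match_security() must both accept
             * the candidate before its srv_count is bumped */
            return cifs_find_tcp_session((struct sockaddr *)&addr, vol);
    }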
@@ -1459,6 +1502,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 server->tcpStatus = CifsExiting;
 spin_unlock(&GlobalMid_Lock);

+ cifs_fscache_release_client_cookie(server);
+
 task = xchg(&server->tsk, NULL);
 if (task)
 force_sig(SIGKILL, task);
@@ -1475,10 +1520,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)

 memset(&addr, 0, sizeof(struct sockaddr_storage));

- cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
+ cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);

 if (volume_info->UNCip && volume_info->UNC) {
- rc = cifs_convert_address(volume_info->UNCip, &addr);
+ rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
+ volume_info->UNCip,
+ strlen(volume_info->UNCip),
+ volume_info->port);
 if (!rc) {
 /* we failed translating address */
 rc = -EINVAL;
@@ -1487,19 +1535,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 } else if (volume_info->UNCip) {
 /* BB using ip addr as tcp_ses name to connect to the
 DFS root below */
- cERROR(1, ("Connecting to DFS root not implemented yet"));
+ cERROR(1, "Connecting to DFS root not implemented yet");
 rc = -EINVAL;
 goto out_err;
 } else /* which tcp_sess DFS root would we conect to */ {
- cERROR(1,
- ("CIFS mount error: No UNC path (e.g. -o "
- "unc=//192.168.1.100/public) specified"));
+ cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
+ "unc=//192.168.1.100/public) specified");
 rc = -EINVAL;
 goto out_err;
 }

 /* see if we already have a matching tcp_ses */
- tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
+ tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
 if (tcp_ses)
 return tcp_ses;

@@ -1540,21 +1587,19 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 ++tcp_ses->srv_count;

 if (addr.ss_family == AF_INET6) {
- cFYI(1, ("attempting ipv6 connect"));
+ cFYI(1, "attempting ipv6 connect");
 /* BB should we allow ipv6 on port 139? */
 /* other OS never observed in Wild doing 139 with v6 */
- sin_server6->sin6_port = htons(volume_info->port);
 memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
 sizeof(struct sockaddr_in6));
 rc = ipv6_connect(tcp_ses);
 } else {
- sin_server->sin_port = htons(volume_info->port);
 memcpy(&tcp_ses->addr.sockAddr, sin_server,
 sizeof(struct sockaddr_in));
 rc = ipv4_connect(tcp_ses);
 }
 if (rc < 0) {
- cERROR(1, ("Error connecting to socket. Aborting operation"));
+ cERROR(1, "Error connecting to socket. Aborting operation");
 goto out_err;
 }

@@ -1567,7 +1612,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 tcp_ses, "cifsd");
 if (IS_ERR(tcp_ses->tsk)) {
 rc = PTR_ERR(tcp_ses->tsk);
- cERROR(1, ("error %d create cifsd thread", rc));
+ cERROR(1, "error %d create cifsd thread", rc);
 module_put(THIS_MODULE);
 goto out_err;
 }
@@ -1577,6 +1622,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
 write_unlock(&cifs_tcp_ses_lock);

+ cifs_fscache_get_client_cookie(tcp_ses);
+
 return tcp_ses;

out_err:
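The two fscache hooks added in this pair of hunks bracket the life of the shared server: the client cookie is taken only after the new tcp_ses is linked into cifs_tcp_ses_list, and released in cifs_put_tcp_session() once tcpStatus has been set to CifsExiting. Their declarations, to make the pairing explicit (the bodies are assumed to live in the FS-Cache support files added elsewhere in this series, which are not part of this diff):

    struct TCP_Server_Info;

    /* paired per server lifetime: get after list insertion,
     * release after the session is marked CifsExiting */
    void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server);
    void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server);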
@@ -1591,17 +1638,29 @@ out_err:
 }

 static struct cifsSesInfo *
-cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
+cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
- struct list_head *tmp;
 struct cifsSesInfo *ses;

 write_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
- if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
- continue;
-
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ switch (server->secType) {
+ case Kerberos:
+ if (vol->cred_uid != ses->cred_uid)
+ continue;
+ break;
+ default:
+ /* anything else takes username/password */
+ if (strncmp(ses->userName, vol->username,
+ MAX_USERNAME_SIZE))
+ continue;
+ if (strlen(vol->username) != 0 &&
+ ses->password != NULL &&
+ strncmp(ses->password,
+ vol->password ? vol->password : "",
+ MAX_PASSWORD_SIZE))
+ continue;
+ }
 ++ses->ses_count;
 write_unlock(&cifs_tcp_ses_lock);
 return ses;
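Session sharing now keys on the authentication type: Kerberos sessions are matched purely by the requesting credential uid (which is why the parser hunk above starts recording vol->cred_uid), everything else by username plus, when one was given, password. The same rule restated as a stand-alone predicate, for illustration only since the loop above open-codes it:

    static bool smb_ses_matches(struct cifsSesInfo *ses, struct smb_vol *vol,
                                enum securityEnum sectype)
    {
            if (sectype == Kerberos)
                    return vol->cred_uid == ses->cred_uid;

            /* anything else takes username/password */
            if (strncmp(ses->userName, vol->username, MAX_USERNAME_SIZE))
                    return false;
            if (strlen(vol->username) != 0 && ses->password != NULL &&
                strncmp(ses->password, vol->password ? vol->password : "",
                        MAX_PASSWORD_SIZE))
                    return false;
            return true;
    }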
@@ -1616,6 +1675,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 int xid;
 struct TCP_Server_Info *server = ses->server;

+ cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
 write_lock(&cifs_tcp_ses_lock);
 if (--ses->ses_count > 0) {
 write_unlock(&cifs_tcp_ses_lock);
@@ -1634,6 +1694,103 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 cifs_put_tcp_session(server);
 }

+static struct cifsSesInfo *
+cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
+{
+ int rc = -ENOMEM, xid;
+ struct cifsSesInfo *ses;
+
+ xid = GetXid();
+
+ ses = cifs_find_smb_ses(server, volume_info);
+ if (ses) {
+ cFYI(1, "Existing smb sess found (status=%d)", ses->status);
+
+ mutex_lock(&ses->session_mutex);
+ rc = cifs_negotiate_protocol(xid, ses);
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ /* problem -- put our ses reference */
+ cifs_put_smb_ses(ses);
+ FreeXid(xid);
+ return ERR_PTR(rc);
+ }
+ if (ses->need_reconnect) {
+ cFYI(1, "Session needs reconnect");
+ rc = cifs_setup_session(xid, ses,
+ volume_info->local_nls);
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ /* problem -- put our reference */
+ cifs_put_smb_ses(ses);
+ FreeXid(xid);
+ return ERR_PTR(rc);
+ }
+ }
+ mutex_unlock(&ses->session_mutex);
+
+ /* existing SMB ses has a server reference already */
+ cifs_put_tcp_session(server);
+ FreeXid(xid);
+ return ses;
+ }
+
+ cFYI(1, "Existing smb sess not found");
+ ses = sesInfoAlloc();
+ if (ses == NULL)
+ goto get_ses_fail;
+
+ /* new SMB session uses our server ref */
+ ses->server = server;
+ if (server->addr.sockAddr6.sin6_family == AF_INET6)
+ sprintf(ses->serverName, "%pI6",
+ &server->addr.sockAddr6.sin6_addr);
+ else
+ sprintf(ses->serverName, "%pI4",
+ &server->addr.sockAddr.sin_addr.s_addr);
+
+ if (volume_info->username)
+ strncpy(ses->userName, volume_info->username,
+ MAX_USERNAME_SIZE);
+
+ /* volume_info->password freed at unmount */
+ if (volume_info->password) {
+ ses->password = kstrdup(volume_info->password, GFP_KERNEL);
+ if (!ses->password)
+ goto get_ses_fail;
+ }
+ if (volume_info->domainname) {
+ int len = strlen(volume_info->domainname);
+ ses->domainName = kmalloc(len + 1, GFP_KERNEL);
+ if (ses->domainName)
+ strcpy(ses->domainName, volume_info->domainname);
+ }
+ ses->cred_uid = volume_info->cred_uid;
+ ses->linux_uid = volume_info->linux_uid;
+ ses->overrideSecFlg = volume_info->secFlg;
+
+ mutex_lock(&ses->session_mutex);
+ rc = cifs_negotiate_protocol(xid, ses);
+ if (!rc)
+ rc = cifs_setup_session(xid, ses, volume_info->local_nls);
+ mutex_unlock(&ses->session_mutex);
+ if (rc)
+ goto get_ses_fail;
+
+ /* success, put it on the list */
+ write_lock(&cifs_tcp_ses_lock);
+ list_add(&ses->smb_ses_list, &server->smb_ses_list);
+ write_unlock(&cifs_tcp_ses_lock);
+
+ FreeXid(xid);
+ return ses;
+
+get_ses_fail:
+ sesInfoFree(ses);
+ FreeXid(xid);
+ return ERR_PTR(rc);
+}
+
 static struct cifsTconInfo *
 cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 {
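cifs_get_smb_ses() concentrates what used to be open-coded in cifs_mount(): it reuses a matching session when possible, renegotiates and reconnects it under session_mutex, and otherwise builds a new one, returning ERR_PTR(rc) on failure. The reference rules matter for callers: on reuse the extra server reference is put inside the getter, while on failure the caller still owns its server reference. The expected caller pattern appears verbatim in the cifs_mount() hunk near the end of this diff:

    pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
    if (IS_ERR(pSesInfo)) {
            rc = PTR_ERR(pSesInfo);
            pSesInfo = NULL;
            /* the srvTcp reference is still ours to drop on this path */
            goto mount_fail_check;
    }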
@@ -1662,6 +1819,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 int xid;
 struct cifsSesInfo *ses = tcon->ses;

+ cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
 write_lock(&cifs_tcp_ses_lock);
 if (--tcon->tc_count > 0) {
 write_unlock(&cifs_tcp_ses_lock);
@@ -1675,10 +1833,87 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 CIFSSMBTDis(xid, tcon);
 _FreeXid(xid);

+ cifs_fscache_release_super_cookie(tcon);
 tconInfoFree(tcon);
 cifs_put_smb_ses(ses);
 }

+static struct cifsTconInfo *
+cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
+{
+ int rc, xid;
+ struct cifsTconInfo *tcon;
+
+ tcon = cifs_find_tcon(ses, volume_info->UNC);
+ if (tcon) {
+ cFYI(1, "Found match on UNC path");
+ /* existing tcon already has a reference */
+ cifs_put_smb_ses(ses);
+ if (tcon->seal != volume_info->seal)
+ cERROR(1, "transport encryption setting "
+ "conflicts with existing tid");
+ return tcon;
+ }
+
+ tcon = tconInfoAlloc();
+ if (tcon == NULL) {
+ rc = -ENOMEM;
+ goto out_fail;
+ }
+
+ tcon->ses = ses;
+ if (volume_info->password) {
+ tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
+ if (!tcon->password) {
+ rc = -ENOMEM;
+ goto out_fail;
+ }
+ }
+
+ if (strchr(volume_info->UNC + 3, '\\') == NULL
+ && strchr(volume_info->UNC + 3, '/') == NULL) {
+ cERROR(1, "Missing share name");
+ rc = -ENODEV;
+ goto out_fail;
+ }
+
+ /* BB Do we need to wrap session_mutex around
+ * this TCon call and Unix SetFS as
+ * we do on SessSetup and reconnect? */
+ xid = GetXid();
+ rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
+ FreeXid(xid);
+ cFYI(1, "CIFS Tcon rc = %d", rc);
+ if (rc)
+ goto out_fail;
+
+ if (volume_info->nodfs) {
+ tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
+ cFYI(1, "DFS disabled (%d)", tcon->Flags);
+ }
+ tcon->seal = volume_info->seal;
+ /* we can have only one retry value for a connection
+ to a share so for resources mounted more than once
+ to the same server share the last value passed in
+ for the retry flag is used */
+ tcon->retry = volume_info->retry;
+ tcon->nocase = volume_info->nocase;
+ tcon->local_lease = volume_info->local_lease;
+
+ write_lock(&cifs_tcp_ses_lock);
+ list_add(&tcon->tcon_list, &ses->tcon_list);
+ write_unlock(&cifs_tcp_ses_lock);
+
+ cifs_fscache_get_super_cookie(tcon);
+
+ return tcon;
+
+out_fail:
+ tconInfoFree(tcon);
+ return ERR_PTR(rc);
+}
+
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1703,8 +1938,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 strcpy(temp_unc + 2, pSesInfo->serverName);
 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
- cFYI(1,
- ("CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid));
+ cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
 kfree(temp_unc);
 }
 if (rc == 0)
@@ -1777,12 +2011,12 @@ ipv4_connect(struct TCP_Server_Info *server)
 rc = sock_create_kern(PF_INET, SOCK_STREAM,
 IPPROTO_TCP, &socket);
 if (rc < 0) {
- cERROR(1, ("Error %d creating socket", rc));
+ cERROR(1, "Error %d creating socket", rc);
 return rc;
 }

 /* BB other socket options to set KEEPALIVE, NODELAY? */
- cFYI(1, ("Socket created"));
+ cFYI(1, "Socket created");
 server->ssocket = socket;
 socket->sk->sk_allocation = GFP_NOFS;
 cifs_reclassify_socket4(socket);
@@ -1827,7 +2061,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 if (!connected) {
 if (orig_port)
 server->addr.sockAddr.sin_port = orig_port;
- cFYI(1, ("Error %d connecting to server via ipv4", rc));
+ cFYI(1, "Error %d connecting to server via ipv4", rc);
 sock_release(socket);
 server->ssocket = NULL;
 return rc;
@@ -1855,12 +2089,12 @@ ipv4_connect(struct TCP_Server_Info *server)
 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
 (char *)&val, sizeof(val));
 if (rc)
- cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
+ cFYI(1, "set TCP_NODELAY socket option error %d", rc);
 }

- cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
+ cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
 socket->sk->sk_sndbuf,
- socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
+ socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);

 /* send RFC1001 sessinit */
 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
@@ -1938,13 +2172,13 @@ ipv6_connect(struct TCP_Server_Info *server)
 rc = sock_create_kern(PF_INET6, SOCK_STREAM,
 IPPROTO_TCP, &socket);
 if (rc < 0) {
- cERROR(1, ("Error %d creating ipv6 socket", rc));
+ cERROR(1, "Error %d creating ipv6 socket", rc);
 socket = NULL;
 return rc;
 }

 /* BB other socket options to set KEEPALIVE, NODELAY? */
- cFYI(1, ("ipv6 Socket created"));
+ cFYI(1, "ipv6 Socket created");
 server->ssocket = socket;
 socket->sk->sk_allocation = GFP_NOFS;
 cifs_reclassify_socket6(socket);
@@ -1988,7 +2222,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 if (!connected) {
 if (orig_port)
 server->addr.sockAddr6.sin6_port = orig_port;
- cFYI(1, ("Error %d connecting to server via ipv6", rc));
+ cFYI(1, "Error %d connecting to server via ipv6", rc);
 sock_release(socket);
 server->ssocket = NULL;
 return rc;
@@ -2007,7 +2241,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
 (char *)&val, sizeof(val));
 if (rc)
- cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
+ cFYI(1, "set TCP_NODELAY socket option error %d", rc);
 }

 server->ssocket = socket;
@@ -2032,13 +2266,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 if (vol_info && vol_info->no_linux_ext) {
 tcon->fsUnixInfo.Capability = 0;
 tcon->unix_ext = 0; /* Unix Extensions disabled */
- cFYI(1, ("Linux protocol extensions disabled"));
+ cFYI(1, "Linux protocol extensions disabled");
 return;
 } else if (vol_info)
 tcon->unix_ext = 1; /* Unix Extensions supported */

 if (tcon->unix_ext == 0) {
- cFYI(1, ("Unix extensions disabled so not set on reconnect"));
+ cFYI(1, "Unix extensions disabled so not set on reconnect");
 return;
 }

@@ -2054,12 +2288,11 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
- cERROR(1, ("POSIXPATH support change"));
+ cERROR(1, "POSIXPATH support change");
 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
- cERROR(1, ("possible reconnect error"));
- cERROR(1,
- ("server disabled POSIX path support"));
+ cERROR(1, "possible reconnect error");
+ cERROR(1, "server disabled POSIX path support");
 }
 }

@@ -2067,7 +2300,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 if (vol_info && vol_info->no_psx_acl)
 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
- cFYI(1, ("negotiated posix acl support"));
+ cFYI(1, "negotiated posix acl support");
 if (sb)
 sb->s_flags |= MS_POSIXACL;
 }
@@ -2075,7 +2308,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 if (vol_info && vol_info->posix_paths == 0)
 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
- cFYI(1, ("negotiate posix pathnames"));
+ cFYI(1, "negotiate posix pathnames");
 if (sb)
 CIFS_SB(sb)->mnt_cifs_flags |=
 CIFS_MOUNT_POSIX_PATHS;
@@ -2090,39 +2323,38 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
 CIFS_SB(sb)->rsize = 127 * 1024;
- cFYI(DBG2,
- ("larger reads not supported by srv"));
+ cFYI(DBG2, "larger reads not supported by srv");
 }
 }


- cFYI(1, ("Negotiate caps 0x%x", (int)cap));
+ cFYI(1, "Negotiate caps 0x%x", (int)cap);
 #ifdef CONFIG_CIFS_DEBUG2
 if (cap & CIFS_UNIX_FCNTL_CAP)
- cFYI(1, ("FCNTL cap"));
+ cFYI(1, "FCNTL cap");
 if (cap & CIFS_UNIX_EXTATTR_CAP)
- cFYI(1, ("EXTATTR cap"));
+ cFYI(1, "EXTATTR cap");
 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
- cFYI(1, ("POSIX path cap"));
+ cFYI(1, "POSIX path cap");
 if (cap & CIFS_UNIX_XATTR_CAP)
- cFYI(1, ("XATTR cap"));
+ cFYI(1, "XATTR cap");
 if (cap & CIFS_UNIX_POSIX_ACL_CAP)
- cFYI(1, ("POSIX ACL cap"));
+ cFYI(1, "POSIX ACL cap");
 if (cap & CIFS_UNIX_LARGE_READ_CAP)
- cFYI(1, ("very large read cap"));
+ cFYI(1, "very large read cap");
 if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
- cFYI(1, ("very large write cap"));
+ cFYI(1, "very large write cap");
 #endif /* CIFS_DEBUG2 */
 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
 if (vol_info == NULL) {
- cFYI(1, ("resetting capabilities failed"));
+ cFYI(1, "resetting capabilities failed");
 } else
- cERROR(1, ("Negotiating Unix capabilities "
+ cERROR(1, "Negotiating Unix capabilities "
 "with the server failed. Consider "
 "mounting with the Unix Extensions\n"
 "disabled, if problems are found, "
 "by specifying the nounix mount "
- "option."));
+ "option.");

 }
 }
@@ -2152,8 +2384,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 struct cifs_sb_info *cifs_sb)
 {
 if (pvolume_info->rsize > CIFSMaxBufSize) {
- cERROR(1, ("rsize %d too large, using MaxBufSize",
- pvolume_info->rsize));
+ cERROR(1, "rsize %d too large, using MaxBufSize",
+ pvolume_info->rsize);
 cifs_sb->rsize = CIFSMaxBufSize;
 } else if ((pvolume_info->rsize) &&
 (pvolume_info->rsize <= CIFSMaxBufSize))
@@ -2162,8 +2394,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 cifs_sb->rsize = CIFSMaxBufSize;

 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
- cERROR(1, ("wsize %d too large, using 4096 instead",
- pvolume_info->wsize));
+ cERROR(1, "wsize %d too large, using 4096 instead",
+ pvolume_info->wsize);
 cifs_sb->wsize = 4096;
 } else if (pvolume_info->wsize)
 cifs_sb->wsize = pvolume_info->wsize;
@@ -2181,7 +2413,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 if (cifs_sb->rsize < 2048) {
 cifs_sb->rsize = 2048;
 /* Windows ME may prefer this */
- cFYI(1, ("readsize set to minimum: 2048"));
+ cFYI(1, "readsize set to minimum: 2048");
 }
 /* calculate prepath */
 cifs_sb->prepath = pvolume_info->prepath;
@@ -2199,8 +2431,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 cifs_sb->mnt_gid = pvolume_info->linux_gid;
 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
- cFYI(1, ("file mode: 0x%x dir mode: 0x%x",
- cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
+ cFYI(1, "file mode: 0x%x dir mode: 0x%x",
+ cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);

 if (pvolume_info->noperm)
 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2228,14 +2460,16 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
 if (pvolume_info->dynperm)
 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+ if (pvolume_info->fsc)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
 if (pvolume_info->direct_io) {
- cFYI(1, ("mounting share using direct i/o"));
+ cFYI(1, "mounting share using direct i/o");
 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 }

 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
- cERROR(1, ("mount option dynperm ignored if cifsacl "
- "mount option supported"));
+ cERROR(1, "mount option dynperm ignored if cifsacl "
+ "mount option supported");
 }

 static int
@@ -2262,7 +2496,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
 {
 struct smb_vol *volume_info;

- if (!pvolume_info && !*pvolume_info)
+ if (!pvolume_info || !*pvolume_info)
 return;

 volume_info = *pvolume_info;
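The one-character change above is a real bug fix, not churn: with &&, a NULL pvolume_info makes the guard go on to evaluate !*pvolume_info and dereference the NULL pointer, while a valid pvolume_info holding a NULL smb_vol slips past the guard entirely and is dereferenced a few lines later. With || the short-circuit works in the caller's favor. In miniature:

    /* old: if (!p && !*p)
     *   p == NULL  -> !p is true, so !*p is evaluated: NULL deref
     *   *p == NULL -> !p is false, whole test false: no early return
     * new: if (!p || !*p)
     *   *p is only read once p is known non-NULL, and either
     *   empty case returns early as intended
     */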
@@ -2344,11 +2578,11 @@ try_mount_again:
2344 } 2578 }
2345 2579
2346 if (volume_info->nullauth) { 2580 if (volume_info->nullauth) {
2347 cFYI(1, ("null user")); 2581 cFYI(1, "null user");
2348 volume_info->username = ""; 2582 volume_info->username = "";
2349 } else if (volume_info->username) { 2583 } else if (volume_info->username) {
2350 /* BB fixme parse for domain name here */ 2584 /* BB fixme parse for domain name here */
2351 cFYI(1, ("Username: %s", volume_info->username)); 2585 cFYI(1, "Username: %s", volume_info->username);
2352 } else { 2586 } else {
2353 cifserror("No username specified"); 2587 cifserror("No username specified");
2354 /* In userspace mount helper we can get user name from alternate 2588 /* In userspace mount helper we can get user name from alternate
@@ -2357,20 +2591,20 @@ try_mount_again:
2357 goto out; 2591 goto out;
2358 } 2592 }
2359 2593
2360
2361 /* this is needed for ASCII cp to Unicode converts */ 2594 /* this is needed for ASCII cp to Unicode converts */
2362 if (volume_info->iocharset == NULL) { 2595 if (volume_info->iocharset == NULL) {
2363 cifs_sb->local_nls = load_nls_default(); 2596 /* load_nls_default cannot return null */
2364 /* load_nls_default can not return null */ 2597 volume_info->local_nls = load_nls_default();
2365 } else { 2598 } else {
2366 cifs_sb->local_nls = load_nls(volume_info->iocharset); 2599 volume_info->local_nls = load_nls(volume_info->iocharset);
2367 if (cifs_sb->local_nls == NULL) { 2600 if (volume_info->local_nls == NULL) {
2368 cERROR(1, ("CIFS mount error: iocharset %s not found", 2601 cERROR(1, "CIFS mount error: iocharset %s not found",
2369 volume_info->iocharset)); 2602 volume_info->iocharset);
2370 rc = -ELIBACC; 2603 rc = -ELIBACC;
2371 goto out; 2604 goto out;
2372 } 2605 }
2373 } 2606 }
2607 cifs_sb->local_nls = volume_info->local_nls;
2374 2608
2375 /* get a reference to a tcp session */ 2609 /* get a reference to a tcp session */
2376 srvTcp = cifs_get_tcp_session(volume_info); 2610 srvTcp = cifs_get_tcp_session(volume_info);
@@ -2379,148 +2613,30 @@ try_mount_again:
2379 goto out; 2613 goto out;
2380 } 2614 }
2381 2615
2382 pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username); 2616 /* get a reference to a SMB session */
2383 if (pSesInfo) { 2617 pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
2384 cFYI(1, ("Existing smb sess found (status=%d)", 2618 if (IS_ERR(pSesInfo)) {
2385 pSesInfo->status)); 2619 rc = PTR_ERR(pSesInfo);
2386 /* 2620 pSesInfo = NULL;
2387 * The existing SMB session already has a reference to srvTcp, 2621 goto mount_fail_check;
2388 * so we can put back the extra one we got before
2389 */
2390 cifs_put_tcp_session(srvTcp);
2391
2392 mutex_lock(&pSesInfo->session_mutex);
2393 if (pSesInfo->need_reconnect) {
2394 cFYI(1, ("Session needs reconnect"));
2395 rc = cifs_setup_session(xid, pSesInfo,
2396 cifs_sb->local_nls);
2397 }
2398 mutex_unlock(&pSesInfo->session_mutex);
2399 } else if (!rc) {
2400 cFYI(1, ("Existing smb sess not found"));
2401 pSesInfo = sesInfoAlloc();
2402 if (pSesInfo == NULL) {
2403 rc = -ENOMEM;
2404 goto mount_fail_check;
2405 }
2406
2407 /* new SMB session uses our srvTcp ref */
2408 pSesInfo->server = srvTcp;
2409 if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
2410 sprintf(pSesInfo->serverName, "%pI6",
2411 &srvTcp->addr.sockAddr6.sin6_addr);
2412 else
2413 sprintf(pSesInfo->serverName, "%pI4",
2414 &srvTcp->addr.sockAddr.sin_addr.s_addr);
2415
2416 write_lock(&cifs_tcp_ses_lock);
2417 list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
2418 write_unlock(&cifs_tcp_ses_lock);
2419
2420 /* volume_info->password freed at unmount */
2421 if (volume_info->password) {
2422 pSesInfo->password = kstrdup(volume_info->password,
2423 GFP_KERNEL);
2424 if (!pSesInfo->password) {
2425 rc = -ENOMEM;
2426 goto mount_fail_check;
2427 }
2428 }
2429 if (volume_info->username)
2430 strncpy(pSesInfo->userName, volume_info->username,
2431 MAX_USERNAME_SIZE);
2432 if (volume_info->domainname) {
2433 int len = strlen(volume_info->domainname);
2434 pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
2435 if (pSesInfo->domainName)
2436 strcpy(pSesInfo->domainName,
2437 volume_info->domainname);
2438 }
2439 pSesInfo->linux_uid = volume_info->linux_uid;
2440 pSesInfo->overrideSecFlg = volume_info->secFlg;
2441 mutex_lock(&pSesInfo->session_mutex);
2442
2443 /* BB FIXME need to pass vol->secFlgs BB */
2444 rc = cifs_setup_session(xid, pSesInfo,
2445 cifs_sb->local_nls);
2446 mutex_unlock(&pSesInfo->session_mutex);
2447 } 2622 }
2448 2623
2449 /* search for existing tcon to this server share */ 2624 setup_cifs_sb(volume_info, cifs_sb);
2450 if (!rc) { 2625 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2451 setup_cifs_sb(volume_info, cifs_sb); 2626 sb->s_maxbytes = MAX_LFS_FILESIZE;
2452 2627 else
2453 tcon = cifs_find_tcon(pSesInfo, volume_info->UNC); 2628 sb->s_maxbytes = MAX_NON_LFS;
2454 if (tcon) {
2455 cFYI(1, ("Found match on UNC path"));
2456 /* existing tcon already has a reference */
2457 cifs_put_smb_ses(pSesInfo);
2458 if (tcon->seal != volume_info->seal)
2459 cERROR(1, ("transport encryption setting "
2460 "conflicts with existing tid"));
2461 } else {
2462 tcon = tconInfoAlloc();
2463 if (tcon == NULL) {
2464 rc = -ENOMEM;
2465 goto mount_fail_check;
2466 }
2467
2468 tcon->ses = pSesInfo;
2469 if (volume_info->password) {
2470 tcon->password = kstrdup(volume_info->password,
2471 GFP_KERNEL);
2472 if (!tcon->password) {
2473 rc = -ENOMEM;
2474 goto mount_fail_check;
2475 }
2476 }
2477
2478 if ((strchr(volume_info->UNC + 3, '\\') == NULL)
2479 && (strchr(volume_info->UNC + 3, '/') == NULL)) {
2480 cERROR(1, ("Missing share name"));
2481 rc = -ENODEV;
2482 goto mount_fail_check;
2483 } else {
2484 /* BB Do we need to wrap sesSem around
2485 * this TCon call and Unix SetFS as
2486 * we do on SessSetup and reconnect? */
2487 rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
2488 tcon, cifs_sb->local_nls);
2489 cFYI(1, ("CIFS Tcon rc = %d", rc));
2490 if (volume_info->nodfs) {
2491 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
2492 cFYI(1, ("DFS disabled (%d)",
2493 tcon->Flags));
2494 }
2495 }
2496 if (rc)
2497 goto remote_path_check;
2498 tcon->seal = volume_info->seal;
2499 write_lock(&cifs_tcp_ses_lock);
2500 list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
2501 write_unlock(&cifs_tcp_ses_lock);
2502 }
2503
2504 /* we can have only one retry value for a connection
2505 to a share so for resources mounted more than once
2506 to the same server share the last value passed in
2507 for the retry flag is used */
2508 tcon->retry = volume_info->retry;
2509 tcon->nocase = volume_info->nocase;
2510 tcon->local_lease = volume_info->local_lease;
2511 }
2512 if (pSesInfo) {
2513 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2514 sb->s_maxbytes = MAX_LFS_FILESIZE;
2515 else
2516 sb->s_maxbytes = MAX_NON_LFS;
2517 }
2518 2629
2519 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2630 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
2520 sb->s_time_gran = 100; 2631 sb->s_time_gran = 100;
2521 2632
2522 if (rc) 2633 /* search for existing tcon to this server share */
2634 tcon = cifs_get_tcon(pSesInfo, volume_info);
2635 if (IS_ERR(tcon)) {
2636 rc = PTR_ERR(tcon);
2637 tcon = NULL;
2523 goto remote_path_check; 2638 goto remote_path_check;
2639 }
2524 2640
2525 cifs_sb->tcon = tcon; 2641 cifs_sb->tcon = tcon;
2526 2642
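
Editorial note: the hunks above replace the hand-rolled find/alloc session code with cifs_get_smb_ses() and cifs_get_tcon(), which return ERR_PTR-encoded errnos instead of a NULL pointer plus a separate rc. A minimal sketch of that convention follows; cifs_get_smb_ses() is real, find_or_create_ses() is a hypothetical stand-in for its body:

#include <linux/err.h>

struct cifsSesInfo *get_ses_sketch(struct TCP_Server_Info *server)
{
	struct cifsSesInfo *ses;

	ses = find_or_create_ses(server);	/* hypothetical helper */
	if (!ses)
		return ERR_PTR(-ENOMEM);	/* errno travels inside the pointer */
	return ses;
}

/* caller side, mirroring the new mount path above: */
	pSesInfo = get_ses_sketch(srvTcp);
	if (IS_ERR(pSesInfo)) {
		rc = PTR_ERR(pSesInfo);		/* recover the negative errno */
		pSesInfo = NULL;		/* never free an error pointer */
		goto mount_fail_check;
	}
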
@@ -2544,7 +2660,7 @@ try_mount_again:
2544 2660
2545 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { 2661 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
2546 cifs_sb->rsize = 1024 * 127; 2662 cifs_sb->rsize = 1024 * 127;
2547 cFYI(DBG2, ("no very large read support, rsize now 127K")); 2663 cFYI(DBG2, "no very large read support, rsize now 127K");
2548 } 2664 }
2549 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X)) 2665 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
2550 cifs_sb->wsize = min(cifs_sb->wsize, 2666 cifs_sb->wsize = min(cifs_sb->wsize,
@@ -2593,7 +2709,7 @@ remote_path_check:
2593 goto mount_fail_check; 2709 goto mount_fail_check;
2594 } 2710 }
2595 2711
2596 cFYI(1, ("Getting referral for: %s", full_path)); 2712 cFYI(1, "Getting referral for: %s", full_path);
2597 rc = get_dfs_path(xid, pSesInfo , full_path + 1, 2713 rc = get_dfs_path(xid, pSesInfo , full_path + 1,
2598 cifs_sb->local_nls, &num_referrals, &referrals, 2714 cifs_sb->local_nls, &num_referrals, &referrals,
2599 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 2715 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -2707,7 +2823,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2707 by Samba (not sure whether other servers allow 2823 by Samba (not sure whether other servers allow
2708 NTLMv2 password here) */ 2824 NTLMv2 password here) */
2709#ifdef CONFIG_CIFS_WEAK_PW_HASH 2825#ifdef CONFIG_CIFS_WEAK_PW_HASH
2710 if ((extended_security & CIFSSEC_MAY_LANMAN) && 2826 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2711 (ses->server->secType == LANMAN)) 2827 (ses->server->secType == LANMAN))
2712 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2828 calc_lanman_hash(tcon->password, ses->server->cryptKey,
2713 ses->server->secMode & 2829 ses->server->secMode &
@@ -2778,13 +2894,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2778 if (length == 3) { 2894 if (length == 3) {
2779 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && 2895 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
2780 (bcc_ptr[2] == 'C')) { 2896 (bcc_ptr[2] == 'C')) {
2781 cFYI(1, ("IPC connection")); 2897 cFYI(1, "IPC connection");
2782 tcon->ipc = 1; 2898 tcon->ipc = 1;
2783 } 2899 }
2784 } else if (length == 2) { 2900 } else if (length == 2) {
2785 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { 2901 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
2786 /* the most common case */ 2902 /* the most common case */
2787 cFYI(1, ("disk share connection")); 2903 cFYI(1, "disk share connection");
2788 } 2904 }
2789 } 2905 }
2790 bcc_ptr += length + 1; 2906 bcc_ptr += length + 1;
@@ -2797,7 +2913,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2797 bytes_left, is_unicode, 2913 bytes_left, is_unicode,
2798 nls_codepage); 2914 nls_codepage);
2799 2915
2800 cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem)); 2916 cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
2801 2917
2802 if ((smb_buffer_response->WordCount == 3) || 2918 if ((smb_buffer_response->WordCount == 3) ||
2803 (smb_buffer_response->WordCount == 7)) 2919 (smb_buffer_response->WordCount == 7))
@@ -2805,7 +2921,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2805 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); 2921 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
2806 else 2922 else
2807 tcon->Flags = 0; 2923 tcon->Flags = 0;
2808 cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags)); 2924 cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
2809 } else if ((rc == 0) && tcon == NULL) { 2925 } else if ((rc == 0) && tcon == NULL) {
2810 /* all we need to save for IPC$ connection */ 2926 /* all we need to save for IPC$ connection */
2811 ses->ipc_tid = smb_buffer_response->Tid; 2927 ses->ipc_tid = smb_buffer_response->Tid;
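
Editorial note: the pervasive cFYI(1, ("fmt", args)) to cFYI(1, "fmt", args) rewrites throughout this diff reflect the CIFS debug macros becoming variadic. A sketch of the form the converted call sites assume; the real cFYI/cERROR definitions live in fs/cifs/cifs_debug.h and also honour the cifsFYI tunable, so treat this as illustrative only:

#define cFYI_sketch(set, fmt, ...)					\
do {									\
	if (set)							\
		printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__);	\
} while (0)

/* Before: format and args had to hide inside one macro argument, hence
 * the doubled parentheses:  cFYI(1, ("Tcon flags: 0x%x", flags));
 * After: the macro is variadic:  cFYI(1, "Tcon flags: 0x%x", flags);  */
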
@@ -2833,57 +2949,61 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2833 return rc; 2949 return rc;
2834} 2950}
2835 2951
2836int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 2952int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
2837 struct nls_table *nls_info)
2838{ 2953{
2839 int rc = 0; 2954 int rc = 0;
2840 int first_time = 0; 2955 struct TCP_Server_Info *server = ses->server;
2841 struct TCP_Server_Info *server = pSesInfo->server; 2956
2842 2957 /* only send once per connect */
2843 /* what if server changes its buffer size after dropping the session? */ 2958 if (server->maxBuf != 0)
2844 if (server->maxBuf == 0) /* no need to send on reconnect */ { 2959 return 0;
2845 rc = CIFSSMBNegotiate(xid, pSesInfo); 2960
2846 if (rc == -EAGAIN) { 2961 rc = CIFSSMBNegotiate(xid, ses);
2847 /* retry only once on 1st time connection */ 2962 if (rc == -EAGAIN) {
2848 rc = CIFSSMBNegotiate(xid, pSesInfo); 2963 /* retry only once on 1st time connection */
2849 if (rc == -EAGAIN) 2964 rc = CIFSSMBNegotiate(xid, ses);
2850 rc = -EHOSTDOWN; 2965 if (rc == -EAGAIN)
2851 } 2966 rc = -EHOSTDOWN;
2852 if (rc == 0) { 2967 }
2853 spin_lock(&GlobalMid_Lock); 2968 if (rc == 0) {
2854 if (server->tcpStatus != CifsExiting) 2969 spin_lock(&GlobalMid_Lock);
2855 server->tcpStatus = CifsGood; 2970 if (server->tcpStatus != CifsExiting)
2856 else 2971 server->tcpStatus = CifsGood;
2857 rc = -EHOSTDOWN; 2972 else
2858 spin_unlock(&GlobalMid_Lock); 2973 rc = -EHOSTDOWN;
2974 spin_unlock(&GlobalMid_Lock);
2859 2975
2860 }
2861 first_time = 1;
2862 } 2976 }
2863 2977
2864 if (rc) 2978 return rc;
2865 goto ss_err_exit; 2979}
2980
2981
2982int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2983 struct nls_table *nls_info)
2984{
2985 int rc = 0;
2986 struct TCP_Server_Info *server = ses->server;
2866 2987
2867 pSesInfo->flags = 0; 2988 ses->flags = 0;
2868 pSesInfo->capabilities = server->capabilities; 2989 ses->capabilities = server->capabilities;
2869 if (linuxExtEnabled == 0) 2990 if (linuxExtEnabled == 0)
2870 pSesInfo->capabilities &= (~CAP_UNIX); 2991 ses->capabilities &= (~CAP_UNIX);
2871 2992
2872 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 2993 cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
2873 server->secMode, server->capabilities, server->timeAdj)); 2994 server->secMode, server->capabilities, server->timeAdj);
2874 2995
2875 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 2996 rc = CIFS_SessSetup(xid, ses, nls_info);
2876 if (rc) { 2997 if (rc) {
2877 cERROR(1, ("Send error in SessSetup = %d", rc)); 2998 cERROR(1, "Send error in SessSetup = %d", rc);
2878 } else { 2999 } else {
2879 cFYI(1, ("CIFS Session Established successfully")); 3000 cFYI(1, "CIFS Session Established successfully");
2880 spin_lock(&GlobalMid_Lock); 3001 spin_lock(&GlobalMid_Lock);
2881 pSesInfo->status = CifsGood; 3002 ses->status = CifsGood;
2882 pSesInfo->need_reconnect = false; 3003 ses->need_reconnect = false;
2883 spin_unlock(&GlobalMid_Lock); 3004 spin_unlock(&GlobalMid_Lock);
2884 } 3005 }
2885 3006
2886ss_err_exit:
2887 return rc; 3007 return rc;
2888} 3008}
2889 3009
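
Editorial note: the old cifs_setup_session() both negotiated the protocol and set up the SMB session, tracking "first_time" by hand. The new code splits these: cifs_negotiate_protocol() is idempotent per TCP connection, keyed off server->maxBuf, which stays zero until a NEGOTIATE succeeds. A condensed sketch of that guard (the tcpStatus update under GlobalMid_Lock is omitted here):

int negotiate_once_sketch(unsigned int xid, struct cifsSesInfo *ses)
{
	struct TCP_Server_Info *server = ses->server;
	int rc;

	if (server->maxBuf != 0)	/* this socket already negotiated */
		return 0;

	rc = CIFSSMBNegotiate(xid, ses);
	if (rc == -EAGAIN)		/* retry once on a 1st time connection */
		rc = CIFSSMBNegotiate(xid, ses);
	if (rc == -EAGAIN)
		rc = -EHOSTDOWN;
	return rc;			/* caller then runs session setup */
}
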
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e9f7ecc2714b..f9ed0751cc12 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -25,6 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/file.h>
28#include "cifsfs.h" 29#include "cifsfs.h"
29#include "cifspdu.h" 30#include "cifspdu.h"
30#include "cifsglob.h" 31#include "cifsglob.h"
@@ -73,7 +74,7 @@ cifs_bp_rename_retry:
73 namelen += (1 + temp->d_name.len); 74 namelen += (1 + temp->d_name.len);
74 temp = temp->d_parent; 75 temp = temp->d_parent;
75 if (temp == NULL) { 76 if (temp == NULL) {
76 cERROR(1, ("corrupt dentry")); 77 cERROR(1, "corrupt dentry");
77 return NULL; 78 return NULL;
78 } 79 }
79 } 80 }
@@ -90,19 +91,18 @@ cifs_bp_rename_retry:
90 full_path[namelen] = dirsep; 91 full_path[namelen] = dirsep;
91 strncpy(full_path + namelen + 1, temp->d_name.name, 92 strncpy(full_path + namelen + 1, temp->d_name.name,
92 temp->d_name.len); 93 temp->d_name.len);
93 cFYI(0, ("name: %s", full_path + namelen)); 94 cFYI(0, "name: %s", full_path + namelen);
94 } 95 }
95 temp = temp->d_parent; 96 temp = temp->d_parent;
96 if (temp == NULL) { 97 if (temp == NULL) {
97 cERROR(1, ("corrupt dentry")); 98 cERROR(1, "corrupt dentry");
98 kfree(full_path); 99 kfree(full_path);
99 return NULL; 100 return NULL;
100 } 101 }
101 } 102 }
102 if (namelen != pplen + dfsplen) { 103 if (namelen != pplen + dfsplen) {
103 cERROR(1, 104 cERROR(1, "did not end path lookup where expected namelen is %d",
104 ("did not end path lookup where expected namelen is %d", 105 namelen);
105 namelen));
106 /* presumably this is only possible if racing with a rename 106 /* presumably this is only possible if racing with a rename
107 of one of the parent directories (we can not lock the dentries 107 of one of the parent directories (we can not lock the dentries
108 above us to prevent this, but retrying should be harmless) */ 108 above us to prevent this, but retrying should be harmless) */
@@ -157,7 +157,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
157 mutex_init(&pCifsFile->lock_mutex); 157 mutex_init(&pCifsFile->lock_mutex);
158 INIT_LIST_HEAD(&pCifsFile->llist); 158 INIT_LIST_HEAD(&pCifsFile->llist);
159 atomic_set(&pCifsFile->count, 1); 159 atomic_set(&pCifsFile->count, 1);
160 slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops); 160 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
161 161
162 write_lock(&GlobalSMBSeslock); 162 write_lock(&GlobalSMBSeslock);
163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList); 163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
@@ -173,26 +173,28 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true; 174 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true; 175 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, ("Exclusive Oplock inode %p", newinode)); 176 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ) 177 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true; 178 pCifsInode->clientCanCacheRead = true;
179 } 179 }
180 write_unlock(&GlobalSMBSeslock); 180 write_unlock(&GlobalSMBSeslock);
181 181
182 file->private_data = pCifsFile;
183
182 return pCifsFile; 184 return pCifsFile;
183} 185}
184 186
185int cifs_posix_open(char *full_path, struct inode **pinode, 187int cifs_posix_open(char *full_path, struct inode **pinode,
186 struct vfsmount *mnt, int mode, int oflags, 188 struct super_block *sb, int mode, int oflags,
187 __u32 *poplock, __u16 *pnetfid, int xid) 189 __u32 *poplock, __u16 *pnetfid, int xid)
188{ 190{
189 int rc; 191 int rc;
190 FILE_UNIX_BASIC_INFO *presp_data; 192 FILE_UNIX_BASIC_INFO *presp_data;
191 __u32 posix_flags = 0; 193 __u32 posix_flags = 0;
192 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); 194 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
193 struct cifs_fattr fattr; 195 struct cifs_fattr fattr;
194 196
195 cFYI(1, ("posix open %s", full_path)); 197 cFYI(1, "posix open %s", full_path);
196 198
197 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 199 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
198 if (presp_data == NULL) 200 if (presp_data == NULL)
@@ -242,7 +244,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
242 244
243 /* get new inode and set it up */ 245 /* get new inode and set it up */
244 if (*pinode == NULL) { 246 if (*pinode == NULL) {
245 *pinode = cifs_iget(mnt->mnt_sb, &fattr); 247 cifs_fill_uniqueid(sb, &fattr);
248 *pinode = cifs_iget(sb, &fattr);
246 if (!*pinode) { 249 if (!*pinode) {
247 rc = -ENOMEM; 250 rc = -ENOMEM;
248 goto posix_open_ret; 251 goto posix_open_ret;
@@ -251,8 +254,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
251 cifs_fattr_to_inode(*pinode, &fattr); 254 cifs_fattr_to_inode(*pinode, &fattr);
252 } 255 }
253 256
254 cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
255
256posix_open_ret: 257posix_open_ret:
257 kfree(presp_data); 258 kfree(presp_data);
258 return rc; 259 return rc;
@@ -280,7 +281,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
280 int create_options = CREATE_NOT_DIR; 281 int create_options = CREATE_NOT_DIR;
281 __u32 oplock = 0; 282 __u32 oplock = 0;
282 int oflags; 283 int oflags;
283 bool posix_create = false;
284 /* 284 /*
285 * BB below access is probably too much for mknod to request 285 * BB below access is probably too much for mknod to request
286 * but we have to do query and setpathinfo so requesting 286 * but we have to do query and setpathinfo so requesting
@@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
305 full_path = build_path_from_dentry(direntry); 305 full_path = build_path_from_dentry(direntry);
306 if (full_path == NULL) { 306 if (full_path == NULL) {
307 rc = -ENOMEM; 307 rc = -ENOMEM;
308 FreeXid(xid); 308 goto cifs_create_out;
309 return rc;
310 } 309 }
311 310
312 if (oplockEnabled) 311 if (oplockEnabled)
@@ -315,20 +314,19 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
315 if (nd && (nd->flags & LOOKUP_OPEN)) 314 if (nd && (nd->flags & LOOKUP_OPEN))
316 oflags = nd->intent.open.flags; 315 oflags = nd->intent.open.flags;
317 else 316 else
318 oflags = FMODE_READ; 317 oflags = FMODE_READ | SMB_O_CREAT;
319 318
320 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 319 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
321 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 320 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
322 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 321 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
323 rc = cifs_posix_open(full_path, &newinode, nd->path.mnt, 322 rc = cifs_posix_open(full_path, &newinode,
324 mode, oflags, &oplock, &fileHandle, xid); 323 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
325 /* EIO could indicate that (posix open) operation is not 324 /* EIO could indicate that (posix open) operation is not
326 supported, despite what server claimed in capability 325 supported, despite what server claimed in capability
327 negotiation. EREMOTE indicates DFS junction, which is not 326 negotiation. EREMOTE indicates DFS junction, which is not
328 handled in posix open */ 327 handled in posix open */
329 328
330 if (rc == 0) { 329 if (rc == 0) {
331 posix_create = true;
332 if (newinode == NULL) /* query inode info */ 330 if (newinode == NULL) /* query inode info */
333 goto cifs_create_get_file_info; 331 goto cifs_create_get_file_info;
334 else /* success, no need to query */ 332 else /* success, no need to query */
@@ -358,7 +356,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
358 else if ((oflags & O_CREAT) == O_CREAT) 356 else if ((oflags & O_CREAT) == O_CREAT)
359 disposition = FILE_OPEN_IF; 357 disposition = FILE_OPEN_IF;
360 else 358 else
361 cFYI(1, ("Create flag not set in create function")); 359 cFYI(1, "Create flag not set in create function");
362 } 360 }
363 361
364 /* BB add processing to set equivalent of mode - e.g. via CreateX with 362 /* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -366,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
366 364
367 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 365 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
368 if (buf == NULL) { 366 if (buf == NULL) {
369 kfree(full_path); 367 rc = -ENOMEM;
370 FreeXid(xid); 368 goto cifs_create_out;
371 return -ENOMEM;
372 } 369 }
373 370
374 /* 371 /*
@@ -394,7 +391,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
394 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 391 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
395 } 392 }
396 if (rc) { 393 if (rc) {
397 cFYI(1, ("cifs_create returned 0x%x", rc)); 394 cFYI(1, "cifs_create returned 0x%x", rc);
398 goto cifs_create_out; 395 goto cifs_create_out;
399 } 396 }
400 397
@@ -457,16 +454,30 @@ cifs_create_set_dentry:
457 if (rc == 0) 454 if (rc == 0)
458 setup_cifs_dentry(tcon, direntry, newinode); 455 setup_cifs_dentry(tcon, direntry, newinode);
459 else 456 else
460 cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); 457 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
458
459 if (newinode && nd && (nd->flags & LOOKUP_OPEN)) {
460 struct cifsFileInfo *pfile_info;
461 struct file *filp;
462
463 filp = lookup_instantiate_filp(nd, direntry, generic_file_open);
464 if (IS_ERR(filp)) {
465 rc = PTR_ERR(filp);
466 CIFSSMBClose(xid, tcon, fileHandle);
467 goto cifs_create_out;
468 }
461 469
462 /* nfsd case - nfs srv does not set nd */ 470 pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
463 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 471 nd->path.mnt, oflags);
464 /* mknod case - do not leave file open */ 472 if (pfile_info == NULL) {
473 fput(filp);
474 CIFSSMBClose(xid, tcon, fileHandle);
475 rc = -ENOMEM;
476 }
477 } else {
465 CIFSSMBClose(xid, tcon, fileHandle); 478 CIFSSMBClose(xid, tcon, fileHandle);
466 } else if (!(posix_create) && (newinode)) {
467 cifs_new_fileinfo(newinode, fileHandle, NULL,
468 nd->path.mnt, oflags);
469 } 479 }
480
470cifs_create_out: 481cifs_create_out:
471 kfree(buf); 482 kfree(buf);
472 kfree(full_path); 483 kfree(full_path);
@@ -483,6 +494,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
483 struct cifsTconInfo *pTcon; 494 struct cifsTconInfo *pTcon;
484 char *full_path = NULL; 495 char *full_path = NULL;
485 struct inode *newinode = NULL; 496 struct inode *newinode = NULL;
497 int oplock = 0;
498 u16 fileHandle;
499 FILE_ALL_INFO *buf = NULL;
500 unsigned int bytes_written;
501 struct win_dev *pdev;
486 502
487 if (!old_valid_dev(device_number)) 503 if (!old_valid_dev(device_number))
488 return -EINVAL; 504 return -EINVAL;
@@ -493,9 +509,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
493 pTcon = cifs_sb->tcon; 509 pTcon = cifs_sb->tcon;
494 510
495 full_path = build_path_from_dentry(direntry); 511 full_path = build_path_from_dentry(direntry);
496 if (full_path == NULL) 512 if (full_path == NULL) {
497 rc = -ENOMEM; 513 rc = -ENOMEM;
498 else if (pTcon->unix_ext) { 514 goto mknod_out;
515 }
516
517 if (pTcon->unix_ext) {
499 struct cifs_unix_set_info_args args = { 518 struct cifs_unix_set_info_args args = {
500 .mode = mode & ~current_umask(), 519 .mode = mode & ~current_umask(),
501 .ctime = NO_CHANGE_64, 520 .ctime = NO_CHANGE_64,
@@ -514,87 +533,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
514 cifs_sb->local_nls, 533 cifs_sb->local_nls,
515 cifs_sb->mnt_cifs_flags & 534 cifs_sb->mnt_cifs_flags &
516 CIFS_MOUNT_MAP_SPECIAL_CHR); 535 CIFS_MOUNT_MAP_SPECIAL_CHR);
536 if (rc)
537 goto mknod_out;
517 538
518 if (!rc) { 539 rc = cifs_get_inode_info_unix(&newinode, full_path,
519 rc = cifs_get_inode_info_unix(&newinode, full_path,
520 inode->i_sb, xid); 540 inode->i_sb, xid);
521 if (pTcon->nocase) 541 if (pTcon->nocase)
522 direntry->d_op = &cifs_ci_dentry_ops; 542 direntry->d_op = &cifs_ci_dentry_ops;
523 else 543 else
524 direntry->d_op = &cifs_dentry_ops; 544 direntry->d_op = &cifs_dentry_ops;
525 if (rc == 0)
526 d_instantiate(direntry, newinode);
527 }
528 } else {
529 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
530 int oplock = 0;
531 u16 fileHandle;
532 FILE_ALL_INFO *buf;
533 545
534 cFYI(1, ("sfu compat create special file")); 546 if (rc == 0)
547 d_instantiate(direntry, newinode);
548 goto mknod_out;
549 }
535 550
536 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 551 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
537 if (buf == NULL) { 552 goto mknod_out;
538 kfree(full_path);
539 rc = -ENOMEM;
540 FreeXid(xid);
541 return rc;
542 }
543 553
544 rc = CIFSSMBOpen(xid, pTcon, full_path, 554
545 FILE_CREATE, /* fail if exists */ 555 cFYI(1, "sfu compat create special file");
546 GENERIC_WRITE /* BB would 556
547 WRITE_OWNER | WRITE_DAC be better? */, 557 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
548 /* Create a file and set the 558 if (buf == NULL) {
549 file attribute to SYSTEM */ 559 kfree(full_path);
550 CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, 560 rc = -ENOMEM;
551 &fileHandle, &oplock, buf, 561 FreeXid(xid);
552 cifs_sb->local_nls, 562 return rc;
553 cifs_sb->mnt_cifs_flags &
554 CIFS_MOUNT_MAP_SPECIAL_CHR);
555
556 /* BB FIXME - add handling for backlevel servers
557 which need legacy open and check for all
558 calls to SMBOpen for fallback to SMBLegacyOpen */
559 if (!rc) {
560 /* BB Do not bother to decode buf since no
561 local inode yet to put timestamps in,
562 but we can reuse it safely */
563 unsigned int bytes_written;
564 struct win_dev *pdev;
565 pdev = (struct win_dev *)buf;
566 if (S_ISCHR(mode)) {
567 memcpy(pdev->type, "IntxCHR", 8);
568 pdev->major =
569 cpu_to_le64(MAJOR(device_number));
570 pdev->minor =
571 cpu_to_le64(MINOR(device_number));
572 rc = CIFSSMBWrite(xid, pTcon,
573 fileHandle,
574 sizeof(struct win_dev),
575 0, &bytes_written, (char *)pdev,
576 NULL, 0);
577 } else if (S_ISBLK(mode)) {
578 memcpy(pdev->type, "IntxBLK", 8);
579 pdev->major =
580 cpu_to_le64(MAJOR(device_number));
581 pdev->minor =
582 cpu_to_le64(MINOR(device_number));
583 rc = CIFSSMBWrite(xid, pTcon,
584 fileHandle,
585 sizeof(struct win_dev),
586 0, &bytes_written, (char *)pdev,
587 NULL, 0);
588 } /* else if(S_ISFIFO */
589 CIFSSMBClose(xid, pTcon, fileHandle);
590 d_drop(direntry);
591 }
592 kfree(buf);
593 /* add code here to set EAs */
594 }
595 } 563 }
596 564
565 /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
566 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
567 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
568 &fileHandle, &oplock, buf, cifs_sb->local_nls,
569 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
570 if (rc)
571 goto mknod_out;
572
573 /* BB Do not bother to decode buf since no local inode yet to put
574 * timestamps in, but we can reuse it safely */
575
576 pdev = (struct win_dev *)buf;
577 if (S_ISCHR(mode)) {
578 memcpy(pdev->type, "IntxCHR", 8);
579 pdev->major =
580 cpu_to_le64(MAJOR(device_number));
581 pdev->minor =
582 cpu_to_le64(MINOR(device_number));
583 rc = CIFSSMBWrite(xid, pTcon,
584 fileHandle,
585 sizeof(struct win_dev),
586 0, &bytes_written, (char *)pdev,
587 NULL, 0);
588 } else if (S_ISBLK(mode)) {
589 memcpy(pdev->type, "IntxBLK", 8);
590 pdev->major =
591 cpu_to_le64(MAJOR(device_number));
592 pdev->minor =
593 cpu_to_le64(MINOR(device_number));
594 rc = CIFSSMBWrite(xid, pTcon,
595 fileHandle,
596 sizeof(struct win_dev),
597 0, &bytes_written, (char *)pdev,
598 NULL, 0);
599 } /* else if (S_ISFIFO) */
600 CIFSSMBClose(xid, pTcon, fileHandle);
601 d_drop(direntry);
602
603 /* FIXME: add code here to set EAs */
604
605mknod_out:
597 kfree(full_path); 606 kfree(full_path);
607 kfree(buf);
598 FreeXid(xid); 608 FreeXid(xid);
599 return rc; 609 return rc;
600} 610}
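
Editorial note: the cifs_mknod() rewrite above flattens three nesting levels into early "goto mknod_out" exits with a single cleanup block. Because kfree(NULL) is a no-op, pointers initialised to NULL can be freed unconditionally at the label. The shape of the idiom, with a hypothetical allocator standing in for build_path_from_dentry():

#include <linux/slab.h>

int single_exit_sketch(void)
{
	char *full_path = NULL;		/* NULL so kfree() is always safe */
	void *buf = NULL;
	int rc = 0;

	full_path = alloc_path();	/* hypothetical allocator */
	if (full_path == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	buf = kmalloc(64, GFP_KERNEL);
	if (buf == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	/* ... device-node creation work ... */
out:
	kfree(full_path);		/* kfree(NULL) is a no-op */
	kfree(buf);
	return rc;
}
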
@@ -610,14 +620,15 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
610 bool posix_open = false; 620 bool posix_open = false;
611 struct cifs_sb_info *cifs_sb; 621 struct cifs_sb_info *cifs_sb;
612 struct cifsTconInfo *pTcon; 622 struct cifsTconInfo *pTcon;
623 struct cifsFileInfo *cfile;
613 struct inode *newInode = NULL; 624 struct inode *newInode = NULL;
614 char *full_path = NULL; 625 char *full_path = NULL;
615 struct file *filp; 626 struct file *filp;
616 627
617 xid = GetXid(); 628 xid = GetXid();
618 629
619 cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p", 630 cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
620 parent_dir_inode, direntry->d_name.name, direntry)); 631 parent_dir_inode, direntry->d_name.name, direntry);
621 632
622 /* check whether path exists */ 633 /* check whether path exists */
623 634
@@ -632,7 +643,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 int i; 643 int i;
633 for (i = 0; i < direntry->d_name.len; i++) 644 for (i = 0; i < direntry->d_name.len; i++)
634 if (direntry->d_name.name[i] == '\\') { 645 if (direntry->d_name.name[i] == '\\') {
635 cFYI(1, ("Invalid file name")); 646 cFYI(1, "Invalid file name");
636 FreeXid(xid); 647 FreeXid(xid);
637 return ERR_PTR(-EINVAL); 648 return ERR_PTR(-EINVAL);
638 } 649 }
@@ -657,11 +668,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
657 } 668 }
658 669
659 if (direntry->d_inode != NULL) { 670 if (direntry->d_inode != NULL) {
660 cFYI(1, ("non-NULL inode in lookup")); 671 cFYI(1, "non-NULL inode in lookup");
661 } else { 672 } else {
662 cFYI(1, ("NULL inode in lookup")); 673 cFYI(1, "NULL inode in lookup");
663 } 674 }
664 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 675 cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
665 676
666 /* Posix open is only called (at lookup time) for file create now. 677 /* Posix open is only called (at lookup time) for file create now.
667 * For opens (rather than creates), because we do not know if it 678 * For opens (rather than creates), because we do not know if it
@@ -677,7 +688,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
677 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 688 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
678 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 689 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
679 (nd->intent.open.flags & O_CREAT)) { 690 (nd->intent.open.flags & O_CREAT)) {
680 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, 691 rc = cifs_posix_open(full_path, &newInode,
692 parent_dir_inode->i_sb,
681 nd->intent.open.create_mode, 693 nd->intent.open.create_mode,
682 nd->intent.open.flags, &oplock, 694 nd->intent.open.flags, &oplock,
683 &fileHandle, xid); 695 &fileHandle, xid);
@@ -706,8 +718,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
706 else 718 else
707 direntry->d_op = &cifs_dentry_ops; 719 direntry->d_op = &cifs_dentry_ops;
708 d_add(direntry, newInode); 720 d_add(direntry, newInode);
709 if (posix_open) 721 if (posix_open) {
710 filp = lookup_instantiate_filp(nd, direntry, NULL); 722 filp = lookup_instantiate_filp(nd, direntry,
723 generic_file_open);
724 if (IS_ERR(filp)) {
725 rc = PTR_ERR(filp);
726 CIFSSMBClose(xid, pTcon, fileHandle);
727 goto lookup_out;
728 }
729
730 cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
731 nd->path.mnt,
732 nd->intent.open.flags);
733 if (cfile == NULL) {
734 fput(filp);
735 CIFSSMBClose(xid, pTcon, fileHandle);
736 rc = -ENOMEM;
737 goto lookup_out;
738 }
739 }
711 /* since paths are not looked up by component - the parent 740 /* since paths are not looked up by component - the parent
712 directories are presumed to be good here */ 741 directories are presumed to be good here */
713 renew_parental_timestamps(direntry); 742 renew_parental_timestamps(direntry);
@@ -723,11 +752,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
723 /* if it was once a directory (but how can we tell?) we could do 752 /* if it was once a directory (but how can we tell?) we could do
724 shrink_dcache_parent(direntry); */ 753 shrink_dcache_parent(direntry); */
725 } else if (rc != -EACCES) { 754 } else if (rc != -EACCES) {
726 cERROR(1, ("Unexpected lookup error %d", rc)); 755 cERROR(1, "Unexpected lookup error %d", rc);
727 /* We special case check for Access Denied - since that 756 /* We special case check for Access Denied - since that
728 is a common return code */ 757 is a common return code */
729 } 758 }
730 759
760lookup_out:
731 kfree(full_path); 761 kfree(full_path);
732 FreeXid(xid); 762 FreeXid(xid);
733 return ERR_PTR(rc); 763 return ERR_PTR(rc);
@@ -742,8 +772,8 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
742 if (cifs_revalidate_dentry(direntry)) 772 if (cifs_revalidate_dentry(direntry))
743 return 0; 773 return 0;
744 } else { 774 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 775 cFYI(1, "neg dentry 0x%p name = %s",
746 direntry, direntry->d_name.name)); 776 direntry, direntry->d_name.name);
747 if (time_after(jiffies, direntry->d_time + HZ) || 777 if (time_after(jiffies, direntry->d_time + HZ) ||
748 !lookupCacheEnabled) { 778 !lookupCacheEnabled) {
749 d_drop(direntry); 779 d_drop(direntry);
@@ -758,7 +788,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
758{ 788{
759 int rc = 0; 789 int rc = 0;
760 790
761 cFYI(1, ("In cifs d_delete, name = %s", direntry->d_name.name)); 791 cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
762 792
763 return rc; 793 return rc;
764} */ 794} */
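
Editorial note: both cifs_create() and cifs_lookup() now materialise the struct file at lookup time via lookup_instantiate_filp() and then attach a cifsFileInfo; on failure, each step releases exactly what was acquired before it. The unwind ordering, condensed from the hunks above (names follow the diff; this is not a drop-in function):

	filp = lookup_instantiate_filp(nd, direntry, generic_file_open);
	if (IS_ERR(filp)) {
		rc = PTR_ERR(filp);
		CIFSSMBClose(xid, tcon, fileHandle); /* only the server open exists */
		goto out;
	}

	cfile = cifs_new_fileinfo(newinode, fileHandle, filp,
				  nd->path.mnt, oflags);
	if (cfile == NULL) {
		fput(filp);				/* drop the struct file first */
		CIFSSMBClose(xid, tcon, fileHandle);	/* then the server handle */
		rc = -ENOMEM;
		goto out;
	}
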
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 6f8a0e3fb25b..0eb87026cad3 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -4,6 +4,8 @@
4 * Copyright (c) 2007 Igor Mammedov 4 * Copyright (c) 2007 Igor Mammedov
5 * Author(s): Igor Mammedov (niallain@gmail.com) 5 * Author(s): Igor Mammedov (niallain@gmail.com)
6 * Steve French (sfrench@us.ibm.com) 6 * Steve French (sfrench@us.ibm.com)
7 * Wang Lei (wang840925@gmail.com)
8 * David Howells (dhowells@redhat.com)
7 * 9 *
8 * Contains the CIFS DFS upcall routines used for hostname to 10 * Contains the CIFS DFS upcall routines used for hostname to
9 * IP address translation. 11 * IP address translation.
@@ -24,145 +26,73 @@
24 */ 26 */
25 27
26#include <linux/slab.h> 28#include <linux/slab.h>
27#include <keys/user-type.h> 29#include <linux/dns_resolver.h>
28#include "dns_resolve.h" 30#include "dns_resolve.h"
29#include "cifsglob.h" 31#include "cifsglob.h"
30#include "cifsproto.h" 32#include "cifsproto.h"
31#include "cifs_debug.h" 33#include "cifs_debug.h"
32 34
33/* Checks if supplied name is IP address 35/**
34 * returns: 36 * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
35 * 1 - name is IP 37 * @unc: UNC path specifying the server
36 * 0 - name is not IP 38 * @ip_addr: Where to return the IP address.
37 */ 39 *
38static int 40 * The IP address will be returned in string form, and the caller is
39is_ip(char *name) 41 * responsible for freeing it.
40{ 42 *
41 struct sockaddr_storage ss; 43 * Returns length of result on success, -ve on error.
42
43 return cifs_convert_address(name, &ss);
44}
45
46static int
47dns_resolver_instantiate(struct key *key, const void *data,
48 size_t datalen)
49{
50 int rc = 0;
51 char *ip;
52
53 ip = kmalloc(datalen + 1, GFP_KERNEL);
54 if (!ip)
55 return -ENOMEM;
56
57 memcpy(ip, data, datalen);
58 ip[datalen] = '\0';
59
60 /* make sure this looks like an address */
61 if (!is_ip(ip)) {
62 kfree(ip);
63 return -EINVAL;
64 }
65
66 key->type_data.x[0] = datalen;
67 key->payload.data = ip;
68
69 return rc;
70}
71
72static void
73dns_resolver_destroy(struct key *key)
74{
75 kfree(key->payload.data);
76}
77
78struct key_type key_type_dns_resolver = {
79 .name = "dns_resolver",
80 .def_datalen = sizeof(struct in_addr),
81 .describe = user_describe,
82 .instantiate = dns_resolver_instantiate,
83 .destroy = dns_resolver_destroy,
84 .match = user_match,
85};
86
87/* Resolves server name to ip address.
88 * input:
89 * unc - server UNC
90 * output:
91 * *ip_addr - pointer to server ip, caller responcible for freeing it.
92 * return 0 on success
93 */ 44 */
94int 45int
95dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) 46dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
96{ 47{
97 int rc = -EAGAIN; 48 struct sockaddr_storage ss;
98 struct key *rkey = ERR_PTR(-EAGAIN); 49 const char *hostname, *sep;
99 char *name; 50 char *name;
100 char *data = NULL; 51 int len, rc;
101 int len;
102 52
103 if (!ip_addr || !unc) 53 if (!ip_addr || !unc)
104 return -EINVAL; 54 return -EINVAL;
105 55
106 /* search for server name delimiter */
107 len = strlen(unc); 56 len = strlen(unc);
108 if (len < 3) { 57 if (len < 3) {
109 cFYI(1, ("%s: unc is too short: %s", __func__, unc)); 58 cFYI(1, "%s: unc is too short: %s", __func__, unc);
110 return -EINVAL; 59 return -EINVAL;
111 } 60 }
61
62 /* Discount leading slashes for cifs */
112 len -= 2; 63 len -= 2;
113 name = memchr(unc+2, '\\', len); 64 hostname = unc + 2;
114 if (!name) { 65
115 cFYI(1, ("%s: probably server name is whole unc: %s", 66 /* Search for server name delimiter */
116 __func__, unc)); 67 sep = memchr(hostname, '\\', len);
117 } else { 68 if (sep)
118 len = (name - unc) - 2/* leading // */; 69 len = sep - unc;
119 } 70 else
71 cFYI(1, "%s: probably server name is whole unc: %s",
72 __func__, unc);
73
74 /* Try to interpret hostname as an IPv4 or IPv6 address */
75 rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
76 if (rc > 0)
77 goto name_is_IP_address;
78
79 /* Perform the upcall */
80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
81 if (rc < 0)
82 cERROR(1, "%s: unable to resolve: %*.*s",
83 __func__, len, len, hostname);
84 else
85 cFYI(1, "%s: resolved: %*.*s to %s",
86 __func__, len, len, hostname, *ip_addr);
87 return rc;
120 88
121 name = kmalloc(len+1, GFP_KERNEL); 89name_is_IP_address:
122 if (!name) { 90 name = kmalloc(len + 1, GFP_KERNEL);
123 rc = -ENOMEM; 91 if (!name)
124 return rc; 92 return -ENOMEM;
125 } 93 memcpy(name, hostname, len);
126 memcpy(name, unc+2, len);
127 name[len] = 0; 94 name[len] = 0;
128 95 cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
129 if (is_ip(name)) { 96 *ip_addr = name;
130 cFYI(1, ("%s: it is IP, skipping dns upcall: %s", 97 return 0;
131 __func__, name));
132 data = name;
133 goto skip_upcall;
134 }
135
136 rkey = request_key(&key_type_dns_resolver, name, "");
137 if (!IS_ERR(rkey)) {
138 len = rkey->type_data.x[0];
139 data = rkey->payload.data;
140 } else {
141 cERROR(1, ("%s: unable to resolve: %s", __func__, name));
142 goto out;
143 }
144
145skip_upcall:
146 if (data) {
147 *ip_addr = kmalloc(len + 1, GFP_KERNEL);
148 if (*ip_addr) {
149 memcpy(*ip_addr, data, len + 1);
150 if (!IS_ERR(rkey))
151 cFYI(1, ("%s: resolved: %s to %s", __func__,
152 name,
153 *ip_addr
154 ));
155 rc = 0;
156 } else {
157 rc = -ENOMEM;
158 }
159 if (!IS_ERR(rkey))
160 key_put(rkey);
161 }
162
163out:
164 kfree(name);
165 return rc;
166} 98}
167
168
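
Editorial note: the rewrite above drops the private dns_resolver key type in favour of the kernel's shared resolver upcall, dns_query() from <linux/dns_resolver.h>. As in the new call in the diff, a NULL type requests a plain hostname-to-address lookup; the result is a kmalloc'd string the caller must free, and the return value is its length. A hedged usage sketch (hostname and length are illustrative):

#include <linux/dns_resolver.h>
#include <linux/slab.h>

	char *ip = NULL;
	int rc;

	rc = dns_query(NULL, "server.example.com", 18, NULL, &ip, NULL);
	if (rc < 0)
		return rc;	/* upcall failed or the name did not resolve */

	/* rc is the length of the address string in ip */
	kfree(ip);
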
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 966e9288930b..d3f5d27f4d06 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,6 @@
24#define _DNS_RESOLVE_H 24#define _DNS_RESOLVE_H
25 25
26#ifdef __KERNEL__ 26#ifdef __KERNEL__
27#include <linux/key-type.h>
28extern struct key_type key_type_dns_resolver;
29extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); 27extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
30#endif /* KERNEL */ 28#endif /* KERNEL */
31 29
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 6177f7cca16a..993f82045bf6 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
52 cFYI(1, ("get parent for %p", dentry)); 52 cFYI(1, "get parent for %p", dentry);
53 return ERR_PTR(-EACCES); 53 return ERR_PTR(-EACCES);
54} 54}
55 55
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9b11a8f56f3a..de748c652d11 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with files 4 * vfs operations that deal with files
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2007 6 * Copyright (C) International Business Machines Corp., 2002,2010
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * Jeremy Allison (jra@samba.org) 8 * Jeremy Allison (jra@samba.org)
9 * 9 *
@@ -40,6 +40,7 @@
40#include "cifs_unicode.h" 40#include "cifs_unicode.h"
41#include "cifs_debug.h" 41#include "cifs_debug.h"
42#include "cifs_fs_sb.h" 42#include "cifs_fs_sb.h"
43#include "fscache.h"
43 44
44static inline int cifs_convert_flags(unsigned int flags) 45static inline int cifs_convert_flags(unsigned int flags)
45{ 46{
@@ -108,8 +109,7 @@ static inline int cifs_get_disposition(unsigned int flags)
108/* all arguments to this function must be checked for validity in caller */ 109/* all arguments to this function must be checked for validity in caller */
109static inline int 110static inline int
110cifs_posix_open_inode_helper(struct inode *inode, struct file *file, 111cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
111 struct cifsInodeInfo *pCifsInode, 112 struct cifsInodeInfo *pCifsInode, __u32 oplock,
112 struct cifsFileInfo *pCifsFile, __u32 oplock,
113 u16 netfid) 113 u16 netfid)
114{ 114{
115 115
@@ -136,15 +136,15 @@ cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
136 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 136 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
137 (file->f_path.dentry->d_inode->i_size == 137 (file->f_path.dentry->d_inode->i_size ==
138 (loff_t)le64_to_cpu(buf->EndOfFile))) { 138 (loff_t)le64_to_cpu(buf->EndOfFile))) {
139 cFYI(1, ("inode unchanged on server")); 139 cFYI(1, "inode unchanged on server");
140 } else { 140 } else {
141 if (file->f_path.dentry->d_inode->i_mapping) { 141 if (file->f_path.dentry->d_inode->i_mapping) {
142 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 142 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
143 if (rc != 0) 143 if (rc != 0)
144 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 144 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
145 } 145 }
146 cFYI(1, ("invalidating remote inode since open detected it " 146 cFYI(1, "invalidating remote inode since open detected it "
147 "changed")); 147 "changed");
148 invalidate_remote_inode(file->f_path.dentry->d_inode); 148 invalidate_remote_inode(file->f_path.dentry->d_inode);
149 } */ 149 } */
150 150
@@ -152,8 +152,8 @@ psx_client_can_cache:
152 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 152 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
153 pCifsInode->clientCanCacheAll = true; 153 pCifsInode->clientCanCacheAll = true;
154 pCifsInode->clientCanCacheRead = true; 154 pCifsInode->clientCanCacheRead = true;
155 cFYI(1, ("Exclusive Oplock granted on inode %p", 155 cFYI(1, "Exclusive Oplock granted on inode %p",
156 file->f_path.dentry->d_inode)); 156 file->f_path.dentry->d_inode);
157 } else if ((oplock & 0xF) == OPLOCK_READ) 157 } else if ((oplock & 0xF) == OPLOCK_READ)
158 pCifsInode->clientCanCacheRead = true; 158 pCifsInode->clientCanCacheRead = true;
159 159
@@ -163,44 +163,12 @@ psx_client_can_cache:
163 return 0; 163 return 0;
164} 164}
165 165
166static struct cifsFileInfo *
167cifs_fill_filedata(struct file *file)
168{
169 struct list_head *tmp;
170 struct cifsFileInfo *pCifsFile = NULL;
171 struct cifsInodeInfo *pCifsInode = NULL;
172
173 /* search inode for this file and fill in file->private_data */
174 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
175 read_lock(&GlobalSMBSeslock);
176 list_for_each(tmp, &pCifsInode->openFileList) {
177 pCifsFile = list_entry(tmp, struct cifsFileInfo, flist);
178 if ((pCifsFile->pfile == NULL) &&
179 (pCifsFile->pid == current->tgid)) {
180 /* mode set in cifs_create */
181
182 /* needed for writepage */
183 pCifsFile->pfile = file;
184 file->private_data = pCifsFile;
185 break;
186 }
187 }
188 read_unlock(&GlobalSMBSeslock);
189
190 if (file->private_data != NULL) {
191 return pCifsFile;
192 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
193 cERROR(1, ("could not find file instance for "
194 "new file %p", file));
195 return NULL;
196}
197
198/* all arguments to this function must be checked for validity in caller */ 166/* all arguments to this function must be checked for validity in caller */
199static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, 167static inline int cifs_open_inode_helper(struct inode *inode,
200 struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
201 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, 168 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
202 char *full_path, int xid) 169 char *full_path, int xid)
203{ 170{
171 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
204 struct timespec temp; 172 struct timespec temp;
205 int rc; 173 int rc;
206 174
@@ -214,36 +182,35 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
214 /* if not oplocked, invalidate inode pages if mtime or file 182 /* if not oplocked, invalidate inode pages if mtime or file
215 size changed */ 183 size changed */
216 temp = cifs_NTtimeToUnix(buf->LastWriteTime); 184 temp = cifs_NTtimeToUnix(buf->LastWriteTime);
217 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 185 if (timespec_equal(&inode->i_mtime, &temp) &&
218 (file->f_path.dentry->d_inode->i_size == 186 (inode->i_size ==
219 (loff_t)le64_to_cpu(buf->EndOfFile))) { 187 (loff_t)le64_to_cpu(buf->EndOfFile))) {
220 cFYI(1, ("inode unchanged on server")); 188 cFYI(1, "inode unchanged on server");
221 } else { 189 } else {
222 if (file->f_path.dentry->d_inode->i_mapping) { 190 if (inode->i_mapping) {
223 /* BB no need to lock inode until after invalidate 191 /* BB no need to lock inode until after invalidate
224 since namei code should already have it locked? */ 192 since namei code should already have it locked? */
225 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 193 rc = filemap_write_and_wait(inode->i_mapping);
226 if (rc != 0) 194 if (rc != 0)
227 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 195 pCifsInode->write_behind_rc = rc;
228 } 196 }
229 cFYI(1, ("invalidating remote inode since open detected it " 197 cFYI(1, "invalidating remote inode since open detected it "
230 "changed")); 198 "changed");
231 invalidate_remote_inode(file->f_path.dentry->d_inode); 199 invalidate_remote_inode(inode);
232 } 200 }
233 201
234client_can_cache: 202client_can_cache:
235 if (pTcon->unix_ext) 203 if (pTcon->unix_ext)
236 rc = cifs_get_inode_info_unix(&file->f_path.dentry->d_inode, 204 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
237 full_path, inode->i_sb, xid); 205 xid);
238 else 206 else
239 rc = cifs_get_inode_info(&file->f_path.dentry->d_inode, 207 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
240 full_path, buf, inode->i_sb, xid, NULL); 208 xid, NULL);
241 209
242 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { 210 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
243 pCifsInode->clientCanCacheAll = true; 211 pCifsInode->clientCanCacheAll = true;
244 pCifsInode->clientCanCacheRead = true; 212 pCifsInode->clientCanCacheRead = true;
245 cFYI(1, ("Exclusive Oplock granted on inode %p", 213 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
246 file->f_path.dentry->d_inode));
247 } else if ((*oplock & 0xF) == OPLOCK_READ) 214 } else if ((*oplock & 0xF) == OPLOCK_READ)
248 pCifsInode->clientCanCacheRead = true; 215 pCifsInode->clientCanCacheRead = true;
249 216
@@ -257,7 +224,7 @@ int cifs_open(struct inode *inode, struct file *file)
257 __u32 oplock; 224 __u32 oplock;
258 struct cifs_sb_info *cifs_sb; 225 struct cifs_sb_info *cifs_sb;
259 struct cifsTconInfo *tcon; 226 struct cifsTconInfo *tcon;
260 struct cifsFileInfo *pCifsFile; 227 struct cifsFileInfo *pCifsFile = NULL;
261 struct cifsInodeInfo *pCifsInode; 228 struct cifsInodeInfo *pCifsInode;
262 char *full_path = NULL; 229 char *full_path = NULL;
263 int desiredAccess; 230 int desiredAccess;
@@ -271,22 +238,15 @@ int cifs_open(struct inode *inode, struct file *file)
271 tcon = cifs_sb->tcon; 238 tcon = cifs_sb->tcon;
272 239
273 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 240 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
274 pCifsFile = cifs_fill_filedata(file);
275 if (pCifsFile) {
276 rc = 0;
277 FreeXid(xid);
278 return rc;
279 }
280 241
281 full_path = build_path_from_dentry(file->f_path.dentry); 242 full_path = build_path_from_dentry(file->f_path.dentry);
282 if (full_path == NULL) { 243 if (full_path == NULL) {
283 rc = -ENOMEM; 244 rc = -ENOMEM;
284 FreeXid(xid); 245 goto out;
285 return rc;
286 } 246 }
287 247
288 cFYI(1, ("inode = 0x%p file flags are 0x%x for %s", 248 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
289 inode, file->f_flags, full_path)); 249 inode, file->f_flags, full_path);
290 250
291 if (oplockEnabled) 251 if (oplockEnabled)
292 oplock = REQ_OPLOCK; 252 oplock = REQ_OPLOCK;
@@ -298,27 +258,42 @@ int cifs_open(struct inode *inode, struct file *file)
298 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 258 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
299 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 259 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
300 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 260 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
261 oflags |= SMB_O_CREAT;
301 /* can not refresh inode info since size could be stale */ 262 /* can not refresh inode info since size could be stale */
302 rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, 263 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
303 cifs_sb->mnt_file_mode /* ignored */, 264 cifs_sb->mnt_file_mode /* ignored */,
304 oflags, &oplock, &netfid, xid); 265 oflags, &oplock, &netfid, xid);
305 if (rc == 0) { 266 if (rc == 0) {
306 cFYI(1, ("posix open succeeded")); 267 cFYI(1, "posix open succeeded");
307 /* no need for special case handling of setting mode 268 /* no need for special case handling of setting mode
308 on read only files needed here */ 269 on read only files needed here */
309 270
310 pCifsFile = cifs_fill_filedata(file); 271 rc = cifs_posix_open_inode_helper(inode, file,
311 cifs_posix_open_inode_helper(inode, file, pCifsInode, 272 pCifsInode, oplock, netfid);
312 pCifsFile, oplock, netfid); 273 if (rc != 0) {
274 CIFSSMBClose(xid, tcon, netfid);
275 goto out;
276 }
277
278 pCifsFile = cifs_new_fileinfo(inode, netfid, file,
279 file->f_path.mnt,
280 oflags);
281 if (pCifsFile == NULL) {
282 CIFSSMBClose(xid, tcon, netfid);
283 rc = -ENOMEM;
284 }
285
286 cifs_fscache_set_inode_cookie(inode, file);
287
313 goto out; 288 goto out;
314 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 289 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
315 if (tcon->ses->serverNOS) 290 if (tcon->ses->serverNOS)
316 cERROR(1, ("server %s of type %s returned" 291 cERROR(1, "server %s of type %s returned"
317 " unexpected error on SMB posix open" 292 " unexpected error on SMB posix open"
318 ", disabling posix open support." 293 ", disabling posix open support."
319 " Check if server update available.", 294 " Check if server update available.",
320 tcon->ses->serverName, 295 tcon->ses->serverName,
321 tcon->ses->serverNOS)); 296 tcon->ses->serverNOS);
322 tcon->broken_posix_open = true; 297 tcon->broken_posix_open = true;
323 } else if ((rc != -EIO) && (rc != -EREMOTE) && 298 } else if ((rc != -EIO) && (rc != -EREMOTE) &&
324 (rc != -EOPNOTSUPP)) /* path not found or net err */ 299 (rc != -EOPNOTSUPP)) /* path not found or net err */
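
Editorial note: cifs_fill_filedata() could go away because cifs_new_fileinfo() now sets file->private_data itself (see the dir.c hunk earlier), so cifs_open() simply builds the fileinfo and, new in this release, attaches an FS-Cache cookie via the fscache.h hook included at the top of file.c. The successful posix-open tail, condensed from the hunk above:

	pCifsFile = cifs_new_fileinfo(inode, netfid, file,
				      file->f_path.mnt, oflags);
	if (pCifsFile == NULL) {
		CIFSSMBClose(xid, tcon, netfid);	/* undo the server open */
		rc = -ENOMEM;
	}

	cifs_fscache_set_inode_cookie(inode, file);	/* new fscache hook */
	goto out;
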
@@ -386,20 +361,22 @@ int cifs_open(struct inode *inode, struct file *file)
386 & CIFS_MOUNT_MAP_SPECIAL_CHR); 361 & CIFS_MOUNT_MAP_SPECIAL_CHR);
387 } 362 }
388 if (rc) { 363 if (rc) {
389 cFYI(1, ("cifs_open returned 0x%x", rc)); 364 cFYI(1, "cifs_open returned 0x%x", rc);
390 goto out; 365 goto out;
391 } 366 }
392 367
368 rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid);
369 if (rc != 0)
370 goto out;
371
393 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, 372 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
394 file->f_flags); 373 file->f_flags);
395 file->private_data = pCifsFile; 374 if (pCifsFile == NULL) {
396 if (file->private_data == NULL) {
397 rc = -ENOMEM; 375 rc = -ENOMEM;
398 goto out; 376 goto out;
399 } 377 }
400 378
401 rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon, 379 cifs_fscache_set_inode_cookie(inode, file);
402 &oplock, buf, full_path, xid);
403 380
404 if (oplock & CIFS_CREATE_ACTION) { 381 if (oplock & CIFS_CREATE_ACTION) {
405 /* time to set mode which we can not set earlier due to 382 /* time to set mode which we can not set earlier due to
@@ -455,7 +432,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
455 __u16 netfid; 432 __u16 netfid;
456 433
457 if (file->private_data) 434 if (file->private_data)
458 pCifsFile = (struct cifsFileInfo *)file->private_data; 435 pCifsFile = file->private_data;
459 else 436 else
460 return -EBADF; 437 return -EBADF;
461 438
@@ -469,7 +446,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
469 } 446 }
470 447
471 if (file->f_path.dentry == NULL) { 448 if (file->f_path.dentry == NULL) {
472 cERROR(1, ("no valid name if dentry freed")); 449 cERROR(1, "no valid name if dentry freed");
473 dump_stack(); 450 dump_stack();
474 rc = -EBADF; 451 rc = -EBADF;
475 goto reopen_error_exit; 452 goto reopen_error_exit;
@@ -477,7 +454,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
477 454
478 inode = file->f_path.dentry->d_inode; 455 inode = file->f_path.dentry->d_inode;
479 if (inode == NULL) { 456 if (inode == NULL) {
480 cERROR(1, ("inode not valid")); 457 cERROR(1, "inode not valid");
481 dump_stack(); 458 dump_stack();
482 rc = -EBADF; 459 rc = -EBADF;
483 goto reopen_error_exit; 460 goto reopen_error_exit;
@@ -499,8 +476,8 @@ reopen_error_exit:
499 return rc; 476 return rc;
500 } 477 }
501 478
502 cFYI(1, ("inode = 0x%p file flags 0x%x for %s", 479 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
503 inode, file->f_flags, full_path)); 480 inode, file->f_flags, full_path);
504 481
505 if (oplockEnabled) 482 if (oplockEnabled)
506 oplock = REQ_OPLOCK; 483 oplock = REQ_OPLOCK;
@@ -512,11 +489,11 @@ reopen_error_exit:
512 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 489 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
513 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 490 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
514 /* can not refresh inode info since size could be stale */ 491 /* can not refresh inode info since size could be stale */
515 rc = cifs_posix_open(full_path, NULL, file->f_path.mnt, 492 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
516 cifs_sb->mnt_file_mode /* ignored */, 493 cifs_sb->mnt_file_mode /* ignored */,
517 oflags, &oplock, &netfid, xid); 494 oflags, &oplock, &netfid, xid);
518 if (rc == 0) { 495 if (rc == 0) {
519 cFYI(1, ("posix reopen succeeded")); 496 cFYI(1, "posix reopen succeeded");
520 goto reopen_success; 497 goto reopen_success;
521 } 498 }
522 /* fallthrough to retry open the old way on errors, especially 499 /* fallthrough to retry open the old way on errors, especially
@@ -537,8 +514,8 @@ reopen_error_exit:
537 CIFS_MOUNT_MAP_SPECIAL_CHR); 514 CIFS_MOUNT_MAP_SPECIAL_CHR);
538 if (rc) { 515 if (rc) {
539 mutex_unlock(&pCifsFile->fh_mutex); 516 mutex_unlock(&pCifsFile->fh_mutex);
540 cFYI(1, ("cifs_open returned 0x%x", rc)); 517 cFYI(1, "cifs_open returned 0x%x", rc);
541 cFYI(1, ("oplock: %d", oplock)); 518 cFYI(1, "oplock: %d", oplock);
542 } else { 519 } else {
543reopen_success: 520reopen_success:
544 pCifsFile->netfid = netfid; 521 pCifsFile->netfid = netfid;
@@ -570,8 +547,8 @@ reopen_success:
570 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 547 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
571 pCifsInode->clientCanCacheAll = true; 548 pCifsInode->clientCanCacheAll = true;
572 pCifsInode->clientCanCacheRead = true; 549 pCifsInode->clientCanCacheRead = true;
573 cFYI(1, ("Exclusive Oplock granted on inode %p", 550 cFYI(1, "Exclusive Oplock granted on inode %p",
574 file->f_path.dentry->d_inode)); 551 file->f_path.dentry->d_inode);
575 } else if ((oplock & 0xF) == OPLOCK_READ) { 552 } else if ((oplock & 0xF) == OPLOCK_READ) {
576 pCifsInode->clientCanCacheRead = true; 553 pCifsInode->clientCanCacheRead = true;
577 pCifsInode->clientCanCacheAll = false; 554 pCifsInode->clientCanCacheAll = false;
@@ -593,8 +570,7 @@ int cifs_close(struct inode *inode, struct file *file)
593 int xid, timeout; 570 int xid, timeout;
594 struct cifs_sb_info *cifs_sb; 571 struct cifs_sb_info *cifs_sb;
595 struct cifsTconInfo *pTcon; 572 struct cifsTconInfo *pTcon;
596 struct cifsFileInfo *pSMBFile = 573 struct cifsFileInfo *pSMBFile = file->private_data;
597 (struct cifsFileInfo *)file->private_data;
598 574
599 xid = GetXid(); 575 xid = GetXid();
600 576
@@ -619,8 +595,7 @@ int cifs_close(struct inode *inode, struct file *file)
619 the struct would be in each open file, 595 the struct would be in each open file,
620 but this should give enough time to 596 but this should give enough time to
621 clear the socket */ 597 clear the socket */
622 cFYI(DBG2, 598 cFYI(DBG2, "close delay, write pending");
623 ("close delay, write pending"));
624 msleep(timeout); 599 msleep(timeout);
625 timeout *= 4; 600 timeout *= 4;
626 } 601 }
@@ -653,7 +628,7 @@ int cifs_close(struct inode *inode, struct file *file)
653 628
654 read_lock(&GlobalSMBSeslock); 629 read_lock(&GlobalSMBSeslock);
655 if (list_empty(&(CIFS_I(inode)->openFileList))) { 630 if (list_empty(&(CIFS_I(inode)->openFileList))) {
656 cFYI(1, ("closing last open instance for inode %p", inode)); 631 cFYI(1, "closing last open instance for inode %p", inode);
657 /* if the file is not open we do not know if we can cache info 632 /* if the file is not open we do not know if we can cache info
658 on this inode, much less write behind and read ahead */ 633 on this inode, much less write behind and read ahead */
659 CIFS_I(inode)->clientCanCacheRead = false; 634 CIFS_I(inode)->clientCanCacheRead = false;
@@ -670,11 +645,10 @@ int cifs_closedir(struct inode *inode, struct file *file)
670{ 645{
671 int rc = 0; 646 int rc = 0;
672 int xid; 647 int xid;
673 struct cifsFileInfo *pCFileStruct = 648 struct cifsFileInfo *pCFileStruct = file->private_data;
674 (struct cifsFileInfo *)file->private_data;
675 char *ptmp; 649 char *ptmp;
676 650
677 cFYI(1, ("Closedir inode = 0x%p", inode)); 651 cFYI(1, "Closedir inode = 0x%p", inode);
678 652
679 xid = GetXid(); 653 xid = GetXid();
680 654
@@ -685,22 +659,22 @@ int cifs_closedir(struct inode *inode, struct file *file)
685 659
686 pTcon = cifs_sb->tcon; 660 pTcon = cifs_sb->tcon;
687 661
688 cFYI(1, ("Freeing private data in close dir")); 662 cFYI(1, "Freeing private data in close dir");
689 write_lock(&GlobalSMBSeslock); 663 write_lock(&GlobalSMBSeslock);
690 if (!pCFileStruct->srch_inf.endOfSearch && 664 if (!pCFileStruct->srch_inf.endOfSearch &&
691 !pCFileStruct->invalidHandle) { 665 !pCFileStruct->invalidHandle) {
692 pCFileStruct->invalidHandle = true; 666 pCFileStruct->invalidHandle = true;
693 write_unlock(&GlobalSMBSeslock); 667 write_unlock(&GlobalSMBSeslock);
694 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 668 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
695 cFYI(1, ("Closing uncompleted readdir with rc %d", 669 cFYI(1, "Closing uncompleted readdir with rc %d",
696 rc)); 670 rc);
697 /* not much we can do if it fails anyway, ignore rc */ 671 /* not much we can do if it fails anyway, ignore rc */
698 rc = 0; 672 rc = 0;
699 } else 673 } else
700 write_unlock(&GlobalSMBSeslock); 674 write_unlock(&GlobalSMBSeslock);
701 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 675 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
702 if (ptmp) { 676 if (ptmp) {
703 cFYI(1, ("closedir free smb buf in srch struct")); 677 cFYI(1, "closedir free smb buf in srch struct");
704 pCFileStruct->srch_inf.ntwrk_buf_start = NULL; 678 pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
705 if (pCFileStruct->srch_inf.smallBuf) 679 if (pCFileStruct->srch_inf.smallBuf)
706 cifs_small_buf_release(ptmp); 680 cifs_small_buf_release(ptmp);
@@ -748,49 +722,49 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
748 rc = -EACCES; 722 rc = -EACCES;
749 xid = GetXid(); 723 xid = GetXid();
750 724
751 cFYI(1, ("Lock parm: 0x%x flockflags: " 725 cFYI(1, "Lock parm: 0x%x flockflags: "
752 "0x%x flocktype: 0x%x start: %lld end: %lld", 726 "0x%x flocktype: 0x%x start: %lld end: %lld",
753 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, 727 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
754 pfLock->fl_end)); 728 pfLock->fl_end);
755 729
756 if (pfLock->fl_flags & FL_POSIX) 730 if (pfLock->fl_flags & FL_POSIX)
757 cFYI(1, ("Posix")); 731 cFYI(1, "Posix");
758 if (pfLock->fl_flags & FL_FLOCK) 732 if (pfLock->fl_flags & FL_FLOCK)
759 cFYI(1, ("Flock")); 733 cFYI(1, "Flock");
760 if (pfLock->fl_flags & FL_SLEEP) { 734 if (pfLock->fl_flags & FL_SLEEP) {
761 cFYI(1, ("Blocking lock")); 735 cFYI(1, "Blocking lock");
762 wait_flag = true; 736 wait_flag = true;
763 } 737 }
764 if (pfLock->fl_flags & FL_ACCESS) 738 if (pfLock->fl_flags & FL_ACCESS)
765 cFYI(1, ("Process suspended by mandatory locking - " 739 cFYI(1, "Process suspended by mandatory locking - "
766 "not implemented yet")); 740 "not implemented yet");
767 if (pfLock->fl_flags & FL_LEASE) 741 if (pfLock->fl_flags & FL_LEASE)
768 cFYI(1, ("Lease on file - not implemented yet")); 742 cFYI(1, "Lease on file - not implemented yet");
769 if (pfLock->fl_flags & 743 if (pfLock->fl_flags &
770 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) 744 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
771 cFYI(1, ("Unknown lock flags 0x%x", pfLock->fl_flags)); 745 cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
772 746
773 if (pfLock->fl_type == F_WRLCK) { 747 if (pfLock->fl_type == F_WRLCK) {
774 cFYI(1, ("F_WRLCK ")); 748 cFYI(1, "F_WRLCK ");
775 numLock = 1; 749 numLock = 1;
776 } else if (pfLock->fl_type == F_UNLCK) { 750 } else if (pfLock->fl_type == F_UNLCK) {
777 cFYI(1, ("F_UNLCK")); 751 cFYI(1, "F_UNLCK");
778 numUnlock = 1; 752 numUnlock = 1;
779 /* Check if unlock includes more than 753 /* Check if unlock includes more than
780 one lock range */ 754 one lock range */
781 } else if (pfLock->fl_type == F_RDLCK) { 755 } else if (pfLock->fl_type == F_RDLCK) {
782 cFYI(1, ("F_RDLCK")); 756 cFYI(1, "F_RDLCK");
783 lockType |= LOCKING_ANDX_SHARED_LOCK; 757 lockType |= LOCKING_ANDX_SHARED_LOCK;
784 numLock = 1; 758 numLock = 1;
785 } else if (pfLock->fl_type == F_EXLCK) { 759 } else if (pfLock->fl_type == F_EXLCK) {
786 cFYI(1, ("F_EXLCK")); 760 cFYI(1, "F_EXLCK");
787 numLock = 1; 761 numLock = 1;
788 } else if (pfLock->fl_type == F_SHLCK) { 762 } else if (pfLock->fl_type == F_SHLCK) {
789 cFYI(1, ("F_SHLCK")); 763 cFYI(1, "F_SHLCK");
790 lockType |= LOCKING_ANDX_SHARED_LOCK; 764 lockType |= LOCKING_ANDX_SHARED_LOCK;
791 numLock = 1; 765 numLock = 1;
792 } else 766 } else
793 cFYI(1, ("Unknown type of lock")); 767 cFYI(1, "Unknown type of lock");
794 768
795 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 769 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
796 tcon = cifs_sb->tcon; 770 tcon = cifs_sb->tcon;
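
The block above maps VFS lock types onto SMB LOCKING_ANDX semantics. A
condensed restatement of that classification (constants as in the CIFS
headers; F_EXLCK/F_SHLCK are the legacy flock-style types):

    static void cifs_classify_lock(int fl_type, int *numLock,
                                   int *numUnlock, __u8 *lockType)
    {
            switch (fl_type) {
            case F_WRLCK:
            case F_EXLCK:
                    *numLock = 1;                           /* exclusive */
                    break;
            case F_RDLCK:
            case F_SHLCK:
                    *lockType |= LOCKING_ANDX_SHARED_LOCK;  /* shared */
                    *numLock = 1;
                    break;
            case F_UNLCK:
                    *numUnlock = 1;  /* may cover several stored ranges */
                    break;
            }
    }
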
@@ -833,8 +807,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
833 0 /* wait flag */ ); 807 0 /* wait flag */ );
834 pfLock->fl_type = F_UNLCK; 808 pfLock->fl_type = F_UNLCK;
835 if (rc != 0) 809 if (rc != 0)
836 cERROR(1, ("Error unlocking previously locked " 810 cERROR(1, "Error unlocking previously locked "
837 "range %d during test of lock", rc)); 811 "range %d during test of lock", rc);
838 rc = 0; 812 rc = 0;
839 813
840 } else { 814 } else {
@@ -856,9 +830,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
856 0 /* wait flag */); 830 0 /* wait flag */);
857 pfLock->fl_type = F_RDLCK; 831 pfLock->fl_type = F_RDLCK;
858 if (rc != 0) 832 if (rc != 0)
859 cERROR(1, ("Error unlocking " 833 cERROR(1, "Error unlocking "
860 "previously locked range %d " 834 "previously locked range %d "
861 "during test of lock", rc)); 835 "during test of lock", rc);
862 rc = 0; 836 rc = 0;
863 } else { 837 } else {
864 pfLock->fl_type = F_WRLCK; 838 pfLock->fl_type = F_WRLCK;
@@ -892,8 +866,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
892 length, pfLock, 866 length, pfLock,
893 posix_lock_type, wait_flag); 867 posix_lock_type, wait_flag);
894 } else { 868 } else {
895 struct cifsFileInfo *fid = 869 struct cifsFileInfo *fid = file->private_data;
896 (struct cifsFileInfo *)file->private_data;
897 870
898 if (numLock) { 871 if (numLock) {
899 rc = CIFSSMBLock(xid, tcon, netfid, length, 872 rc = CIFSSMBLock(xid, tcon, netfid, length,
@@ -923,9 +896,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
923 1, 0, li->type, false); 896 1, 0, li->type, false);
924 if (stored_rc) 897 if (stored_rc)
925 rc = stored_rc; 898 rc = stored_rc;
926 899 else {
927 list_del(&li->llist); 900 list_del(&li->llist);
928 kfree(li); 901 kfree(li);
902 }
929 } 903 }
930 } 904 }
931 mutex_unlock(&fid->lock_mutex); 905 mutex_unlock(&fid->lock_mutex);
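
The last hunk above is a behavior fix, not just cleanup: a tracked
byte-range lock now leaves fid->llist only when the server-side unlock
succeeded, so a failure keeps the entry around. Roughly (CIFSSMBLock
arguments elided; cifsLockInfo is the list-node type):

    struct cifsLockInfo *li, *tmp;
    int rc = 0, stored_rc;

    list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
            stored_rc = CIFSSMBLock(/* ... unlock li's range ... */);
            if (stored_rc) {
                    rc = stored_rc;        /* keep li; still held */
            } else {
                    list_del(&li->llist);  /* server dropped it */
                    kfree(li);
            }
    }
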
@@ -988,13 +962,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
988 962
989 pTcon = cifs_sb->tcon; 963 pTcon = cifs_sb->tcon;
990 964
991 /* cFYI(1, 965 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
992 (" write %d bytes to offset %lld of %s", write_size, 966 *poffset, file->f_path.dentry->d_name.name); */
993 *poffset, file->f_path.dentry->d_name.name)); */
994 967
995 if (file->private_data == NULL) 968 if (file->private_data == NULL)
996 return -EBADF; 969 return -EBADF;
997 open_file = (struct cifsFileInfo *) file->private_data; 970 open_file = file->private_data;
998 971
999 rc = generic_write_checks(file, poffset, &write_size, 0); 972 rc = generic_write_checks(file, poffset, &write_size, 0);
1000 if (rc) 973 if (rc)
@@ -1091,12 +1064,12 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1091 1064
1092 pTcon = cifs_sb->tcon; 1065 pTcon = cifs_sb->tcon;
1093 1066
1094 cFYI(1, ("write %zd bytes to offset %lld of %s", write_size, 1067 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1095 *poffset, file->f_path.dentry->d_name.name)); 1068 *poffset, file->f_path.dentry->d_name.name);
1096 1069
1097 if (file->private_data == NULL) 1070 if (file->private_data == NULL)
1098 return -EBADF; 1071 return -EBADF;
1099 open_file = (struct cifsFileInfo *)file->private_data; 1072 open_file = file->private_data;
1100 1073
1101 xid = GetXid(); 1074 xid = GetXid();
1102 1075
@@ -1233,7 +1206,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1233 it being zero) during stress testcases so we need to check for it */ 1206 it being zero) during stress testcases so we need to check for it */
1234 1207
1235 if (cifs_inode == NULL) { 1208 if (cifs_inode == NULL) {
1236 cERROR(1, ("Null inode passed to cifs_writeable_file")); 1209 cERROR(1, "Null inode passed to cifs_writeable_file");
1237 dump_stack(); 1210 dump_stack();
1238 return NULL; 1211 return NULL;
1239 } 1212 }
@@ -1277,7 +1250,7 @@ refind_writable:
1277 again. Note that it would be bad 1250 again. Note that it would be bad
1278 to hold up writepages here (rather than 1251 to hold up writepages here (rather than
1279 in caller) with continuous retries */ 1252 in caller) with continuous retries */
1280 cFYI(1, ("wp failed on reopen file")); 1253 cFYI(1, "wp failed on reopen file");
1281 read_lock(&GlobalSMBSeslock); 1254 read_lock(&GlobalSMBSeslock);
1282 /* can not use this handle, no write 1255 /* can not use this handle, no write
1283 pending on this one after all */ 1256 pending on this one after all */
@@ -1353,7 +1326,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1353 else if (bytes_written < 0) 1326 else if (bytes_written < 0)
1354 rc = bytes_written; 1327 rc = bytes_written;
1355 } else { 1328 } else {
1356 cFYI(1, ("No writeable filehandles for inode")); 1329 cFYI(1, "No writeable filehandles for inode");
1357 rc = -EIO; 1330 rc = -EIO;
1358 } 1331 }
1359 1332
@@ -1525,7 +1498,7 @@ retry:
1525 */ 1498 */
1526 open_file = find_writable_file(CIFS_I(mapping->host)); 1499 open_file = find_writable_file(CIFS_I(mapping->host));
1527 if (!open_file) { 1500 if (!open_file) {
1528 cERROR(1, ("No writable handles for inode")); 1501 cERROR(1, "No writable handles for inode");
1529 rc = -EBADF; 1502 rc = -EBADF;
1530 } else { 1503 } else {
1531 long_op = cifs_write_timeout(cifsi, offset); 1504 long_op = cifs_write_timeout(cifsi, offset);
@@ -1538,8 +1511,8 @@ retry:
1538 cifs_update_eof(cifsi, offset, bytes_written); 1511 cifs_update_eof(cifsi, offset, bytes_written);
1539 1512
1540 if (rc || bytes_written < bytes_to_write) { 1513 if (rc || bytes_written < bytes_to_write) {
1541 cERROR(1, ("Write2 ret %d, wrote %d", 1514 cERROR(1, "Write2 ret %d, wrote %d",
1542 rc, bytes_written)); 1515 rc, bytes_written);
1543 /* BB what if continued retry is 1516 /* BB what if continued retry is
1544 requested via mount flags? */ 1517 requested via mount flags? */
1545 if (rc == -ENOSPC) 1518 if (rc == -ENOSPC)
@@ -1600,7 +1573,7 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
1600/* BB add check for wbc flags */ 1573/* BB add check for wbc flags */
1601 page_cache_get(page); 1574 page_cache_get(page);
1602 if (!PageUptodate(page)) 1575 if (!PageUptodate(page))
1603 cFYI(1, ("ppw - page not up to date")); 1576 cFYI(1, "ppw - page not up to date");
1604 1577
1605 /* 1578 /*
1606 * Set the "writeback" flag, and clear "dirty" in the radix tree. 1579 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1629,8 +1602,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1629 int rc; 1602 int rc;
1630 struct inode *inode = mapping->host; 1603 struct inode *inode = mapping->host;
1631 1604
1632 cFYI(1, ("write_end for page %p from pos %lld with %d bytes", 1605 cFYI(1, "write_end for page %p from pos %lld with %d bytes",
1633 page, pos, copied)); 1606 page, pos, copied);
1634 1607
1635 if (PageChecked(page)) { 1608 if (PageChecked(page)) {
1636 if (copied == len) 1609 if (copied == len)
@@ -1675,19 +1648,18 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1675 return rc; 1648 return rc;
1676} 1649}
1677 1650
1678int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1651int cifs_fsync(struct file *file, int datasync)
1679{ 1652{
1680 int xid; 1653 int xid;
1681 int rc = 0; 1654 int rc = 0;
1682 struct cifsTconInfo *tcon; 1655 struct cifsTconInfo *tcon;
1683 struct cifsFileInfo *smbfile = 1656 struct cifsFileInfo *smbfile = file->private_data;
1684 (struct cifsFileInfo *)file->private_data;
1685 struct inode *inode = file->f_path.dentry->d_inode; 1657 struct inode *inode = file->f_path.dentry->d_inode;
1686 1658
1687 xid = GetXid(); 1659 xid = GetXid();
1688 1660
1689 cFYI(1, ("Sync file - name: %s datasync: 0x%x", 1661 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1690 dentry->d_name.name, datasync)); 1662 file->f_path.dentry->d_name.name, datasync);
1691 1663
1692 rc = filemap_write_and_wait(inode->i_mapping); 1664 rc = filemap_write_and_wait(inode->i_mapping);
1693 if (rc == 0) { 1665 if (rc == 0) {
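
This hunk tracks the 2.6.36 VFS change that removed the dentry argument
from ->fsync(); the name is now reached through file->f_path.dentry. The
new skeleton, condensed from the lines above:

    int cifs_fsync(struct file *file, int datasync)
    {
            struct inode *inode = file->f_path.dentry->d_inode;
            int rc;

            cFYI(1, "Sync file - name: %s datasync: 0x%x",
                    file->f_path.dentry->d_name.name, datasync);

            rc = filemap_write_and_wait(inode->i_mapping);
            /* ... then flush the open handle on the server, as before ... */
            return rc;
    }
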
@@ -1711,7 +1683,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1711 unsigned int rpages = 0; 1683 unsigned int rpages = 0;
1712 int rc = 0; 1684 int rc = 0;
1713 1685
1714 cFYI(1, ("sync page %p",page)); 1686 cFYI(1, "sync page %p", page);
1715 mapping = page->mapping; 1687 mapping = page->mapping;
1716 if (!mapping) 1688 if (!mapping)
1717 return 0; 1689 return 0;
@@ -1722,7 +1694,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1722/* fill in rpages then 1694/* fill in rpages then
1723 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */ 1695 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1724 1696
1725/* cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index)); 1697/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1726 1698
1727#if 0 1699#if 0
1728 if (rc < 0) 1700 if (rc < 0)
@@ -1756,7 +1728,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
1756 CIFS_I(inode)->write_behind_rc = 0; 1728 CIFS_I(inode)->write_behind_rc = 0;
1757 } 1729 }
1758 1730
1759 cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc)); 1731 cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
1760 1732
1761 return rc; 1733 return rc;
1762} 1734}
@@ -1785,10 +1757,10 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1785 FreeXid(xid); 1757 FreeXid(xid);
1786 return rc; 1758 return rc;
1787 } 1759 }
1788 open_file = (struct cifsFileInfo *)file->private_data; 1760 open_file = file->private_data;
1789 1761
1790 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1762 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1791 cFYI(1, ("attempting read on write only file instance")); 1763 cFYI(1, "attempting read on write only file instance");
1792 1764
1793 for (total_read = 0, current_offset = read_data; 1765 for (total_read = 0, current_offset = read_data;
1794 read_size > total_read; 1766 read_size > total_read;
@@ -1866,10 +1838,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1866 FreeXid(xid); 1838 FreeXid(xid);
1867 return rc; 1839 return rc;
1868 } 1840 }
1869 open_file = (struct cifsFileInfo *)file->private_data; 1841 open_file = file->private_data;
1870 1842
1871 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1843 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1872 cFYI(1, ("attempting read on write only file instance")); 1844 cFYI(1, "attempting read on write only file instance");
1873 1845
1874 for (total_read = 0, current_offset = read_data; 1846 for (total_read = 0, current_offset = read_data;
1875 read_size > total_read; 1847 read_size > total_read;
@@ -1920,7 +1892,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1920 xid = GetXid(); 1892 xid = GetXid();
1921 rc = cifs_revalidate_file(file); 1893 rc = cifs_revalidate_file(file);
1922 if (rc) { 1894 if (rc) {
1923 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1895 cFYI(1, "Validation prior to mmap failed, error=%d", rc);
1924 FreeXid(xid); 1896 FreeXid(xid);
1925 return rc; 1897 return rc;
1926 } 1898 }
@@ -1931,8 +1903,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1931 1903
1932 1904
1933static void cifs_copy_cache_pages(struct address_space *mapping, 1905static void cifs_copy_cache_pages(struct address_space *mapping,
1934 struct list_head *pages, int bytes_read, char *data, 1906 struct list_head *pages, int bytes_read, char *data)
1935 struct pagevec *plru_pvec)
1936{ 1907{
1937 struct page *page; 1908 struct page *page;
1938 char *target; 1909 char *target;
@@ -1944,14 +1915,15 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1944 page = list_entry(pages->prev, struct page, lru); 1915 page = list_entry(pages->prev, struct page, lru);
1945 list_del(&page->lru); 1916 list_del(&page->lru);
1946 1917
1947 if (add_to_page_cache(page, mapping, page->index, 1918 if (add_to_page_cache_lru(page, mapping, page->index,
1948 GFP_KERNEL)) { 1919 GFP_KERNEL)) {
1949 page_cache_release(page); 1920 page_cache_release(page);
1950 cFYI(1, ("Add page cache failed")); 1921 cFYI(1, "Add page cache failed");
1951 data += PAGE_CACHE_SIZE; 1922 data += PAGE_CACHE_SIZE;
1952 bytes_read -= PAGE_CACHE_SIZE; 1923 bytes_read -= PAGE_CACHE_SIZE;
1953 continue; 1924 continue;
1954 } 1925 }
1926 page_cache_release(page);
1955 1927
1956 target = kmap_atomic(page, KM_USER0); 1928 target = kmap_atomic(page, KM_USER0);
1957 1929
@@ -1970,9 +1942,10 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1970 flush_dcache_page(page); 1942 flush_dcache_page(page);
1971 SetPageUptodate(page); 1943 SetPageUptodate(page);
1972 unlock_page(page); 1944 unlock_page(page);
1973 if (!pagevec_add(plru_pvec, page))
1974 __pagevec_lru_add_file(plru_pvec);
1975 data += PAGE_CACHE_SIZE; 1945 data += PAGE_CACHE_SIZE;
1946
1947 /* add page to FS-Cache */
1948 cifs_readpage_to_fscache(mapping->host, page);
1976 } 1949 }
1977 return; 1950 return;
1978} 1951}
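
Two things change in cifs_copy_cache_pages: add_to_page_cache_lru() now
does the LRU insertion that the pagevec used to batch (so the caller's
page reference is dropped right after a successful add), and every
populated page is mirrored into FS-Cache. The new per-page flow, in
outline:

    if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) {
            page_cache_release(page);  /* insert failed: skip this page
                                          (real code also advances the
                                          data pointer first) */
            continue;
    }
    page_cache_release(page);          /* page cache + LRU hold it now */

    /* ... copy the SMB payload in, SetPageUptodate(), unlock_page() ... */

    cifs_readpage_to_fscache(mapping->host, page);  /* mirror to cache */
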
@@ -1990,7 +1963,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1990 unsigned int read_size, i; 1963 unsigned int read_size, i;
1991 char *smb_read_data = NULL; 1964 char *smb_read_data = NULL;
1992 struct smb_com_read_rsp *pSMBr; 1965 struct smb_com_read_rsp *pSMBr;
1993 struct pagevec lru_pvec;
1994 struct cifsFileInfo *open_file; 1966 struct cifsFileInfo *open_file;
1995 int buf_type = CIFS_NO_BUFFER; 1967 int buf_type = CIFS_NO_BUFFER;
1996 1968
@@ -2000,12 +1972,20 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2000 FreeXid(xid); 1972 FreeXid(xid);
2001 return rc; 1973 return rc;
2002 } 1974 }
2003 open_file = (struct cifsFileInfo *)file->private_data; 1975 open_file = file->private_data;
2004 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1976 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2005 pTcon = cifs_sb->tcon; 1977 pTcon = cifs_sb->tcon;
2006 1978
2007 pagevec_init(&lru_pvec, 0); 1979 /*
2008 cFYI(DBG2, ("rpages: num pages %d", num_pages)); 1980 * Reads as many pages as possible from fscache. Returns -ENOBUFS
1981 * immediately if the cookie is negative
1982 */
1983 rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
1984 &num_pages);
1985 if (rc == 0)
1986 goto read_complete;
1987
1988 cFYI(DBG2, "rpages: num pages %d", num_pages);
2009 for (i = 0; i < num_pages; ) { 1989 for (i = 0; i < num_pages; ) {
2010 unsigned contig_pages; 1990 unsigned contig_pages;
2011 struct page *tmp_page; 1991 struct page *tmp_page;
@@ -2038,8 +2018,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2038 /* Read size needs to be in multiples of one page */ 2018 /* Read size needs to be in multiples of one page */
2039 read_size = min_t(const unsigned int, read_size, 2019 read_size = min_t(const unsigned int, read_size,
2040 cifs_sb->rsize & PAGE_CACHE_MASK); 2020 cifs_sb->rsize & PAGE_CACHE_MASK);
2041 cFYI(DBG2, ("rpages: read size 0x%x contiguous pages %d", 2021 cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
2042 read_size, contig_pages)); 2022 read_size, contig_pages);
2043 rc = -EAGAIN; 2023 rc = -EAGAIN;
2044 while (rc == -EAGAIN) { 2024 while (rc == -EAGAIN) {
2045 if ((open_file->invalidHandle) && 2025 if ((open_file->invalidHandle) &&
@@ -2066,14 +2046,14 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2066 } 2046 }
2067 } 2047 }
2068 if ((rc < 0) || (smb_read_data == NULL)) { 2048 if ((rc < 0) || (smb_read_data == NULL)) {
2069 cFYI(1, ("Read error in readpages: %d", rc)); 2049 cFYI(1, "Read error in readpages: %d", rc);
2070 break; 2050 break;
2071 } else if (bytes_read > 0) { 2051 } else if (bytes_read > 0) {
2072 task_io_account_read(bytes_read); 2052 task_io_account_read(bytes_read);
2073 pSMBr = (struct smb_com_read_rsp *)smb_read_data; 2053 pSMBr = (struct smb_com_read_rsp *)smb_read_data;
2074 cifs_copy_cache_pages(mapping, page_list, bytes_read, 2054 cifs_copy_cache_pages(mapping, page_list, bytes_read,
2075 smb_read_data + 4 /* RFC1001 hdr */ + 2055 smb_read_data + 4 /* RFC1001 hdr */ +
2076 le16_to_cpu(pSMBr->DataOffset), &lru_pvec); 2056 le16_to_cpu(pSMBr->DataOffset));
2077 2057
2078 i += bytes_read >> PAGE_CACHE_SHIFT; 2058 i += bytes_read >> PAGE_CACHE_SHIFT;
2079 cifs_stats_bytes_read(pTcon, bytes_read); 2059 cifs_stats_bytes_read(pTcon, bytes_read);
@@ -2089,9 +2069,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2089 /* break; */ 2069 /* break; */
2090 } 2070 }
2091 } else { 2071 } else {
2092 cFYI(1, ("No bytes read (%d) at offset %lld . " 2072 cFYI(1, "No bytes read (%d) at offset %lld . "
2093 "Cleaning remaining pages from readahead list", 2073 "Cleaning remaining pages from readahead list",
2094 bytes_read, offset)); 2074 bytes_read, offset);
2095 /* BB turn off caching and do new lookup on 2075 /* BB turn off caching and do new lookup on
2096 file size at server? */ 2076 file size at server? */
2097 break; 2077 break;
@@ -2106,8 +2086,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2106 bytes_read = 0; 2086 bytes_read = 0;
2107 } 2087 }
2108 2088
2109 pagevec_lru_add_file(&lru_pvec);
2110
2111/* need to free smb_read_data buf before exit */ 2089/* need to free smb_read_data buf before exit */
2112 if (smb_read_data) { 2090 if (smb_read_data) {
2113 if (buf_type == CIFS_SMALL_BUFFER) 2091 if (buf_type == CIFS_SMALL_BUFFER)
@@ -2117,6 +2095,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2117 smb_read_data = NULL; 2095 smb_read_data = NULL;
2118 } 2096 }
2119 2097
2098read_complete:
2120 FreeXid(xid); 2099 FreeXid(xid);
2121 return rc; 2100 return rc;
2122} 2101}
@@ -2127,6 +2106,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
2127 char *read_data; 2106 char *read_data;
2128 int rc; 2107 int rc;
2129 2108
2109 /* Is the page cached? */
2110 rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
2111 if (rc == 0)
2112 goto read_complete;
2113
2130 page_cache_get(page); 2114 page_cache_get(page);
2131 read_data = kmap(page); 2115 read_data = kmap(page);
2132 /* for reads over a certain size could initiate async read ahead */ 2116 /* for reads over a certain size could initiate async read ahead */
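
cifs_readpage_worker gets the single-page version of the same idea:
consult the cache before going to the wire, and (further down in this
function) push freshly read pages back into it. Condensed:

    rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
    if (rc == 0)
            goto read_complete;  /* cache completes and unlocks the page */

    /* ... otherwise read over SMB into the kmapped page ... */
    SetPageUptodate(page);
    cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
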
@@ -2136,7 +2120,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
2136 if (rc < 0) 2120 if (rc < 0)
2137 goto io_error; 2121 goto io_error;
2138 else 2122 else
2139 cFYI(1, ("Bytes read %d", rc)); 2123 cFYI(1, "Bytes read %d", rc);
2140 2124
2141 file->f_path.dentry->d_inode->i_atime = 2125 file->f_path.dentry->d_inode->i_atime =
2142 current_fs_time(file->f_path.dentry->d_inode->i_sb); 2126 current_fs_time(file->f_path.dentry->d_inode->i_sb);
@@ -2146,11 +2130,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
2146 2130
2147 flush_dcache_page(page); 2131 flush_dcache_page(page);
2148 SetPageUptodate(page); 2132 SetPageUptodate(page);
2133
2134 /* send this page to the cache */
2135 cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
2136
2149 rc = 0; 2137 rc = 0;
2150 2138
2151io_error: 2139io_error:
2152 kunmap(page); 2140 kunmap(page);
2153 page_cache_release(page); 2141 page_cache_release(page);
2142
2143read_complete:
2154 return rc; 2144 return rc;
2155} 2145}
2156 2146
@@ -2168,8 +2158,8 @@ static int cifs_readpage(struct file *file, struct page *page)
2168 return rc; 2158 return rc;
2169 } 2159 }
2170 2160
2171 cFYI(1, ("readpage %p at offset %d 0x%x\n", 2161 cFYI(1, "readpage %p at offset %d 0x%x\n",
2172 page, (int)offset, (int)offset)); 2162 page, (int)offset, (int)offset);
2173 2163
2174 rc = cifs_readpage_worker(file, page, &offset); 2164 rc = cifs_readpage_worker(file, page, &offset);
2175 2165
@@ -2239,7 +2229,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
2239 struct page *page; 2229 struct page *page;
2240 int rc = 0; 2230 int rc = 0;
2241 2231
2242 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); 2232 cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
2243 2233
2244 page = grab_cache_page_write_begin(mapping, index, flags); 2234 page = grab_cache_page_write_begin(mapping, index, flags);
2245 if (!page) { 2235 if (!page) {
@@ -2300,8 +2290,23 @@ out:
2300 return rc; 2290 return rc;
2301} 2291}
2302 2292
2303static void 2293static int cifs_release_page(struct page *page, gfp_t gfp)
2304cifs_oplock_break(struct slow_work *work) 2294{
2295 if (PagePrivate(page))
2296 return 0;
2297
2298 return cifs_fscache_release_page(page, gfp);
2299}
2300
2301static void cifs_invalidate_page(struct page *page, unsigned long offset)
2302{
2303 struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
2304
2305 if (offset == 0)
2306 cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
2307}
2308
2309void cifs_oplock_break(struct work_struct *work)
2305{ 2310{
2306 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, 2311 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2307 oplock_break); 2312 oplock_break);
@@ -2311,12 +2316,10 @@ cifs_oplock_break(struct slow_work *work)
2311 int rc, waitrc = 0; 2316 int rc, waitrc = 0;
2312 2317
2313 if (inode && S_ISREG(inode->i_mode)) { 2318 if (inode && S_ISREG(inode->i_mode)) {
2314#ifdef CONFIG_CIFS_EXPERIMENTAL 2319 if (cinode->clientCanCacheRead)
2315 if (cinode->clientCanCacheAll == 0)
2316 break_lease(inode, O_RDONLY); 2320 break_lease(inode, O_RDONLY);
2317 else if (cinode->clientCanCacheRead == 0) 2321 else
2318 break_lease(inode, O_WRONLY); 2322 break_lease(inode, O_WRONLY);
2319#endif
2320 rc = filemap_fdatawrite(inode->i_mapping); 2323 rc = filemap_fdatawrite(inode->i_mapping);
2321 if (cinode->clientCanCacheRead == 0) { 2324 if (cinode->clientCanCacheRead == 0) {
2322 waitrc = filemap_fdatawait(inode->i_mapping); 2325 waitrc = filemap_fdatawait(inode->i_mapping);
@@ -2326,7 +2329,7 @@ cifs_oplock_break(struct slow_work *work)
2326 rc = waitrc; 2329 rc = waitrc;
2327 if (rc) 2330 if (rc)
2328 cinode->write_behind_rc = rc; 2331 cinode->write_behind_rc = rc;
2329 cFYI(1, ("Oplock flush inode %p rc %d", inode, rc)); 2332 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
2330 } 2333 }
2331 2334
2332 /* 2335 /*
@@ -2338,35 +2341,32 @@ cifs_oplock_break(struct slow_work *work)
2338 if (!cfile->closePend && !cfile->oplock_break_cancelled) { 2341 if (!cfile->closePend && !cfile->oplock_break_cancelled) {
2339 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0, 2342 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
2340 LOCKING_ANDX_OPLOCK_RELEASE, false); 2343 LOCKING_ANDX_OPLOCK_RELEASE, false);
2341 cFYI(1, ("Oplock release rc = %d", rc)); 2344 cFYI(1, "Oplock release rc = %d", rc);
2342 } 2345 }
2346
2347 /*
2348 * We might have kicked in before is_valid_oplock_break()
2349 * finished grabbing reference for us. Make sure it's done by
 2350 * waiting for GlobalSMBSeslock.
2351 */
2352 write_lock(&GlobalSMBSeslock);
2353 write_unlock(&GlobalSMBSeslock);
2354
2355 cifs_oplock_break_put(cfile);
2343} 2356}
2344 2357
2345static int 2358void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2346cifs_oplock_break_get(struct slow_work *work)
2347{ 2359{
2348 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2349 oplock_break);
2350 mntget(cfile->mnt); 2360 mntget(cfile->mnt);
2351 cifsFileInfo_get(cfile); 2361 cifsFileInfo_get(cfile);
2352 return 0;
2353} 2362}
2354 2363
2355static void 2364void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2356cifs_oplock_break_put(struct slow_work *work)
2357{ 2365{
2358 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2359 oplock_break);
2360 mntput(cfile->mnt); 2366 mntput(cfile->mnt);
2361 cifsFileInfo_put(cfile); 2367 cifsFileInfo_put(cfile);
2362} 2368}
2363 2369
2364const struct slow_work_ops cifs_oplock_break_ops = {
2365 .get_ref = cifs_oplock_break_get,
2366 .put_ref = cifs_oplock_break_put,
2367 .execute = cifs_oplock_break,
2368};
2369
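
With the slow_work ops table gone, oplock breaks are plain work items:
cifs_oplock_break() now takes a struct work_struct *, and the get/put
reference helpers are called explicitly around queueing rather than via
slow_work_ops. A hedged sketch of the queueing side (which workqueue is
used, and where INIT_WORK runs, are assumptions here):

    static void cifs_queue_oplock_break(struct cifsFileInfo *cfile)
    {
            /* assumes INIT_WORK(&cfile->oplock_break, cifs_oplock_break)
             * ran once when cfile was created */
            cifs_oplock_break_get(cfile);   /* pin mount + file handle */
            if (!queue_work(system_wq, &cfile->oplock_break))
                    cifs_oplock_break_put(cfile);  /* already queued */
    }
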
2370const struct address_space_operations cifs_addr_ops = { 2370const struct address_space_operations cifs_addr_ops = {
2371 .readpage = cifs_readpage, 2371 .readpage = cifs_readpage,
2372 .readpages = cifs_readpages, 2372 .readpages = cifs_readpages,
@@ -2375,6 +2375,8 @@ const struct address_space_operations cifs_addr_ops = {
2375 .write_begin = cifs_write_begin, 2375 .write_begin = cifs_write_begin,
2376 .write_end = cifs_write_end, 2376 .write_end = cifs_write_end,
2377 .set_page_dirty = __set_page_dirty_nobuffers, 2377 .set_page_dirty = __set_page_dirty_nobuffers,
2378 .releasepage = cifs_release_page,
2379 .invalidatepage = cifs_invalidate_page,
2378 /* .sync_page = cifs_sync_page, */ 2380 /* .sync_page = cifs_sync_page, */
2379 /* .direct_IO = */ 2381 /* .direct_IO = */
2380}; 2382};
@@ -2391,6 +2393,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2391 .write_begin = cifs_write_begin, 2393 .write_begin = cifs_write_begin,
2392 .write_end = cifs_write_end, 2394 .write_end = cifs_write_end,
2393 .set_page_dirty = __set_page_dirty_nobuffers, 2395 .set_page_dirty = __set_page_dirty_nobuffers,
2396 .releasepage = cifs_release_page,
2397 .invalidatepage = cifs_invalidate_page,
2394 /* .sync_page = cifs_sync_page, */ 2398 /* .sync_page = cifs_sync_page, */
2395 /* .direct_IO = */ 2399 /* .direct_IO = */
2396}; 2400};
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
new file mode 100644
index 000000000000..9f3f5c4be161
--- /dev/null
+++ b/fs/cifs/fscache.c
@@ -0,0 +1,236 @@
1/*
2 * fs/cifs/fscache.c - CIFS filesystem cache interface
3 *
4 * Copyright (c) 2010 Novell, Inc.
  5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#include "fscache.h"
22#include "cifsglob.h"
23#include "cifs_debug.h"
24#include "cifs_fs_sb.h"
25
26void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
27{
28 server->fscache =
29 fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
30 &cifs_fscache_server_index_def, server);
31 cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server,
32 server->fscache);
33}
34
35void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
36{
37 cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server,
38 server->fscache);
39 fscache_relinquish_cookie(server->fscache, 0);
40 server->fscache = NULL;
41}
42
43void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
44{
45 struct TCP_Server_Info *server = tcon->ses->server;
46
47 tcon->fscache =
48 fscache_acquire_cookie(server->fscache,
49 &cifs_fscache_super_index_def, tcon);
50 cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)",
51 server->fscache, tcon->fscache);
52}
53
54void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
55{
56 cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
57 fscache_relinquish_cookie(tcon->fscache, 0);
58 tcon->fscache = NULL;
59}
60
61static void cifs_fscache_enable_inode_cookie(struct inode *inode)
62{
63 struct cifsInodeInfo *cifsi = CIFS_I(inode);
64 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
65
66 if (cifsi->fscache)
67 return;
68
69 cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
70 &cifs_fscache_inode_object_def,
71 cifsi);
72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
73 cifs_sb->tcon->fscache, cifsi->fscache);
74}
75
76void cifs_fscache_release_inode_cookie(struct inode *inode)
77{
78 struct cifsInodeInfo *cifsi = CIFS_I(inode);
79
80 if (cifsi->fscache) {
81 cFYI(1, "CIFS releasing inode cookie (0x%p)",
82 cifsi->fscache);
83 fscache_relinquish_cookie(cifsi->fscache, 0);
84 cifsi->fscache = NULL;
85 }
86}
87
88static void cifs_fscache_disable_inode_cookie(struct inode *inode)
89{
90 struct cifsInodeInfo *cifsi = CIFS_I(inode);
91
92 if (cifsi->fscache) {
93 cFYI(1, "CIFS disabling inode cookie (0x%p)",
94 cifsi->fscache);
95 fscache_relinquish_cookie(cifsi->fscache, 1);
96 cifsi->fscache = NULL;
97 }
98}
99
100void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
101{
102 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
103 cifs_fscache_disable_inode_cookie(inode);
104 else {
105 cifs_fscache_enable_inode_cookie(inode);
106 cFYI(1, "CIFS: fscache inode cookie set");
107 }
108}
109
110void cifs_fscache_reset_inode_cookie(struct inode *inode)
111{
112 struct cifsInodeInfo *cifsi = CIFS_I(inode);
113 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
114 struct fscache_cookie *old = cifsi->fscache;
115
116 if (cifsi->fscache) {
117 /* retire the current fscache cache and get a new one */
118 fscache_relinquish_cookie(cifsi->fscache, 1);
119
120 cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
121 &cifs_fscache_inode_object_def,
122 cifsi);
123 cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
124 cifsi->fscache, old);
125 }
126}
127
128int cifs_fscache_release_page(struct page *page, gfp_t gfp)
129{
130 if (PageFsCache(page)) {
131 struct inode *inode = page->mapping->host;
132 struct cifsInodeInfo *cifsi = CIFS_I(inode);
133
134 cFYI(1, "CIFS: fscache release page (0x%p/0x%p)",
135 page, cifsi->fscache);
136 if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
137 return 0;
138 }
139
140 return 1;
141}
142
143static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
144 int error)
145{
146	cFYI(1, "CIFS: readpage_from_fscache_complete (0x%p/%d)",
147 page, error);
148 if (!error)
149 SetPageUptodate(page);
150 unlock_page(page);
151}
152
153/*
154 * Retrieve a page from FS-Cache
155 */
156int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
157{
158 int ret;
159
160	cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p)",
161 CIFS_I(inode)->fscache, page, inode);
162 ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
163 cifs_readpage_from_fscache_complete,
164 NULL,
165 GFP_KERNEL);
166 switch (ret) {
167
168 case 0: /* page found in fscache, read submitted */
169 cFYI(1, "CIFS: readpage_from_fscache: submitted");
170 return ret;
171 case -ENOBUFS: /* page won't be cached */
172 case -ENODATA: /* page not in cache */
173 cFYI(1, "CIFS: readpage_from_fscache %d", ret);
174 return 1;
175
176 default:
177 cERROR(1, "unknown error ret = %d", ret);
178 }
179 return ret;
180}
181
182/*
183 * Retrieve a set of pages from FS-Cache
184 */
185int __cifs_readpages_from_fscache(struct inode *inode,
186 struct address_space *mapping,
187 struct list_head *pages,
188 unsigned *nr_pages)
189{
190 int ret;
191
192 cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)",
193 CIFS_I(inode)->fscache, *nr_pages, inode);
194 ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
195 pages, nr_pages,
196 cifs_readpage_from_fscache_complete,
197 NULL,
198 mapping_gfp_mask(mapping));
199 switch (ret) {
200 case 0: /* read submitted to the cache for all pages */
201 cFYI(1, "CIFS: readpages_from_fscache: submitted");
202 return ret;
203
204 case -ENOBUFS: /* some pages are not cached and can't be */
205 case -ENODATA: /* some pages are not cached */
206 cFYI(1, "CIFS: readpages_from_fscache: no page");
207 return 1;
208
209 default:
210 cFYI(1, "unknown error ret = %d", ret);
211 }
212
213 return ret;
214}
215
216void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
217{
218 int ret;
219
220	cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p)",
221 CIFS_I(inode)->fscache, page, inode);
222 ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
223 if (ret != 0)
224 fscache_uncache_page(CIFS_I(inode)->fscache, page);
225}
226
227void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
228{
229 struct cifsInodeInfo *cifsi = CIFS_I(inode);
230 struct fscache_cookie *cookie = cifsi->fscache;
231
232 cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie);
233 fscache_wait_on_page_write(cookie, page);
234 fscache_uncache_page(cookie, page);
235}
236
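
The new fscache.c builds a three-level cookie chain: a per-server index
cookie, a per-share (tcon) index cookie under it, and a per-inode data
cookie at the bottom, enabled only for read-only opens. An illustrative
grouping of the setup calls (not one real code path):

    static void cifs_fscache_attach(struct TCP_Server_Info *server,
                                    struct cifsTconInfo *tcon,
                                    struct inode *inode, struct file *filp)
    {
            cifs_fscache_get_client_cookie(server);     /* server index */
            cifs_fscache_get_super_cookie(tcon);        /* share index  */
            cifs_fscache_set_inode_cookie(inode, filp); /* data cookie,
                                                           O_RDONLY only */
    }
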
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
new file mode 100644
index 000000000000..31b88ec2341e
--- /dev/null
+++ b/fs/cifs/fscache.h
@@ -0,0 +1,136 @@
1/*
2 * fs/cifs/fscache.h - CIFS filesystem cache interface definitions
3 *
4 * Copyright (c) 2010 Novell, Inc.
  5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#ifndef _CIFS_FSCACHE_H
22#define _CIFS_FSCACHE_H
23
24#include <linux/fscache.h>
25
26#include "cifsglob.h"
27
28#ifdef CONFIG_CIFS_FSCACHE
29
30extern struct fscache_netfs cifs_fscache_netfs;
31extern const struct fscache_cookie_def cifs_fscache_server_index_def;
32extern const struct fscache_cookie_def cifs_fscache_super_index_def;
33extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
34
35extern int cifs_fscache_register(void);
36extern void cifs_fscache_unregister(void);
37
38/*
39 * fscache.c
40 */
41extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
42extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
43extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
44extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
45
46extern void cifs_fscache_release_inode_cookie(struct inode *);
47extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
48extern void cifs_fscache_reset_inode_cookie(struct inode *);
49
50extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
51extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
52extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
53extern int __cifs_readpages_from_fscache(struct inode *,
54 struct address_space *,
55 struct list_head *,
56 unsigned *);
57
58extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
59
60static inline void cifs_fscache_invalidate_page(struct page *page,
61 struct inode *inode)
62{
63 if (PageFsCache(page))
64 __cifs_fscache_invalidate_page(page, inode);
65}
66
67static inline int cifs_readpage_from_fscache(struct inode *inode,
68 struct page *page)
69{
70 if (CIFS_I(inode)->fscache)
71 return __cifs_readpage_from_fscache(inode, page);
72
73 return -ENOBUFS;
74}
75
76static inline int cifs_readpages_from_fscache(struct inode *inode,
77 struct address_space *mapping,
78 struct list_head *pages,
79 unsigned *nr_pages)
80{
81 if (CIFS_I(inode)->fscache)
82 return __cifs_readpages_from_fscache(inode, mapping, pages,
83 nr_pages);
84 return -ENOBUFS;
85}
86
87static inline void cifs_readpage_to_fscache(struct inode *inode,
88 struct page *page)
89{
90 if (PageFsCache(page))
91 __cifs_readpage_to_fscache(inode, page);
92}
93
94#else /* CONFIG_CIFS_FSCACHE */
95static inline int cifs_fscache_register(void) { return 0; }
96static inline void cifs_fscache_unregister(void) {}
97
98static inline void
99cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
100static inline void
101cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
102static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
103static inline void
104cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
105
106static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
107static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
108 struct file *filp) {}
109static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
110static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
111{
112 return 1; /* May release page */
113}
114
115static inline void cifs_fscache_invalidate_page(struct page *page,
116 struct inode *inode) {}
117static inline int
118cifs_readpage_from_fscache(struct inode *inode, struct page *page)
119{
120 return -ENOBUFS;
121}
122
123static inline int cifs_readpages_from_fscache(struct inode *inode,
124 struct address_space *mapping,
125 struct list_head *pages,
126 unsigned *nr_pages)
127{
128 return -ENOBUFS;
129}
130
131static inline void cifs_readpage_to_fscache(struct inode *inode,
132 struct page *page) {}
133
134#endif /* CONFIG_CIFS_FSCACHE */
135
136#endif /* _CIFS_FSCACHE_H */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 29b9ea244c81..53cce8cc2224 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/inode.c 2 * fs/cifs/inode.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2010
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -29,6 +29,7 @@
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31#include "cifs_fs_sb.h" 31#include "cifs_fs_sb.h"
32#include "fscache.h"
32 33
33 34
34static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) 35static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
@@ -86,30 +87,30 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
86{ 87{
87 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 88 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
88 89
89 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid)); 90 cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
90 91
91 if (inode->i_state & I_NEW) { 92 if (inode->i_state & I_NEW) {
92 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid)); 93 cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
93 return; 94 return;
94 } 95 }
95 96
96 /* don't bother with revalidation if we have an oplock */ 97 /* don't bother with revalidation if we have an oplock */
97 if (cifs_i->clientCanCacheRead) { 98 if (cifs_i->clientCanCacheRead) {
98 cFYI(1, ("%s: inode %llu is oplocked", __func__, 99 cFYI(1, "%s: inode %llu is oplocked", __func__,
99 cifs_i->uniqueid)); 100 cifs_i->uniqueid);
100 return; 101 return;
101 } 102 }
102 103
103 /* revalidate if mtime or size have changed */ 104 /* revalidate if mtime or size have changed */
104 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && 105 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
105 cifs_i->server_eof == fattr->cf_eof) { 106 cifs_i->server_eof == fattr->cf_eof) {
106 cFYI(1, ("%s: inode %llu is unchanged", __func__, 107 cFYI(1, "%s: inode %llu is unchanged", __func__,
107 cifs_i->uniqueid)); 108 cifs_i->uniqueid);
108 return; 109 return;
109 } 110 }
110 111
111 cFYI(1, ("%s: invalidating inode %llu mapping", __func__, 112 cFYI(1, "%s: invalidating inode %llu mapping", __func__,
112 cifs_i->uniqueid)); 113 cifs_i->uniqueid);
113 cifs_i->invalid_mapping = true; 114 cifs_i->invalid_mapping = true;
114} 115}
115 116
@@ -137,15 +138,14 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
137 inode->i_mode = fattr->cf_mode; 138 inode->i_mode = fattr->cf_mode;
138 139
139 cifs_i->cifsAttrs = fattr->cf_cifsattrs; 140 cifs_i->cifsAttrs = fattr->cf_cifsattrs;
140 cifs_i->uniqueid = fattr->cf_uniqueid;
141 141
142 if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) 142 if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
143 cifs_i->time = 0; 143 cifs_i->time = 0;
144 else 144 else
145 cifs_i->time = jiffies; 145 cifs_i->time = jiffies;
146 146
147 cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode, 147 cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
148 oldtime, cifs_i->time)); 148 oldtime, cifs_i->time);
149 149
150 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 150 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
151 151
@@ -170,6 +170,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
170 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); 170 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
171} 171}
172 172
173void
174cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
175{
176 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
177
178 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
179 return;
180
181 fattr->cf_uniqueid = iunique(sb, ROOT_I);
182}
183
173/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */ 184/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
174void 185void
175cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, 186cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
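
cifs_fill_uniqueid gives callers one place to mint a local inode number
when the mount does not trust server inode numbers (it is a no-op when
CIFS_MOUNT_SERVER_INUM is set). Usage sketch matching the
cifs_get_inode_info_unix change below:

    cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
    if (*pinode == NULL) {
            cifs_fill_uniqueid(sb, &fattr);  /* iunique() if needed */
            *pinode = cifs_iget(sb, &fattr);
    }
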
@@ -227,7 +238,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
227 /* safest to call it a file if we do not know */ 238 /* safest to call it a file if we do not know */
228 fattr->cf_mode |= S_IFREG; 239 fattr->cf_mode |= S_IFREG;
229 fattr->cf_dtype = DT_REG; 240 fattr->cf_dtype = DT_REG;
230 cFYI(1, ("unknown type %d", le32_to_cpu(info->Type))); 241 cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
231 break; 242 break;
232 } 243 }
233 244
@@ -256,7 +267,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
256{ 267{
257 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 268 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
258 269
259 cFYI(1, ("creating fake fattr for DFS referral")); 270 cFYI(1, "creating fake fattr for DFS referral");
260 271
261 memset(fattr, 0, sizeof(*fattr)); 272 memset(fattr, 0, sizeof(*fattr));
262 fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; 273 fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -278,7 +289,7 @@ int cifs_get_file_info_unix(struct file *filp)
278 struct inode *inode = filp->f_path.dentry->d_inode; 289 struct inode *inode = filp->f_path.dentry->d_inode;
279 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
280 struct cifsTconInfo *tcon = cifs_sb->tcon; 291 struct cifsTconInfo *tcon = cifs_sb->tcon;
281 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; 292 struct cifsFileInfo *cfile = filp->private_data;
282 293
283 xid = GetXid(); 294 xid = GetXid();
284 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); 295 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -305,7 +316,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
305 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 316 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
306 317
307 tcon = cifs_sb->tcon; 318 tcon = cifs_sb->tcon;
308 cFYI(1, ("Getting info on %s", full_path)); 319 cFYI(1, "Getting info on %s", full_path);
309 320
310 /* could have done a find first instead but this returns more info */ 321 /* could have done a find first instead but this returns more info */
311 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, 322 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
@@ -323,6 +334,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
323 334
324 if (*pinode == NULL) { 335 if (*pinode == NULL) {
325 /* get new inode */ 336 /* get new inode */
337 cifs_fill_uniqueid(sb, &fattr);
326 *pinode = cifs_iget(sb, &fattr); 338 *pinode = cifs_iget(sb, &fattr);
327 if (!*pinode) 339 if (!*pinode)
328 rc = -ENOMEM; 340 rc = -ENOMEM;
@@ -373,7 +385,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
373 &bytes_read, &pbuf, &buf_type); 385 &bytes_read, &pbuf, &buf_type);
374 if ((rc == 0) && (bytes_read >= 8)) { 386 if ((rc == 0) && (bytes_read >= 8)) {
375 if (memcmp("IntxBLK", pbuf, 8) == 0) { 387 if (memcmp("IntxBLK", pbuf, 8) == 0) {
376 cFYI(1, ("Block device")); 388 cFYI(1, "Block device");
377 fattr->cf_mode |= S_IFBLK; 389 fattr->cf_mode |= S_IFBLK;
378 fattr->cf_dtype = DT_BLK; 390 fattr->cf_dtype = DT_BLK;
379 if (bytes_read == 24) { 391 if (bytes_read == 24) {
@@ -385,7 +397,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
385 fattr->cf_rdev = MKDEV(mjr, mnr); 397 fattr->cf_rdev = MKDEV(mjr, mnr);
386 } 398 }
387 } else if (memcmp("IntxCHR", pbuf, 8) == 0) { 399 } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
388 cFYI(1, ("Char device")); 400 cFYI(1, "Char device");
389 fattr->cf_mode |= S_IFCHR; 401 fattr->cf_mode |= S_IFCHR;
390 fattr->cf_dtype = DT_CHR; 402 fattr->cf_dtype = DT_CHR;
391 if (bytes_read == 24) { 403 if (bytes_read == 24) {
@@ -397,7 +409,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
397 fattr->cf_rdev = MKDEV(mjr, mnr); 409 fattr->cf_rdev = MKDEV(mjr, mnr);
398 } 410 }
399 } else if (memcmp("IntxLNK", pbuf, 7) == 0) { 411 } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
400 cFYI(1, ("Symlink")); 412 cFYI(1, "Symlink");
401 fattr->cf_mode |= S_IFLNK; 413 fattr->cf_mode |= S_IFLNK;
402 fattr->cf_dtype = DT_LNK; 414 fattr->cf_dtype = DT_LNK;
403 } else { 415 } else {
@@ -439,10 +451,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
439 else if (rc > 3) { 451 else if (rc > 3) {
440 mode = le32_to_cpu(*((__le32 *)ea_value)); 452 mode = le32_to_cpu(*((__le32 *)ea_value));
441 fattr->cf_mode &= ~SFBITS_MASK; 453 fattr->cf_mode &= ~SFBITS_MASK;
442 cFYI(1, ("special bits 0%o org mode 0%o", mode, 454 cFYI(1, "special bits 0%o org mode 0%o", mode,
443 fattr->cf_mode)); 455 fattr->cf_mode);
444 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; 456 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
445 cFYI(1, ("special mode bits 0%o", mode)); 457 cFYI(1, "special mode bits 0%o", mode);
446 } 458 }
447 459
448 return 0; 460 return 0;
@@ -504,7 +516,7 @@ int cifs_get_file_info(struct file *filp)
504 struct inode *inode = filp->f_path.dentry->d_inode; 516 struct inode *inode = filp->f_path.dentry->d_inode;
505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 517 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
506 struct cifsTconInfo *tcon = cifs_sb->tcon; 518 struct cifsTconInfo *tcon = cifs_sb->tcon;
507 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; 519 struct cifsFileInfo *cfile = filp->private_data;
508 520
509 xid = GetXid(); 521 xid = GetXid();
510 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); 522 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -548,11 +560,11 @@ int cifs_get_inode_info(struct inode **pinode,
548 struct cifs_fattr fattr; 560 struct cifs_fattr fattr;
549 561
550 pTcon = cifs_sb->tcon; 562 pTcon = cifs_sb->tcon;
551 cFYI(1, ("Getting info on %s", full_path)); 563 cFYI(1, "Getting info on %s", full_path);
552 564
553 if ((pfindData == NULL) && (*pinode != NULL)) { 565 if ((pfindData == NULL) && (*pinode != NULL)) {
554 if (CIFS_I(*pinode)->clientCanCacheRead) { 566 if (CIFS_I(*pinode)->clientCanCacheRead) {
555 cFYI(1, ("No need to revalidate cached inode sizes")); 567 cFYI(1, "No need to revalidate cached inode sizes");
556 return rc; 568 return rc;
557 } 569 }
558 } 570 }
@@ -618,7 +630,7 @@ int cifs_get_inode_info(struct inode **pinode,
618 cifs_sb->mnt_cifs_flags & 630 cifs_sb->mnt_cifs_flags &
619 CIFS_MOUNT_MAP_SPECIAL_CHR); 631 CIFS_MOUNT_MAP_SPECIAL_CHR);
620 if (rc1 || !fattr.cf_uniqueid) { 632 if (rc1 || !fattr.cf_uniqueid) {
621 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 633 cFYI(1, "GetSrvInodeNum rc %d", rc1);
622 fattr.cf_uniqueid = iunique(sb, ROOT_I); 634 fattr.cf_uniqueid = iunique(sb, ROOT_I);
623 cifs_autodisable_serverino(cifs_sb); 635 cifs_autodisable_serverino(cifs_sb);
624 } 636 }
@@ -634,13 +646,13 @@ int cifs_get_inode_info(struct inode **pinode,
634 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { 646 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
635 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); 647 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
636 if (tmprc) 648 if (tmprc)
637 cFYI(1, ("cifs_sfu_type failed: %d", tmprc)); 649 cFYI(1, "cifs_sfu_type failed: %d", tmprc);
638 } 650 }
639 651
640#ifdef CONFIG_CIFS_EXPERIMENTAL 652#ifdef CONFIG_CIFS_EXPERIMENTAL
641 /* fill in 0777 bits from ACL */ 653 /* fill in 0777 bits from ACL */
642 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 654 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
643 cFYI(1, ("Getting mode bits from ACL")); 655 cFYI(1, "Getting mode bits from ACL");
644 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); 656 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
645 } 657 }
646#endif 658#endif
@@ -712,18 +724,17 @@ cifs_find_inode(struct inode *inode, void *opaque)
712{ 724{
713 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; 725 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
714 726
727 /* don't match inode with different uniqueid */
715 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) 728 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
716 return 0; 729 return 0;
717 730
718 /* 731 /* don't match inode of different type */
719 * uh oh -- it's a directory. We can't use it since hardlinked dirs are 732 if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
720 * verboten. Disable serverino and return it as if it were found, the 733 return 0;
721 * caller can discard it, generate a uniqueid and retry the find 734
722 */ 735 /* if it's not a directory or has no dentries, then flag it */
723 if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { 736 if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry))
724 fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; 737 fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
725 cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
726 }
727 738
728 return 1; 739 return 1;
729} 740}
@@ -737,6 +748,27 @@ cifs_init_inode(struct inode *inode, void *opaque)
737 return 0; 748 return 0;
738} 749}
739 750
751/*
752 * walk dentry list for an inode and report whether it has aliases that
753 * are hashed. We use this to determine if a directory inode can actually
754 * be used.
755 */
756static bool
757inode_has_hashed_dentries(struct inode *inode)
758{
759 struct dentry *dentry;
760
761 spin_lock(&dcache_lock);
762 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
763 if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
764 spin_unlock(&dcache_lock);
765 return true;
766 }
767 }
768 spin_unlock(&dcache_lock);
769 return false;
770}
771
740/* Given fattrs, get a corresponding inode */ 772/* Given fattrs, get a corresponding inode */
741struct inode * 773struct inode *
742cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) 774cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
@@ -745,19 +777,23 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
745 struct inode *inode; 777 struct inode *inode;
746 778
747retry_iget5_locked: 779retry_iget5_locked:
748 cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid)); 780 cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
749 781
750 /* hash down to 32-bits on 32-bit arch */ 782 /* hash down to 32-bits on 32-bit arch */
751 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); 783 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
752 784
753 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); 785 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
754 if (inode) { 786 if (inode) {
755 /* was there a problematic inode number collision? */ 787 /* was there a potentially problematic inode collision? */
756 if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { 788 if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
757 iput(inode);
758 fattr->cf_uniqueid = iunique(sb, ROOT_I);
759 fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; 789 fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
760 goto retry_iget5_locked; 790
791 if (inode_has_hashed_dentries(inode)) {
792 cifs_autodisable_serverino(CIFS_SB(sb));
793 iput(inode);
794 fattr->cf_uniqueid = iunique(sb, ROOT_I);
795 goto retry_iget5_locked;
796 }
761 } 797 }
762 798
763 cifs_fattr_to_inode(inode, fattr); 799 cifs_fattr_to_inode(inode, fattr);
@@ -765,6 +801,12 @@ retry_iget5_locked:
765 inode->i_flags |= S_NOATIME | S_NOCMTIME; 801 inode->i_flags |= S_NOATIME | S_NOCMTIME;
766 if (inode->i_state & I_NEW) { 802 if (inode->i_state & I_NEW) {
767 inode->i_ino = hash; 803 inode->i_ino = hash;
804 if (S_ISREG(inode->i_mode))
805 inode->i_data.backing_dev_info = sb->s_bdi;
806#ifdef CONFIG_CIFS_FSCACHE
807 /* initialize per-inode cache cookie pointer */
808 CIFS_I(inode)->fscache = NULL;
809#endif
768 unlock_new_inode(inode); 810 unlock_new_inode(inode);
769 } 811 }
770 } 812 }
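
In cifs_iget(), the collision flag is now only acted on when the colliding inode actually has hashed dentries: in that case serverino is autodisabled, the inode is dropped, a fresh uniqueid comes from iunique(), and the lookup restarts; otherwise the cached inode is simply reused. A toy model of just that retry shape (the one-shot flag and the constant stand in for iget5_locked() and iunique()):

#include <stdbool.h>
#include <stdio.h>

static bool collision_pending = true;   /* set once by the match callback */

/* models only the control flow: on a flagged collision, switch to
 * locally generated ids and redo the lookup from the top */
static void iget_retry_demo(unsigned long long *uniqueid)
{
        for (;;) {
                printf("looking for uniqueid=%llu\n", *uniqueid);
                if (collision_pending) {
                        collision_pending = false;  /* autodisable serverino */
                        *uniqueid = 12345;          /* iunique()-style id */
                        continue;                   /* retry_iget5_locked */
                }
                return;                             /* inode found/created */
        }
}

int main(void)
{
        unsigned long long id = 42;

        iget_retry_demo(&id);
        printf("settled on uniqueid=%llu\n", id);
        return 0;
}
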
@@ -794,10 +836,15 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
794 xid, NULL); 836 xid, NULL);
795 837
796 if (!inode) 838 if (!inode)
797 return ERR_PTR(-ENOMEM); 839 return ERR_PTR(rc);
840
841#ifdef CONFIG_CIFS_FSCACHE
842 /* populate tcon->resource_id */
843 cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid;
844#endif
798 845
799 if (rc && cifs_sb->tcon->ipc) { 846 if (rc && cifs_sb->tcon->ipc) {
800 cFYI(1, ("ipc connection - fake read inode")); 847 cFYI(1, "ipc connection - fake read inode");
801 inode->i_mode |= S_IFDIR; 848 inode->i_mode |= S_IFDIR;
802 inode->i_nlink = 2; 849 inode->i_nlink = 2;
803 inode->i_op = &cifs_ipc_inode_ops; 850 inode->i_op = &cifs_ipc_inode_ops;
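
cifs_root_iget() now propagates the caller's errno in the failure pointer instead of hardcoding -ENOMEM, and populates the fscache resource_id from the root inode's uniqueid. For reference, a userspace rendition of the ERR_PTR()/IS_ERR() encoding this relies on (rc must be a nonzero negative errno for the encoded pointer to carry information):

#include <errno.h>
#include <stdio.h>

/* userspace rendition of the kernel ERR_PTR()/IS_ERR() idiom */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
        return (unsigned long)p >= (unsigned long)-4095;
}

static void *get_root_inode(int fail)
{
        static int inode = 1;          /* placeholder "inode" */

        if (fail)
                return ERR_PTR(-ENOMEM);  /* encode errno in the pointer */
        return &inode;
}

int main(void)
{
        void *inode = get_root_inode(1);

        if (IS_ERR(inode))
                printf("iget failed: %ld\n", PTR_ERR(inode));
        return 0;
}
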
@@ -859,7 +906,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
859 * server times. 906 * server times.
860 */ 907 */
861 if (set_time && (attrs->ia_valid & ATTR_CTIME)) { 908 if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
862 cFYI(1, ("CIFS - CTIME changed")); 909 cFYI(1, "CIFS - CTIME changed");
863 info_buf.ChangeTime = 910 info_buf.ChangeTime =
864 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); 911 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
865 } else 912 } else
@@ -894,8 +941,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
894 goto out; 941 goto out;
895 } 942 }
896 943
897 cFYI(1, ("calling SetFileInfo since SetPathInfo for " 944 cFYI(1, "calling SetFileInfo since SetPathInfo for "
898 "times not supported by this server")); 945 "times not supported by this server");
899 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, 946 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
900 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, 947 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
901 CREATE_NOT_DIR, &netfid, &oplock, 948 CREATE_NOT_DIR, &netfid, &oplock,
@@ -1053,7 +1100,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1053 struct iattr *attrs = NULL; 1100 struct iattr *attrs = NULL;
1054 __u32 dosattr = 0, origattr = 0; 1101 __u32 dosattr = 0, origattr = 0;
1055 1102
1056 cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry)); 1103 cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
1057 1104
1058 xid = GetXid(); 1105 xid = GetXid();
1059 1106
@@ -1072,7 +1119,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1072 rc = CIFSPOSIXDelFile(xid, tcon, full_path, 1119 rc = CIFSPOSIXDelFile(xid, tcon, full_path,
1073 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, 1120 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
1074 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1121 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1075 cFYI(1, ("posix del rc %d", rc)); 1122 cFYI(1, "posix del rc %d", rc);
1076 if ((rc == 0) || (rc == -ENOENT)) 1123 if ((rc == 0) || (rc == -ENOENT))
1077 goto psx_del_no_retry; 1124 goto psx_del_no_retry;
1078 } 1125 }
@@ -1146,7 +1193,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1146 struct inode *newinode = NULL; 1193 struct inode *newinode = NULL;
1147 struct cifs_fattr fattr; 1194 struct cifs_fattr fattr;
1148 1195
1149 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); 1196 cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
1150 1197
1151 xid = GetXid(); 1198 xid = GetXid();
1152 1199
@@ -1181,7 +1228,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1181 kfree(pInfo); 1228 kfree(pInfo);
1182 goto mkdir_retry_old; 1229 goto mkdir_retry_old;
1183 } else if (rc) { 1230 } else if (rc) {
1184 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1231 cFYI(1, "posix mkdir returned 0x%x", rc);
1185 d_drop(direntry); 1232 d_drop(direntry);
1186 } else { 1233 } else {
1187 if (pInfo->Type == cpu_to_le32(-1)) { 1234 if (pInfo->Type == cpu_to_le32(-1)) {
@@ -1198,6 +1245,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1198 direntry->d_op = &cifs_dentry_ops; 1245 direntry->d_op = &cifs_dentry_ops;
1199 1246
1200 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); 1247 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1248 cifs_fill_uniqueid(inode->i_sb, &fattr);
1201 newinode = cifs_iget(inode->i_sb, &fattr); 1249 newinode = cifs_iget(inode->i_sb, &fattr);
1202 if (!newinode) { 1250 if (!newinode) {
1203 kfree(pInfo); 1251 kfree(pInfo);
@@ -1207,12 +1255,12 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1207 d_instantiate(direntry, newinode); 1255 d_instantiate(direntry, newinode);
1208 1256
1209#ifdef CONFIG_CIFS_DEBUG2 1257#ifdef CONFIG_CIFS_DEBUG2
1210 cFYI(1, ("instantiated dentry %p %s to inode %p", 1258 cFYI(1, "instantiated dentry %p %s to inode %p",
1211 direntry, direntry->d_name.name, newinode)); 1259 direntry, direntry->d_name.name, newinode);
1212 1260
1213 if (newinode->i_nlink != 2) 1261 if (newinode->i_nlink != 2)
1214 cFYI(1, ("unexpected number of links %d", 1262 cFYI(1, "unexpected number of links %d",
1215 newinode->i_nlink)); 1263 newinode->i_nlink);
1216#endif 1264#endif
1217 } 1265 }
1218 kfree(pInfo); 1266 kfree(pInfo);
@@ -1223,7 +1271,7 @@ mkdir_retry_old:
1223 rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls, 1271 rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
1224 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1272 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1225 if (rc) { 1273 if (rc) {
1226 cFYI(1, ("cifs_mkdir returned 0x%x", rc)); 1274 cFYI(1, "cifs_mkdir returned 0x%x", rc);
1227 d_drop(direntry); 1275 d_drop(direntry);
1228 } else { 1276 } else {
1229mkdir_get_info: 1277mkdir_get_info:
@@ -1326,7 +1374,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1326 char *full_path = NULL; 1374 char *full_path = NULL;
1327 struct cifsInodeInfo *cifsInode; 1375 struct cifsInodeInfo *cifsInode;
1328 1376
1329 cFYI(1, ("cifs_rmdir, inode = 0x%p", inode)); 1377 cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
1330 1378
1331 xid = GetXid(); 1379 xid = GetXid();
1332 1380
@@ -1389,6 +1437,10 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1389 if (rc == 0 || rc != -ETXTBSY) 1437 if (rc == 0 || rc != -ETXTBSY)
1390 return rc; 1438 return rc;
1391 1439
1440 /* open-file renames don't work across directories */
1441 if (to_dentry->d_parent != from_dentry->d_parent)
1442 return rc;
1443
1392 /* open the file to be renamed -- we need DELETE perms */ 1444 /* open the file to be renamed -- we need DELETE perms */
1393 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, 1445 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
1394 CREATE_NOT_DIR, &srcfid, &oplock, NULL, 1446 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
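
The new guard in cifs_do_rename() skips the open-file rename fallback when source and target live in different directories, handing back the earlier error (typically -ETXTBSY) untouched. The shape of that guard, with a hypothetical dentry type:

#include <errno.h>
#include <stdio.h>

struct demo_dentry { struct demo_dentry *parent; };

/* keep the earlier failure when the by-handle fallback cannot apply */
static int rename_fallback(struct demo_dentry *from, struct demo_dentry *to,
                           int rc)
{
        /* open-file renames don't work across directories */
        if (to->parent != from->parent)
                return rc;

        /* ... open the source with DELETE access and rename by fid ... */
        return 0;
}

int main(void)
{
        struct demo_dentry dir1 = { 0 }, dir2 = { 0 };
        struct demo_dentry f = { &dir1 }, t = { &dir2 };

        printf("rc = %d\n", rename_fallback(&f, &t, -ETXTBSY));
        return 0;
}
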
@@ -1412,29 +1464,18 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1412{ 1464{
1413 char *fromName = NULL; 1465 char *fromName = NULL;
1414 char *toName = NULL; 1466 char *toName = NULL;
1415 struct cifs_sb_info *cifs_sb_source; 1467 struct cifs_sb_info *cifs_sb;
1416 struct cifs_sb_info *cifs_sb_target;
1417 struct cifsTconInfo *tcon; 1468 struct cifsTconInfo *tcon;
1418 FILE_UNIX_BASIC_INFO *info_buf_source = NULL; 1469 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1419 FILE_UNIX_BASIC_INFO *info_buf_target; 1470 FILE_UNIX_BASIC_INFO *info_buf_target;
1420 int xid, rc, tmprc; 1471 int xid, rc, tmprc;
1421 1472
1422 cifs_sb_target = CIFS_SB(target_dir->i_sb); 1473 cifs_sb = CIFS_SB(source_dir->i_sb);
1423 cifs_sb_source = CIFS_SB(source_dir->i_sb); 1474 tcon = cifs_sb->tcon;
1424 tcon = cifs_sb_source->tcon;
1425 1475
1426 xid = GetXid(); 1476 xid = GetXid();
1427 1477
1428 /* 1478 /*
1429 * BB: this might be allowed if same server, but different share.
1430 * Consider adding support for this
1431 */
1432 if (tcon != cifs_sb_target->tcon) {
1433 rc = -EXDEV;
1434 goto cifs_rename_exit;
1435 }
1436
1437 /*
1438 * we already have the rename sem so we do not need to 1479 * we already have the rename sem so we do not need to
1439 * grab it again here to protect the path integrity 1480 * grab it again here to protect the path integrity
1440 */ 1481 */
@@ -1469,17 +1510,16 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1469 info_buf_target = info_buf_source + 1; 1510 info_buf_target = info_buf_source + 1;
1470 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, 1511 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
1471 info_buf_source, 1512 info_buf_source,
1472 cifs_sb_source->local_nls, 1513 cifs_sb->local_nls,
1473 cifs_sb_source->mnt_cifs_flags & 1514 cifs_sb->mnt_cifs_flags &
1474 CIFS_MOUNT_MAP_SPECIAL_CHR); 1515 CIFS_MOUNT_MAP_SPECIAL_CHR);
1475 if (tmprc != 0) 1516 if (tmprc != 0)
1476 goto unlink_target; 1517 goto unlink_target;
1477 1518
1478 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, 1519 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName,
1479 toName, info_buf_target, 1520 info_buf_target,
1480 cifs_sb_target->local_nls, 1521 cifs_sb->local_nls,
1481 /* remap based on source sb */ 1522 cifs_sb->mnt_cifs_flags &
1482 cifs_sb_source->mnt_cifs_flags &
1483 CIFS_MOUNT_MAP_SPECIAL_CHR); 1523 CIFS_MOUNT_MAP_SPECIAL_CHR);
1484 1524
1485 if (tmprc == 0 && (info_buf_source->UniqueId == 1525 if (tmprc == 0 && (info_buf_source->UniqueId ==
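
Since the VFS only ever hands cifs_rename() dentries from one superblock, the cross-tcon -EXDEV check and the separate target cifs_sb are gone; a single cifs_sb now serves both path conversions. The UniqueId comparison this hunk ends on is the usual way to detect that source and target already resolve to the same server inode (hardlinks), which changes how a "target exists" failure should be handled. A trivial sketch of that test:

#include <stdint.h>
#include <stdio.h>

struct demo_unix_info { uint64_t unique_id; };

/* matching server-side UniqueIds mean the two paths are hardlinks
 * of one inode, so "target already exists" needs special handling */
static int same_server_file(const struct demo_unix_info *src,
                            const struct demo_unix_info *dst)
{
        return src->unique_id == dst->unique_id;
}

int main(void)
{
        struct demo_unix_info a = { 7 }, b = { 7 };

        printf("same file: %d\n", same_server_file(&a, &b));
        return 0;
}
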
@@ -1528,6 +1568,11 @@ cifs_inode_needs_reval(struct inode *inode)
1528 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1568 if (time_after_eq(jiffies, cifs_i->time + HZ))
1529 return true; 1569 return true;
1530 1570
1571 /* hardlinked files w/ noserverino get "special" treatment */
1572 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1573 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1574 return true;
1575
1531 return false; 1576 return false;
1532} 1577}
1533 1578
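
cifs_inode_needs_reval() gains a rule: on noserverino mounts, regular files with more than one link always revalidate, since without server inode numbers hardlinks cannot be tied together reliably. The predicate modeled in plain C; the demo fields stand in for the mount flag, S_ISREG(), i_nlink, and the jiffies arithmetic:

#include <stdbool.h>
#include <stdio.h>

struct demo_inode {
        bool server_inum;    /* CIFS_MOUNT_SERVER_INUM still set? */
        bool is_reg;         /* S_ISREG(inode->i_mode) */
        unsigned int nlink;  /* inode->i_nlink */
        long age;            /* jiffies - cifs_i->time */
        long hz;             /* HZ */
};

static bool needs_reval(const struct demo_inode *i)
{
        if (i->age >= i->hz)   /* time_after_eq(jiffies, time + HZ) */
                return true;

        /* hardlinked files w/ noserverino get "special" treatment */
        if (!i->server_inum && i->is_reg && i->nlink != 1)
                return true;

        return false;
}

int main(void)
{
        struct demo_inode fresh_hardlink = { false, true, 2, 0, 100 };

        printf("revalidate: %d\n", needs_reval(&fresh_hardlink));
        return 0;
}
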
@@ -1547,6 +1592,7 @@ cifs_invalidate_mapping(struct inode *inode)
1547 cifs_i->write_behind_rc = rc; 1592 cifs_i->write_behind_rc = rc;
1548 } 1593 }
1549 invalidate_remote_inode(inode); 1594 invalidate_remote_inode(inode);
1595 cifs_fscache_reset_inode_cookie(inode);
1550} 1596}
1551 1597
1552int cifs_revalidate_file(struct file *filp) 1598int cifs_revalidate_file(struct file *filp)
@@ -1594,9 +1640,9 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1594 goto check_inval; 1640 goto check_inval;
1595 } 1641 }
1596 1642
1597 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld " 1643 cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1598 "jiffies %ld", full_path, inode, inode->i_count.counter, 1644 "jiffies %ld", full_path, inode, inode->i_count.counter,
1599 dentry, dentry->d_time, jiffies)); 1645 dentry, dentry->d_time, jiffies);
1600 1646
1601 if (CIFS_SB(sb)->tcon->unix_ext) 1647 if (CIFS_SB(sb)->tcon->unix_ext)
1602 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1648 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
@@ -1642,26 +1688,16 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
1642 return rc; 1688 return rc;
1643} 1689}
1644 1690
1645static int cifs_vmtruncate(struct inode *inode, loff_t offset) 1691static void cifs_setsize(struct inode *inode, loff_t offset)
1646{ 1692{
1647 loff_t oldsize; 1693 loff_t oldsize;
1648 int err;
1649 1694
1650 spin_lock(&inode->i_lock); 1695 spin_lock(&inode->i_lock);
1651 err = inode_newsize_ok(inode, offset);
1652 if (err) {
1653 spin_unlock(&inode->i_lock);
1654 goto out;
1655 }
1656
1657 oldsize = inode->i_size; 1696 oldsize = inode->i_size;
1658 i_size_write(inode, offset); 1697 i_size_write(inode, offset);
1659 spin_unlock(&inode->i_lock); 1698 spin_unlock(&inode->i_lock);
1699
1660 truncate_pagecache(inode, oldsize, offset); 1700 truncate_pagecache(inode, oldsize, offset);
1661 if (inode->i_op->truncate)
1662 inode->i_op->truncate(inode);
1663out:
1664 return err;
1665} 1701}
1666 1702
1667static int 1703static int
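
cifs_vmtruncate() shrinks into cifs_setsize(): by the time this runs the server has already accepted the new size, so the local inode_newsize_ok() check and the i_op->truncate() callback are dropped, leaving an i_size update under i_lock plus a page-cache trim. A userspace model with a mutex standing in for i_lock:

#include <pthread.h>
#include <stdio.h>

struct demo_inode {
        pthread_mutex_t lock;   /* i_lock stand-in */
        long long size;         /* i_size */
};

static void demo_setsize(struct demo_inode *inode, long long offset)
{
        long long oldsize;

        pthread_mutex_lock(&inode->lock);
        oldsize = inode->size;
        inode->size = offset;           /* i_size_write() */
        pthread_mutex_unlock(&inode->lock);

        /* truncate_pagecache(inode, oldsize, offset) would run here */
        printf("resized %lld -> %lld\n", oldsize, offset);
}

int main(void)
{
        struct demo_inode i = { PTHREAD_MUTEX_INITIALIZER, 4096 };

        demo_setsize(&i, 0);
        return 0;
}
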
@@ -1690,12 +1726,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1690 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1726 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1691 npid, false); 1727 npid, false);
1692 cifsFileInfo_put(open_file); 1728 cifsFileInfo_put(open_file);
1693 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1729 cFYI(1, "SetFSize for attrs rc = %d", rc);
1694 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1730 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1695 unsigned int bytes_written; 1731 unsigned int bytes_written;
1696 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, 1732 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
1697 &bytes_written, NULL, NULL, 1); 1733 &bytes_written, NULL, NULL, 1);
1698 cFYI(1, ("Wrt seteof rc %d", rc)); 1734 cFYI(1, "Wrt seteof rc %d", rc);
1699 } 1735 }
1700 } else 1736 } else
1701 rc = -EINVAL; 1737 rc = -EINVAL;
@@ -1709,7 +1745,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1709 false, cifs_sb->local_nls, 1745 false, cifs_sb->local_nls,
1710 cifs_sb->mnt_cifs_flags & 1746 cifs_sb->mnt_cifs_flags &
1711 CIFS_MOUNT_MAP_SPECIAL_CHR); 1747 CIFS_MOUNT_MAP_SPECIAL_CHR);
1712 cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); 1748 cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
1713 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1749 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1714 __u16 netfid; 1750 __u16 netfid;
1715 int oplock = 0; 1751 int oplock = 0;
@@ -1726,7 +1762,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1726 attrs->ia_size, 1762 attrs->ia_size,
1727 &bytes_written, NULL, 1763 &bytes_written, NULL,
1728 NULL, 1); 1764 NULL, 1);
1729 cFYI(1, ("wrt seteof rc %d", rc)); 1765 cFYI(1, "wrt seteof rc %d", rc);
1730 CIFSSMBClose(xid, pTcon, netfid); 1766 CIFSSMBClose(xid, pTcon, netfid);
1731 } 1767 }
1732 } 1768 }
@@ -1734,7 +1770,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1734 1770
1735 if (rc == 0) { 1771 if (rc == 0) {
1736 cifsInode->server_eof = attrs->ia_size; 1772 cifsInode->server_eof = attrs->ia_size;
1737 rc = cifs_vmtruncate(inode, attrs->ia_size); 1773 cifs_setsize(inode, attrs->ia_size);
1738 cifs_truncate_page(inode->i_mapping, inode->i_size); 1774 cifs_truncate_page(inode->i_mapping, inode->i_size);
1739 } 1775 }
1740 1776
@@ -1754,19 +1790,17 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1754 struct cifs_unix_set_info_args *args = NULL; 1790 struct cifs_unix_set_info_args *args = NULL;
1755 struct cifsFileInfo *open_file; 1791 struct cifsFileInfo *open_file;
1756 1792
1757 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", 1793 cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
1758 direntry->d_name.name, attrs->ia_valid)); 1794 direntry->d_name.name, attrs->ia_valid);
1759 1795
1760 xid = GetXid(); 1796 xid = GetXid();
1761 1797
1762 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1798 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
1763 /* check if we have permission to change attrs */ 1799 attrs->ia_valid |= ATTR_FORCE;
1764 rc = inode_change_ok(inode, attrs); 1800
1765 if (rc < 0) 1801 rc = inode_change_ok(inode, attrs);
1766 goto out; 1802 if (rc < 0)
1767 else 1803 goto out;
1768 rc = 0;
1769 }
1770 1804
1771 full_path = build_path_from_dentry(direntry); 1805 full_path = build_path_from_dentry(direntry);
1772 if (full_path == NULL) { 1806 if (full_path == NULL) {
@@ -1852,18 +1886,24 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1852 CIFS_MOUNT_MAP_SPECIAL_CHR); 1886 CIFS_MOUNT_MAP_SPECIAL_CHR);
1853 } 1887 }
1854 1888
1855 if (!rc) { 1889 if (rc)
1856 rc = inode_setattr(inode, attrs); 1890 goto out;
1857 1891
1858 /* force revalidate when any of these times are set since some 1892 if ((attrs->ia_valid & ATTR_SIZE) &&
1859 of the fs types (eg ext3, fat) do not have fine enough 1893 attrs->ia_size != i_size_read(inode))
1860 time granularity to match protocol, and we do not have a 1894 truncate_setsize(inode, attrs->ia_size);
1861 a way (yet) to query the server fs's time granularity (and 1895
1862 whether it rounds times down). 1896 setattr_copy(inode, attrs);
1863 */ 1897 mark_inode_dirty(inode);
1864 if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))) 1898
1865 cifsInode->time = 0; 1899 /* force revalidate when any of these times are set since some
1866 } 1900 of the fs types (eg ext3, fat) do not have fine enough
 1901 time granularity to match protocol, and we do not have
1902 a way (yet) to query the server fs's time granularity (and
1903 whether it rounds times down).
1904 */
1905 if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))
1906 cifsInode->time = 0;
1867out: 1907out:
1868 kfree(args); 1908 kfree(args);
1869 kfree(full_path); 1909 kfree(full_path);
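
cifs_setattr_unix() now routes every request through inode_change_ok() (NO_PERM mounts simply add ATTR_FORCE so the ownership checks are waived), and on success replaces the removed inode_setattr() with an explicit sequence: truncate_setsize() only when ATTR_SIZE actually changes the size, then setattr_copy() plus mark_inode_dirty(). cifs_setattr_nounix() below gets the same treatment. A condensed model of that success-path ordering; the types and stubs are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_ATTR_SIZE 0x1

struct demo_attrs { unsigned int valid; long long size; };
struct demo_inode { long long size; bool dirty; };

static void demo_truncate_setsize(struct demo_inode *i, long long s)
{
        i->size = s;                  /* i_size update + pagecache trim */
}

static void demo_setattr_copy(struct demo_inode *i, struct demo_attrs *a)
{
        (void)i; (void)a;             /* uid/gid/times/mode copy */
}

/* success path, run only after the server accepted the change */
static void apply_attrs(struct demo_inode *inode, struct demo_attrs *attrs)
{
        if ((attrs->valid & DEMO_ATTR_SIZE) && attrs->size != inode->size)
                demo_truncate_setsize(inode, attrs->size);

        demo_setattr_copy(inode, attrs);
        inode->dirty = true;          /* mark_inode_dirty() */
}

int main(void)
{
        struct demo_inode i = { 100, false };
        struct demo_attrs a = { DEMO_ATTR_SIZE, 0 };

        apply_attrs(&i, &a);
        printf("size=%lld dirty=%d\n", i.size, i.dirty);
        return 0;
}
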
@@ -1885,17 +1925,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1885 1925
1886 xid = GetXid(); 1926 xid = GetXid();
1887 1927
1888 cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", 1928 cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
1889 direntry->d_name.name, attrs->ia_valid)); 1929 direntry->d_name.name, attrs->ia_valid);
1890 1930
1891 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1931 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
1892 /* check if we have permission to change attrs */ 1932 attrs->ia_valid |= ATTR_FORCE;
1893 rc = inode_change_ok(inode, attrs); 1933
1894 if (rc < 0) { 1934 rc = inode_change_ok(inode, attrs);
1895 FreeXid(xid); 1935 if (rc < 0) {
1896 return rc; 1936 FreeXid(xid);
1897 } else 1937 return rc;
1898 rc = 0;
1899 } 1938 }
1900 1939
1901 full_path = build_path_from_dentry(direntry); 1940 full_path = build_path_from_dentry(direntry);
@@ -1943,7 +1982,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1943 attrs->ia_valid &= ~ATTR_MODE; 1982 attrs->ia_valid &= ~ATTR_MODE;
1944 1983
1945 if (attrs->ia_valid & ATTR_MODE) { 1984 if (attrs->ia_valid & ATTR_MODE) {
1946 cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); 1985 cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
1947 mode = attrs->ia_mode; 1986 mode = attrs->ia_mode;
1948 } 1987 }
1949 1988
@@ -2003,8 +2042,17 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2003 2042
2004 /* do not need local check to inode_check_ok since the server does 2043 /* do not need local check to inode_check_ok since the server does
2005 that */ 2044 that */
2006 if (!rc) 2045 if (rc)
2007 rc = inode_setattr(inode, attrs); 2046 goto cifs_setattr_exit;
2047
2048 if ((attrs->ia_valid & ATTR_SIZE) &&
2049 attrs->ia_size != i_size_read(inode))
2050 truncate_setsize(inode, attrs->ia_size);
2051
2052 setattr_copy(inode, attrs);
2053 mark_inode_dirty(inode);
2054 return 0;
2055
2008cifs_setattr_exit: 2056cifs_setattr_exit:
2009 kfree(full_path); 2057 kfree(full_path);
2010 FreeXid(xid); 2058 FreeXid(xid);
@@ -2029,7 +2077,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2029#if 0 2077#if 0
2030void cifs_delete_inode(struct inode *inode) 2078void cifs_delete_inode(struct inode *inode)
2031{ 2079{
2032 cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode)); 2080 cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
2033 /* may have to add back in if and when safe distributed caching of 2081 /* may have to add back in if and when safe distributed caching of
2034 directories added e.g. via FindNotify */ 2082 directories added e.g. via FindNotify */
2035} 2083}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index f94650683a00..9d38a71c8e14 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -41,13 +41,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
41 __u64 ExtAttrMask = 0; 41 __u64 ExtAttrMask = 0;
42 __u64 caps; 42 __u64 caps;
43 struct cifsTconInfo *tcon; 43 struct cifsTconInfo *tcon;
44 struct cifsFileInfo *pSMBFile = 44 struct cifsFileInfo *pSMBFile = filep->private_data;
45 (struct cifsFileInfo *)filep->private_data;
46#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
47 46
48 xid = GetXid(); 47 xid = GetXid();
49 48
50 cFYI(1, ("ioctl file %p cmd %u arg %lu", filep, command, arg)); 49 cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg);
51 50
52 cifs_sb = CIFS_SB(inode->i_sb); 51 cifs_sb = CIFS_SB(inode->i_sb);
53 52
@@ -64,12 +63,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
64 63
65 switch (command) { 64 switch (command) {
66 case CIFS_IOC_CHECKUMOUNT: 65 case CIFS_IOC_CHECKUMOUNT:
67 cFYI(1, ("User unmount attempted")); 66 cFYI(1, "User unmount attempted");
68 if (cifs_sb->mnt_uid == current_uid()) 67 if (cifs_sb->mnt_uid == current_uid())
69 rc = 0; 68 rc = 0;
70 else { 69 else {
71 rc = -EACCES; 70 rc = -EACCES;
72 cFYI(1, ("uids do not match")); 71 cFYI(1, "uids do not match");
73 } 72 }
74 break; 73 break;
75#ifdef CONFIG_CIFS_POSIX 74#ifdef CONFIG_CIFS_POSIX
@@ -97,11 +96,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 96 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
98 extAttrBits, &ExtAttrMask);*/ 97 extAttrBits, &ExtAttrMask);*/
99 } 98 }
100 cFYI(1, ("set flags not implemented yet")); 99 cFYI(1, "set flags not implemented yet");
101 break; 100 break;
102#endif /* CONFIG_CIFS_POSIX */ 101#endif /* CONFIG_CIFS_POSIX */
103 default: 102 default:
104 cFYI(1, ("unsupported ioctl")); 103 cFYI(1, "unsupported ioctl");
105 break; 104 break;
106 } 105 }
107 106
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index c1a9d4236a8c..473ca8033656 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -139,7 +139,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
139 if (!full_path) 139 if (!full_path)
140 goto out; 140 goto out;
141 141
142 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
143 143
144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
145 cifs_sb->local_nls); 145 cifs_sb->local_nls);
@@ -178,8 +178,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
178 return rc; 178 return rc;
179 } 179 }
180 180
181 cFYI(1, ("Full path: %s", full_path)); 181 cFYI(1, "Full path: %s", full_path);
182 cFYI(1, ("symname is %s", symname)); 182 cFYI(1, "symname is %s", symname);
183 183
184 /* BB what if DFS and this volume is on different share? BB */ 184 /* BB what if DFS and this volume is on different share? BB */
185 if (pTcon->unix_ext) 185 if (pTcon->unix_ext)
@@ -198,8 +198,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
198 inode->i_sb, xid, NULL); 198 inode->i_sb, xid, NULL);
199 199
200 if (rc != 0) { 200 if (rc != 0) {
201 cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d", 201 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
202 rc)); 202 rc);
203 } else { 203 } else {
204 if (pTcon->nocase) 204 if (pTcon->nocase)
205 direntry->d_op = &cifs_ci_dentry_ops; 205 direntry->d_op = &cifs_ci_dentry_ops;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d1474996a812..3ccadc1326d6 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -51,7 +51,7 @@ _GetXid(void)
51 if (GlobalTotalActiveXid > GlobalMaxActiveXid) 51 if (GlobalTotalActiveXid > GlobalMaxActiveXid)
52 GlobalMaxActiveXid = GlobalTotalActiveXid; 52 GlobalMaxActiveXid = GlobalTotalActiveXid;
53 if (GlobalTotalActiveXid > 65000) 53 if (GlobalTotalActiveXid > 65000)
54 cFYI(1, ("warning: more than 65000 requests active")); 54 cFYI(1, "warning: more than 65000 requests active");
55 xid = GlobalCurrentXid++; 55 xid = GlobalCurrentXid++;
56 spin_unlock(&GlobalMid_Lock); 56 spin_unlock(&GlobalMid_Lock);
57 return xid; 57 return xid;
@@ -88,7 +88,7 @@ void
88sesInfoFree(struct cifsSesInfo *buf_to_free) 88sesInfoFree(struct cifsSesInfo *buf_to_free)
89{ 89{
90 if (buf_to_free == NULL) { 90 if (buf_to_free == NULL) {
91 cFYI(1, ("Null buffer passed to sesInfoFree")); 91 cFYI(1, "Null buffer passed to sesInfoFree");
92 return; 92 return;
93 } 93 }
94 94
@@ -126,7 +126,7 @@ void
126tconInfoFree(struct cifsTconInfo *buf_to_free) 126tconInfoFree(struct cifsTconInfo *buf_to_free)
127{ 127{
128 if (buf_to_free == NULL) { 128 if (buf_to_free == NULL) {
129 cFYI(1, ("Null buffer passed to tconInfoFree")); 129 cFYI(1, "Null buffer passed to tconInfoFree");
130 return; 130 return;
131 } 131 }
132 atomic_dec(&tconInfoAllocCount); 132 atomic_dec(&tconInfoAllocCount);
@@ -166,7 +166,7 @@ void
166cifs_buf_release(void *buf_to_free) 166cifs_buf_release(void *buf_to_free)
167{ 167{
168 if (buf_to_free == NULL) { 168 if (buf_to_free == NULL) {
169 /* cFYI(1, ("Null buffer passed to cifs_buf_release"));*/ 169 /* cFYI(1, "Null buffer passed to cifs_buf_release");*/
170 return; 170 return;
171 } 171 }
172 mempool_free(buf_to_free, cifs_req_poolp); 172 mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +202,7 @@ cifs_small_buf_release(void *buf_to_free)
202{ 202{
203 203
204 if (buf_to_free == NULL) { 204 if (buf_to_free == NULL) {
205 cFYI(1, ("Null buffer passed to cifs_small_buf_release")); 205 cFYI(1, "Null buffer passed to cifs_small_buf_release");
206 return; 206 return;
207 } 207 }
208 mempool_free(buf_to_free, cifs_sm_req_poolp); 208 mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -345,19 +345,19 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
345 /* with userid/password pairs found on the smb session */ 345 /* with userid/password pairs found on the smb session */
346 /* for other target tcp/ip addresses BB */ 346 /* for other target tcp/ip addresses BB */
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, ("Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid")); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 read_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
354 if (ses->server == treeCon->ses->server) { 354 if (ses->server == treeCon->ses->server) {
355 cFYI(1, ("found matching uid substitute right smb_uid")); 355 cFYI(1, "found matching uid substitute right smb_uid");
356 buffer->Uid = ses->Suid; 356 buffer->Uid = ses->Suid;
357 break; 357 break;
358 } else { 358 } else {
359 /* BB eventually call cifs_setup_session here */ 359 /* BB eventually call cifs_setup_session here */
360 cFYI(1, ("local UID found but no smb sess with this server exists")); 360 cFYI(1, "local UID found but no smb sess with this server exists");
361 } 361 }
362 } 362 }
363 } 363 }
@@ -394,17 +394,16 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 394 if (smb->Command == SMB_COM_LOCKING_ANDX)
395 return 0; 395 return 0;
396 else 396 else
397 cERROR(1, ("Received Request not response")); 397 cERROR(1, "Received Request not response");
398 } 398 }
399 } else { /* bad signature or mid */ 399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) 400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, 401 cERROR(1, "Bad protocol string signature header %x",
402 ("Bad protocol string signature header %x", 402 *(unsigned int *) smb->Protocol);
403 *(unsigned int *) smb->Protocol));
404 if (mid != smb->Mid) 403 if (mid != smb->Mid)
405 cERROR(1, ("Mids do not match")); 404 cERROR(1, "Mids do not match");
406 } 405 }
407 cERROR(1, ("bad smb detected. The Mid=%d", smb->Mid)); 406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
408 return 1; 407 return 1;
409} 408}
410 409
@@ -413,7 +412,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
413{ 412{
414 __u32 len = smb->smb_buf_length; 413 __u32 len = smb->smb_buf_length;
415 __u32 clc_len; /* calculated length */ 414 __u32 clc_len; /* calculated length */
416 cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len)); 415 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
417 416
418 if (length < 2 + sizeof(struct smb_hdr)) { 417 if (length < 2 + sizeof(struct smb_hdr)) {
419 if ((length >= sizeof(struct smb_hdr) - 1) 418 if ((length >= sizeof(struct smb_hdr) - 1)
@@ -437,15 +436,15 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
437 tmp[sizeof(struct smb_hdr)+1] = 0; 436 tmp[sizeof(struct smb_hdr)+1] = 0;
438 return 0; 437 return 0;
439 } 438 }
440 cERROR(1, ("rcvd invalid byte count (bcc)")); 439 cERROR(1, "rcvd invalid byte count (bcc)");
441 } else { 440 } else {
442 cERROR(1, ("Length less than smb header size")); 441 cERROR(1, "Length less than smb header size");
443 } 442 }
444 return 1; 443 return 1;
445 } 444 }
446 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 445 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
447 cERROR(1, ("smb length greater than MaxBufSize, mid=%d", 446 cERROR(1, "smb length greater than MaxBufSize, mid=%d",
448 smb->Mid)); 447 smb->Mid);
449 return 1; 448 return 1;
450 } 449 }
451 450
@@ -454,8 +453,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
454 clc_len = smbCalcSize_LE(smb); 453 clc_len = smbCalcSize_LE(smb);
455 454
456 if (4 + len != length) { 455 if (4 + len != length) {
457 cERROR(1, ("Length read does not match RFC1001 length %d", 456 cERROR(1, "Length read does not match RFC1001 length %d",
458 len)); 457 len);
459 return 1; 458 return 1;
460 } 459 }
461 460
@@ -466,8 +465,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
466 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
467 return 0; /* bcc wrapped */ 466 return 0; /* bcc wrapped */
468 } 467 }
469 cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d", 468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
470 clc_len, 4 + len, smb->Mid)); 469 clc_len, 4 + len, smb->Mid);
471 /* Windows XP can return a few bytes too much, presumably 470 /* Windows XP can return a few bytes too much, presumably
472 an illegal pad, at the end of byte range lock responses 471 an illegal pad, at the end of byte range lock responses
473 so we allow for that three byte pad, as long as actual 472 so we allow for that three byte pad, as long as actual
@@ -482,8 +481,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
482 if ((4+len > clc_len) && (len <= clc_len + 512)) 481 if ((4+len > clc_len) && (len <= clc_len + 512))
483 return 0; 482 return 0;
484 else { 483 else {
485 cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d", 484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
486 len, smb->Mid)); 485 len, smb->Mid);
487 return 1; 486 return 1;
488 } 487 }
489 } 488 }
@@ -499,9 +498,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
499 struct cifsTconInfo *tcon; 498 struct cifsTconInfo *tcon;
500 struct cifsInodeInfo *pCifsInode; 499 struct cifsInodeInfo *pCifsInode;
501 struct cifsFileInfo *netfile; 500 struct cifsFileInfo *netfile;
502 int rc;
503 501
504 cFYI(1, ("Checking for oplock break or dnotify response")); 502 cFYI(1, "Checking for oplock break or dnotify response");
505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 503 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
506 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { 504 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
507 struct smb_com_transaction_change_notify_rsp *pSMBr = 505 struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -513,15 +511,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
513 511
514 pnotify = (struct file_notify_information *) 512 pnotify = (struct file_notify_information *)
515 ((char *)&pSMBr->hdr.Protocol + data_offset); 513 ((char *)&pSMBr->hdr.Protocol + data_offset);
516 cFYI(1, ("dnotify on %s Action: 0x%x", 514 cFYI(1, "dnotify on %s Action: 0x%x",
517 pnotify->FileName, pnotify->Action)); 515 pnotify->FileName, pnotify->Action);
518 /* cifs_dump_mem("Rcvd notify Data: ",buf, 516 /* cifs_dump_mem("Rcvd notify Data: ",buf,
519 sizeof(struct smb_hdr)+60); */ 517 sizeof(struct smb_hdr)+60); */
520 return true; 518 return true;
521 } 519 }
522 if (pSMBr->hdr.Status.CifsError) { 520 if (pSMBr->hdr.Status.CifsError) {
523 cFYI(1, ("notify err 0x%d", 521 cFYI(1, "notify err 0x%d",
524 pSMBr->hdr.Status.CifsError)); 522 pSMBr->hdr.Status.CifsError);
525 return true; 523 return true;
526 } 524 }
527 return false; 525 return false;
@@ -535,7 +533,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
535 large dirty files cached on the client */ 533 large dirty files cached on the client */
536 if ((NT_STATUS_INVALID_HANDLE) == 534 if ((NT_STATUS_INVALID_HANDLE) ==
537 le32_to_cpu(pSMB->hdr.Status.CifsError)) { 535 le32_to_cpu(pSMB->hdr.Status.CifsError)) {
538 cFYI(1, ("invalid handle on oplock break")); 536 cFYI(1, "invalid handle on oplock break");
539 return true; 537 return true;
540 } else if (ERRbadfid == 538 } else if (ERRbadfid ==
541 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { 539 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -547,8 +545,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
547 if (pSMB->hdr.WordCount != 8) 545 if (pSMB->hdr.WordCount != 8)
548 return false; 546 return false;
549 547
550 cFYI(1, ("oplock type 0x%d level 0x%d", 548 cFYI(1, "oplock type 0x%d level 0x%d",
551 pSMB->LockType, pSMB->OplockLevel)); 549 pSMB->LockType, pSMB->OplockLevel);
552 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) 550 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
553 return false; 551 return false;
554 552
@@ -579,30 +577,35 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
579 return true; 577 return true;
580 } 578 }
581 579
582 cFYI(1, ("file id match, oplock break")); 580 cFYI(1, "file id match, oplock break");
583 pCifsInode = CIFS_I(netfile->pInode); 581 pCifsInode = CIFS_I(netfile->pInode);
584 pCifsInode->clientCanCacheAll = false; 582 pCifsInode->clientCanCacheAll = false;
585 if (pSMB->OplockLevel == 0) 583 if (pSMB->OplockLevel == 0)
586 pCifsInode->clientCanCacheRead = false; 584 pCifsInode->clientCanCacheRead = false;
587 rc = slow_work_enqueue(&netfile->oplock_break); 585
588 if (rc) { 586 /*
589 cERROR(1, ("failed to enqueue oplock " 587 * cifs_oplock_break_put() can't be called
590 "break: %d\n", rc)); 588 * from here. Get reference after queueing
591 } else { 589 * succeeded. cifs_oplock_break() will
592 netfile->oplock_break_cancelled = false; 590 * synchronize using GlobalSMSSeslock.
593 } 591 */
592 if (queue_work(system_nrt_wq,
593 &netfile->oplock_break))
594 cifs_oplock_break_get(netfile);
595 netfile->oplock_break_cancelled = false;
596
594 read_unlock(&GlobalSMBSeslock); 597 read_unlock(&GlobalSMBSeslock);
595 read_unlock(&cifs_tcp_ses_lock); 598 read_unlock(&cifs_tcp_ses_lock);
596 return true; 599 return true;
597 } 600 }
598 read_unlock(&GlobalSMBSeslock); 601 read_unlock(&GlobalSMBSeslock);
599 read_unlock(&cifs_tcp_ses_lock); 602 read_unlock(&cifs_tcp_ses_lock);
600 cFYI(1, ("No matching file for oplock break")); 603 cFYI(1, "No matching file for oplock break");
601 return true; 604 return true;
602 } 605 }
603 } 606 }
604 read_unlock(&cifs_tcp_ses_lock); 607 read_unlock(&cifs_tcp_ses_lock);
605 cFYI(1, ("Can not process oplock break for non-existent connection")); 608 cFYI(1, "Can not process oplock break for non-existent connection");
606 return true; 609 return true;
607} 610}
608 611
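
The oplock-break path moves from the slow_work engine to queue_work() on system_nrt_wq, and per the new comment the extra file reference is taken only after queueing succeeds: queue_work() returns nonzero unless the work item was already pending, so the worker's put always pairs with exactly one get. That discipline, modeled with a boolean pending flag in place of the real workqueue:

#include <stdbool.h>
#include <stdio.h>

struct demo_netfile {
        int refcount;
        bool work_pending;      /* what queue_work() tests internally */
};

/* returns true only if the work was not already queued */
static bool demo_queue_work(struct demo_netfile *f)
{
        if (f->work_pending)
                return false;
        f->work_pending = true;
        return true;
}

static void demo_oplock_break_get(struct demo_netfile *f) { f->refcount++; }

int main(void)
{
        struct demo_netfile f = { 1, false };

        /* pair the get with a successful enqueue only */
        if (demo_queue_work(&f))
                demo_oplock_break_get(&f);
        if (demo_queue_work(&f))        /* second enqueue is a no-op */
                demo_oplock_break_get(&f);

        printf("refcount=%d (one worker, one extra ref)\n", f.refcount);
        return 0;
}
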
@@ -721,11 +724,11 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
721{ 724{
722 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 725 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
723 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 726 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
724 cERROR(1, ("Autodisabling the use of server inode numbers on " 727 cERROR(1, "Autodisabling the use of server inode numbers on "
725 "%s. This server doesn't seem to support them " 728 "%s. This server doesn't seem to support them "
726 "properly. Hardlinks will not be recognized on this " 729 "properly. Hardlinks will not be recognized on this "
727 "mount. Consider mounting with the \"noserverino\" " 730 "mount. Consider mounting with the \"noserverino\" "
728 "option to silence this message.", 731 "option to silence this message.",
729 cifs_sb->tcon->treeName)); 732 cifs_sb->tcon->treeName);
730 } 733 }
731} 734}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index bd6d6895730d..9aad47a2d62f 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
61 {ERRremcd, -EACCES}, 61 {ERRremcd, -EACCES},
62 {ERRdiffdevice, -EXDEV}, 62 {ERRdiffdevice, -EXDEV},
63 {ERRnofiles, -ENOENT}, 63 {ERRnofiles, -ENOENT},
64 {ERRwriteprot, -EROFS},
64 {ERRbadshare, -ETXTBSY}, 65 {ERRbadshare, -ETXTBSY},
65 {ERRlock, -EACCES}, 66 {ERRlock, -EACCES},
66 {ERRunsup, -EINVAL}, 67 {ERRunsup, -EINVAL},
@@ -139,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
139 * Returns 0 on failure. 140 * Returns 0 on failure.
140 */ 141 */
141static int 142static int
142cifs_inet_pton(const int address_family, const char *cp, void *dst) 143cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
143{ 144{
144 int ret = 0; 145 int ret = 0;
145 146
146 /* calculate length by finding first slash or NULL */ 147 /* calculate length by finding first slash or NULL */
147 if (address_family == AF_INET) 148 if (address_family == AF_INET)
148 ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL); 149 ret = in4_pton(cp, len, dst, '\\', NULL);
149 else if (address_family == AF_INET6) 150 else if (address_family == AF_INET6)
150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); 151 ret = in6_pton(cp, len, dst , '\\', NULL);
151 152
152 cFYI(DBG2, ("address conversion returned %d for %s", ret, cp)); 153 cFYI(DBG2, "address conversion returned %d for %*.*s",
154 ret, len, len, cp);
153 if (ret > 0) 155 if (ret > 0)
154 ret = 1; 156 ret = 1;
155 return ret; 157 return ret;
@@ -164,43 +166,70 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
164 * Returns 0 on failure. 166 * Returns 0 on failure.
165 */ 167 */
166int 168int
167cifs_convert_address(char *src, void *dst) 169cifs_convert_address(struct sockaddr *dst, const char *src, int len)
168{ 170{
169 int rc; 171 int rc, alen, slen;
170 char *pct, *endp; 172 const char *pct;
173 char *endp, scope_id[13];
171 struct sockaddr_in *s4 = (struct sockaddr_in *) dst; 174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
172 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; 175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
173 176
174 /* IPv4 address */ 177 /* IPv4 address */
175 if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) { 178 if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) {
176 s4->sin_family = AF_INET; 179 s4->sin_family = AF_INET;
177 return 1; 180 return 1;
178 } 181 }
179 182
180 /* temporarily terminate string */ 183 /* attempt to exclude the scope ID from the address part */
181 pct = strchr(src, '%'); 184 pct = memchr(src, '%', len);
182 if (pct) 185 alen = pct ? pct - src : len;
183 *pct = '\0';
184
185 rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
186
187 /* repair temp termination (if any) and make pct point to scopeid */
188 if (pct)
189 *pct++ = '%';
190 186
187 rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr);
191 if (!rc) 188 if (!rc)
192 return rc; 189 return rc;
193 190
194 s6->sin6_family = AF_INET6; 191 s6->sin6_family = AF_INET6;
195 if (pct) { 192 if (pct) {
193 /* grab the scope ID */
194 slen = len - (alen + 1);
195 if (slen <= 0 || slen > 12)
196 return 0;
197 memcpy(scope_id, pct + 1, slen);
198 scope_id[slen] = '\0';
199
196 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); 200 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
197 if (!*pct || *endp) 201 if (endp != scope_id + slen)
198 return 0; 202 return 0;
199 } 203 }
200 204
201 return rc; 205 return rc;
202} 206}
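
cifs_convert_address() now takes an explicit length and a const source: instead of writing a temporary NUL over the '%', it computes the address length up to the '%' and copies at most 12 scope-ID characters into a local buffer. A standalone parse of the same shape using inet_pton() in place of the kernel's in6_pton(); note that this sketch runs strtoul() over the bounded scope_id copy so the end-pointer check lines up with the buffer it parsed:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* split "addr%scope" without modifying src; returns 0 on failure */
static int parse_v6_with_scope(const char *src, int len,
                               struct sockaddr_in6 *s6)
{
        char buf[INET6_ADDRSTRLEN], scope_id[13], *endp;
        const char *pct = memchr(src, '%', len);
        int alen = pct ? (int)(pct - src) : len;
        int slen;

        if (alen <= 0 || alen >= (int)sizeof(buf))
                return 0;
        memcpy(buf, src, alen);
        buf[alen] = '\0';

        if (inet_pton(AF_INET6, buf, &s6->sin6_addr) != 1)
                return 0;
        s6->sin6_family = AF_INET6;

        if (pct) {
                /* grab the scope ID, bounded to 12 characters */
                slen = len - (alen + 1);
                if (slen <= 0 || slen > 12)
                        return 0;
                memcpy(scope_id, pct + 1, slen);
                scope_id[slen] = '\0';
                s6->sin6_scope_id = (uint32_t)strtoul(scope_id, &endp, 0);
                if (endp != scope_id + slen)
                        return 0;
        }
        return 1;
}

int main(void)
{
        struct sockaddr_in6 s6;
        const char *s = "fe80::1%2";

        printf("ok=%d scope=%u\n",
               parse_v6_with_scope(s, (int)strlen(s), &s6),
               s6.sin6_scope_id);
        return 0;
}
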
203 207
208int
209cifs_set_port(struct sockaddr *addr, const unsigned short int port)
210{
211 switch (addr->sa_family) {
212 case AF_INET:
213 ((struct sockaddr_in *)addr)->sin_port = htons(port);
214 break;
215 case AF_INET6:
216 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
217 break;
218 default:
219 return 0;
220 }
221 return 1;
222}
223
224int
225cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
226 const unsigned short int port)
227{
228 if (!cifs_convert_address(dst, src, len))
229 return 0;
230 return cifs_set_port(dst, port);
231}
232
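
cifs_set_port() dispatches on sa_family so callers can set the port without caring whether they hold an IPv4 or IPv6 sockaddr, and cifs_fill_sockaddr() simply composes the two helpers. The same dispatch in standalone C:

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static int set_port(struct sockaddr *addr, unsigned short port)
{
        switch (addr->sa_family) {
        case AF_INET:
                ((struct sockaddr_in *)addr)->sin_port = htons(port);
                return 1;
        case AF_INET6:
                ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
                return 1;
        default:
                return 0;       /* unknown family: report failure */
        }
}

int main(void)
{
        struct sockaddr_in s4;

        memset(&s4, 0, sizeof(s4));
        s4.sin_family = AF_INET;
        inet_pton(AF_INET, "192.0.2.1", &s4.sin_addr);

        if (set_port((struct sockaddr *)&s4, 445))
                printf("port=%u\n", ntohs(s4.sin_port));
        return 0;
}
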
204/***************************************************************************** 233/*****************************************************************************
205convert a NT status code to a dos class/code 234convert a NT status code to a dos class/code
206 *****************************************************************************/ 235 *****************************************************************************/
@@ -870,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
870 } 899 }
871 /* else ERRHRD class errors or junk - return EIO */ 900 /* else ERRHRD class errors or junk - return EIO */
872 901
873 cFYI(1, ("Mapping smb error code %d to POSIX err %d", 902 cFYI(1, "Mapping smb error code %d to POSIX err %d",
874 smberrcode, rc)); 903 smberrcode, rc);
875 904
876 /* generic corrective action e.g. reconnect SMB session on 905 /* generic corrective action e.g. reconnect SMB session on
877 * ERRbaduid could be added */ 906 * ERRbaduid could be added */
@@ -940,20 +969,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
940 SMB_TIME *st = (SMB_TIME *)&time; 969 SMB_TIME *st = (SMB_TIME *)&time;
941 SMB_DATE *sd = (SMB_DATE *)&date; 970 SMB_DATE *sd = (SMB_DATE *)&date;
942 971
943 cFYI(1, ("date %d time %d", date, time)); 972 cFYI(1, "date %d time %d", date, time);
944 973
945 sec = 2 * st->TwoSeconds; 974 sec = 2 * st->TwoSeconds;
946 min = st->Minutes; 975 min = st->Minutes;
947 if ((sec > 59) || (min > 59)) 976 if ((sec > 59) || (min > 59))
948 cERROR(1, ("illegal time min %d sec %d", min, sec)); 977 cERROR(1, "illegal time min %d sec %d", min, sec);
949 sec += (min * 60); 978 sec += (min * 60);
950 sec += 60 * 60 * st->Hours; 979 sec += 60 * 60 * st->Hours;
951 if (st->Hours > 24) 980 if (st->Hours > 24)
952 cERROR(1, ("illegal hours %d", st->Hours)); 981 cERROR(1, "illegal hours %d", st->Hours);
953 days = sd->Day; 982 days = sd->Day;
954 month = sd->Month; 983 month = sd->Month;
955 if ((days > 31) || (month > 12)) { 984 if ((days > 31) || (month > 12)) {
956 cERROR(1, ("illegal date, month %d day: %d", month, days)); 985 cERROR(1, "illegal date, month %d day: %d", month, days);
957 if (month > 12) 986 if (month > 12)
958 month = 12; 987 month = 12;
959 } 988 }
@@ -979,7 +1008,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
979 1008
980 ts.tv_sec = sec + offset; 1009 ts.tv_sec = sec + offset;
981 1010
982 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 1011 /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
983 1012
984 ts.tv_nsec = 0; 1013 ts.tv_nsec = 0;
985 return ts; 1014 return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 18e0bc1fb593..d5e591fab475 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -47,15 +47,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
47 if (file) { 47 if (file) {
48 cf = file->private_data; 48 cf = file->private_data;
49 if (cf == NULL) { 49 if (cf == NULL) {
50 cFYI(1, ("empty cifs private file data")); 50 cFYI(1, "empty cifs private file data");
51 return; 51 return;
52 } 52 }
53 if (cf->invalidHandle) 53 if (cf->invalidHandle)
54 cFYI(1, ("invalid handle")); 54 cFYI(1, "invalid handle");
55 if (cf->srch_inf.endOfSearch) 55 if (cf->srch_inf.endOfSearch)
56 cFYI(1, ("end of search")); 56 cFYI(1, "end of search");
57 if (cf->srch_inf.emptyDir) 57 if (cf->srch_inf.emptyDir)
58 cFYI(1, ("empty dir")); 58 cFYI(1, "empty dir");
59 } 59 }
60} 60}
61#else 61#else
@@ -76,7 +76,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
76 struct inode *inode; 76 struct inode *inode;
77 struct super_block *sb = parent->d_inode->i_sb; 77 struct super_block *sb = parent->d_inode->i_sb;
78 78
79 cFYI(1, ("For %s", name->name)); 79 cFYI(1, "For %s", name->name);
80 80
81 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, name);
@@ -214,7 +214,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
214 fid, 214 fid,
215 cifs_sb->local_nls); 215 cifs_sb->local_nls);
216 if (CIFSSMBClose(xid, ptcon, fid)) { 216 if (CIFSSMBClose(xid, ptcon, fid)) {
217 cFYI(1, ("Error closing temporary reparsepoint open)")); 217 cFYI(1, "Error closing temporary reparsepoint open");
218 } 218 }
219 } 219 }
220} 220}
@@ -252,7 +252,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
252 if (full_path == NULL) 252 if (full_path == NULL)
253 return -ENOMEM; 253 return -ENOMEM;
254 254
255 cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos)); 255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
256 256
257ffirst_retry: 257ffirst_retry:
258 /* test for Unix extensions */ 258 /* test for Unix extensions */
@@ -297,7 +297,7 @@ static int cifs_unicode_bytelen(char *str)
297 if (ustr[len] == 0) 297 if (ustr[len] == 0)
298 return len << 1; 298 return len << 1;
299 } 299 }
300 cFYI(1, ("Unicode string longer than PATH_MAX found")); 300 cFYI(1, "Unicode string longer than PATH_MAX found");
301 return len << 1; 301 return len << 1;
302} 302}
303 303
@@ -314,19 +314,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
314 pfData->FileNameLength; 314 pfData->FileNameLength;
315 } else 315 } else
316 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); 316 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
317 cFYI(1, ("new entry %p old entry %p", new_entry, old_entry)); 317 cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
318 /* validate that new_entry is not past end of SMB */ 318 /* validate that new_entry is not past end of SMB */
319 if (new_entry >= end_of_smb) { 319 if (new_entry >= end_of_smb) {
320 cERROR(1, 320 cERROR(1, "search entry %p began after end of SMB %p old entry %p",
321 ("search entry %p began after end of SMB %p old entry %p", 321 new_entry, end_of_smb, old_entry);
322 new_entry, end_of_smb, old_entry));
323 return NULL; 322 return NULL;
324 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && 323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
325 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) 324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
326 || ((level != SMB_FIND_FILE_INFO_STANDARD) && 325 || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
327 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { 326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
328 cERROR(1, ("search entry %p extends after end of SMB %p", 327 cERROR(1, "search entry %p extends after end of SMB %p",
329 new_entry, end_of_smb)); 328 new_entry, end_of_smb);
330 return NULL; 329 return NULL;
331 } else 330 } else
332 return new_entry; 331 return new_entry;
@@ -380,8 +379,8 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
380 filename = &pFindData->FileName[0]; 379 filename = &pFindData->FileName[0];
381 len = pFindData->FileNameLength; 380 len = pFindData->FileNameLength;
382 } else { 381 } else {
383 cFYI(1, ("Unknown findfirst level %d", 382 cFYI(1, "Unknown findfirst level %d",
384 cfile->srch_inf.info_level)); 383 cfile->srch_inf.info_level);
385 } 384 }
386 385
387 if (filename) { 386 if (filename) {
@@ -481,7 +480,7 @@ static int cifs_save_resume_key(const char *current_entry,
481 len = (unsigned int)pFindData->FileNameLength; 480 len = (unsigned int)pFindData->FileNameLength;
482 cifsFile->srch_inf.resume_key = pFindData->ResumeKey; 481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
483 } else { 482 } else {
484 cFYI(1, ("Unknown findfirst level %d", level)); 483 cFYI(1, "Unknown findfirst level %d", level);
485 return -EINVAL; 484 return -EINVAL;
486 } 485 }
487 cifsFile->srch_inf.resume_name_len = len; 486 cifsFile->srch_inf.resume_name_len = len;
@@ -525,7 +524,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
525 is_dir_changed(file)) || 524 is_dir_changed(file)) ||
526 (index_to_find < first_entry_in_buffer)) { 525 (index_to_find < first_entry_in_buffer)) {
527 /* close and restart search */ 526 /* close and restart search */
528 cFYI(1, ("search backing up - close and restart search")); 527 cFYI(1, "search backing up - close and restart search");
529 write_lock(&GlobalSMBSeslock); 528 write_lock(&GlobalSMBSeslock);
530 if (!cifsFile->srch_inf.endOfSearch && 529 if (!cifsFile->srch_inf.endOfSearch &&
531 !cifsFile->invalidHandle) { 530 !cifsFile->invalidHandle) {
@@ -535,7 +534,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
535 } else 534 } else
536 write_unlock(&GlobalSMBSeslock); 535 write_unlock(&GlobalSMBSeslock);
537 if (cifsFile->srch_inf.ntwrk_buf_start) { 536 if (cifsFile->srch_inf.ntwrk_buf_start) {
538 cFYI(1, ("freeing SMB ff cache buf on search rewind")); 537 cFYI(1, "freeing SMB ff cache buf on search rewind");
539 if (cifsFile->srch_inf.smallBuf) 538 if (cifsFile->srch_inf.smallBuf)
540 cifs_small_buf_release(cifsFile->srch_inf. 539 cifs_small_buf_release(cifsFile->srch_inf.
541 ntwrk_buf_start); 540 ntwrk_buf_start);
@@ -546,8 +545,8 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
546 } 545 }
547 rc = initiate_cifs_search(xid, file); 546 rc = initiate_cifs_search(xid, file);
548 if (rc) { 547 if (rc) {
549 cFYI(1, ("error %d reinitiating a search on rewind", 548 cFYI(1, "error %d reinitiating a search on rewind",
550 rc)); 549 rc);
551 return rc; 550 return rc;
552 } 551 }
553 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -555,7 +554,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
555 554
556 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
557 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
558 cFYI(1, ("calling findnext2")); 557 cFYI(1, "calling findnext2");
559 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
560 &cifsFile->srch_inf); 559 &cifsFile->srch_inf);
561 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -575,7 +574,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
575 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry 574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
576 - cifsFile->srch_inf.entries_in_buffer; 575 - cifsFile->srch_inf.entries_in_buffer;
577 pos_in_buf = index_to_find - first_entry_in_buffer; 576 pos_in_buf = index_to_find - first_entry_in_buffer;
578 cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf)); 577 cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
579 578
 		for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
 			/* go entry by entry figuring out which is first */
@@ -584,19 +583,19 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		}
 		if ((current_entry == NULL) && (i < pos_in_buf)) {
 			/* BB fixme - check if we should flag this error */
-			cERROR(1, ("reached end of buf searching for pos in buf"
+			cERROR(1, "reached end of buf searching for pos in buf"
 				" %d index to find %lld rc %d",
-				pos_in_buf, index_to_find, rc));
+				pos_in_buf, index_to_find, rc);
 		}
 		rc = 0;
 		*ppCurrentEntry = current_entry;
 	} else {
-		cFYI(1, ("index not in buffer - could not findnext into it"));
+		cFYI(1, "index not in buffer - could not findnext into it");
 		return 0;
 	}
 
 	if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
-		cFYI(1, ("can not return entries pos_in_buf beyond last"));
+		cFYI(1, "can not return entries pos_in_buf beyond last");
 		*num_to_ret = 0;
 	} else
 		*num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -656,12 +655,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 		/* one byte length, no name conversion */
 		len = (unsigned int)pFindData->FileNameLength;
 	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
+		cFYI(1, "Unknown findfirst level %d", level);
 		return -EINVAL;
 	}
 
 	if (len > max_len) {
-		cERROR(1, ("bad search response length %d past smb end", len));
+		cERROR(1, "bad search response length %d past smb end", len);
 		return -EINVAL;
 	}
 
@@ -754,7 +753,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 	 * case already. Why should we be clobbering other errors from it?
 	 */
 	if (rc) {
-		cFYI(1, ("filldir rc = %d", rc));
+		cFYI(1, "filldir rc = %d", rc);
 		rc = -EOVERFLOW;
 	}
 	dput(tmp_dentry);
@@ -786,7 +785,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 0:
 		if (filldir(direntry, ".", 1, file->f_pos,
 		     file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for current dir failed"));
+			cERROR(1, "Filldir for current dir failed");
 			rc = -ENOMEM;
 			break;
 		}
@@ -794,7 +793,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 1:
 		if (filldir(direntry, "..", 2, file->f_pos,
 		     file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for parent dir failed"));
+			cERROR(1, "Filldir for parent dir failed");
 			rc = -ENOMEM;
 			break;
 		}
@@ -807,7 +806,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	if (file->private_data == NULL) {
 		rc = initiate_cifs_search(xid, file);
-		cFYI(1, ("initiate cifs search rc %d", rc));
+		cFYI(1, "initiate cifs search rc %d", rc);
 		if (rc) {
 			FreeXid(xid);
 			return rc;
@@ -821,7 +820,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	cifsFile = file->private_data;
 	if (cifsFile->srch_inf.endOfSearch) {
 		if (cifsFile->srch_inf.emptyDir) {
-			cFYI(1, ("End of search, empty dir"));
+			cFYI(1, "End of search, empty dir");
 			rc = 0;
 			break;
 		}
@@ -833,26 +832,31 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		rc = find_cifs_entry(xid, pTcon, file,
 				&current_entry, &num_to_fill);
 		if (rc) {
-			cFYI(1, ("fce error %d", rc));
+			cFYI(1, "fce error %d", rc);
 			goto rddir2_exit;
 		} else if (current_entry != NULL) {
-			cFYI(1, ("entry %lld found", file->f_pos));
+			cFYI(1, "entry %lld found", file->f_pos);
 		} else {
-			cFYI(1, ("could not find entry"));
+			cFYI(1, "could not find entry");
 			goto rddir2_exit;
 		}
-		cFYI(1, ("loop through %d times filling dir for net buf %p",
-			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start));
+		cFYI(1, "loop through %d times filling dir for net buf %p",
+			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
 		max_len = smbCalcSize((struct smb_hdr *)
 				cifsFile->srch_inf.ntwrk_buf_start);
 		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
 
 		tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+		if (tmp_buf == NULL) {
+			rc = -ENOMEM;
+			break;
+		}
+
 		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
 			if (current_entry == NULL) {
 				/* evaluate whether this case is an error */
-				cERROR(1, ("past SMB end, num to fill %d i %d",
-					num_to_fill, i));
+				cERROR(1, "past SMB end, num to fill %d i %d",
+					num_to_fill, i);
 				break;
 			}
 			/* if buggy server returns . and .. late do
@@ -867,8 +871,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			file->f_pos++;
 			if (file->f_pos ==
 				cifsFile->srch_inf.index_of_last_entry) {
-				cFYI(1, ("last entry in buf at pos %lld %s",
-					file->f_pos, tmp_buf));
+				cFYI(1, "last entry in buf at pos %lld %s",
+					file->f_pos, tmp_buf);
 				cifs_save_resume_key(current_entry, cifsFile);
 				break;
 			} else
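
Note on the readdir.c hunks: they do two things, add a missing NULL check after the kmalloc() of tmp_buf, and strip the extra parentheses from cFYI()/cERROR() calls, a call shape that only works once those macros are variadic. A minimal userspace sketch of that macro conversion (these are simplified stand-ins, not the kernel's actual definitions):

#include <stdio.h>

/* Old style: a single macro parameter forces double parentheses,
 * because the preprocessor cannot split "(fmt, args...)" itself. */
#define cFYI_OLD(set, msg) do { if (set) printf msg; } while (0)

/* New style: a C99 variadic macro takes the format and arguments
 * directly, so callers drop the inner parentheses. ##__VA_ARGS__ is
 * a GNU extension that swallows the comma when no args are given. */
#define cFYI(set, fmt, ...) \
	do { if (set) printf(fmt "\n", ##__VA_ARGS__); } while (0)

int main(void)
{
	int rc = -22;

	cFYI_OLD(1, ("fce error %d\n", rc));	/* old call shape */
	cFYI(1, "fce error %d", rc);		/* new call shape */
	return 0;
}

The variadic form also lets the compiler type-check the format string against its arguments, which the double-parenthesized form hid.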
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7c3fd7463f44..0a57cb7db5dd 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -35,9 +35,11 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
-/* Checks if this is the first smb session to be reconnected after
-   the socket has been reestablished (so we know whether to use vc 0).
-   Called while holding the cifs_tcp_ses_lock, so do not block */
+/*
+ * Checks if this is the first smb session to be reconnected after
+ * the socket has been reestablished (so we know whether to use vc 0).
+ * Called while holding the cifs_tcp_ses_lock, so do not block
+ */
 static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
 {
 	struct list_head *tmp;
@@ -284,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 	int len;
 	char *data = *pbcc_area;
 
-	cFYI(1, ("bleft %d", bleft));
+	cFYI(1, "bleft %d", bleft);
 
 	/*
 	 * Windows servers do not always double null terminate their final
@@ -301,7 +303,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverOS);
 	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverOS=%s", ses->serverOS));
+	cFYI(1, "serverOS=%s", ses->serverOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -310,7 +312,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverNOS);
 	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverNOS=%s", ses->serverNOS));
+	cFYI(1, "serverNOS=%s", ses->serverNOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -319,7 +321,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverDomain);
 	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverDomain=%s", ses->serverDomain));
+	cFYI(1, "serverDomain=%s", ses->serverDomain);
 
 	return;
 }
@@ -332,7 +334,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	int len;
 	char *bcc_ptr = *pbcc_area;
 
-	cFYI(1, ("decode sessetup ascii. bleft %d", bleft));
+	cFYI(1, "decode sessetup ascii. bleft %d", bleft);
 
 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
@@ -344,7 +346,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	if (ses->serverOS)
 		strncpy(ses->serverOS, bcc_ptr, len);
 	if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
-		cFYI(1, ("OS/2 server"));
+		cFYI(1, "OS/2 server");
 		ses->flags |= CIFS_SES_OS2;
 	}
 
@@ -373,7 +375,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	/* BB For newer servers which do not support Unicode,
 	   but thus do return domain here we could add parsing
 	   for it later, but it is not very important */
-	cFYI(1, ("ascii: bytes left %d", bleft));
+	cFYI(1, "ascii: bytes left %d", bleft);
 
 	return rc;
 }
@@ -384,16 +386,16 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
 
 	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
-		cERROR(1, ("challenge blob len %d too small", blob_len));
+		cERROR(1, "challenge blob len %d too small", blob_len);
 		return -EINVAL;
 	}
 
 	if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
-		cERROR(1, ("blob signature incorrect %s", pblob->Signature));
+		cERROR(1, "blob signature incorrect %s", pblob->Signature);
 		return -EINVAL;
 	}
 	if (pblob->MessageType != NtLmChallenge) {
-		cERROR(1, ("Incorrect message type %d", pblob->MessageType));
+		cERROR(1, "Incorrect message type %d", pblob->MessageType);
 		return -EINVAL;
 	}
 
@@ -447,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
    This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 					struct cifsSesInfo *ses,
-					const struct nls_table *nls_cp, int first)
+					const struct nls_table *nls_cp, bool first)
 {
 	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
 	__u32 flags;
@@ -546,7 +548,7 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
 
 static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
 				  struct cifsSesInfo *ses,
-				  const struct nls_table *nls, int first_time)
+				  const struct nls_table *nls, bool first_time)
 {
 	int bloblen;
 
@@ -559,8 +561,8 @@ static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
 #endif
 
 int
-CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
+CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	       const struct nls_table *nls_cp)
 {
 	int rc = 0;
 	int wct;
@@ -577,13 +579,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	int bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
+	bool first_time;
 
 	if (ses == NULL)
 		return -EINVAL;
 
+	read_lock(&cifs_tcp_ses_lock);
+	first_time = is_first_ses_reconnect(ses);
+	read_unlock(&cifs_tcp_ses_lock);
+
 	type = ses->server->secType;
 
-	cFYI(1, ("sess setup type %d", type));
+	cFYI(1, "sess setup type %d", type);
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
 		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -664,7 +671,7 @@ ssetup_ntlmssp_authenticate:
 		   changed to do higher than lanman dialect and
 		   we reconnected would we ever calc signing_key? */
 
-		cFYI(1, ("Negotiating LANMAN setting up strings"));
+		cFYI(1, "Negotiating LANMAN setting up strings");
 		/* Unicode not allowed for LANMAN dialects */
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
@@ -723,15 +730,7 @@ ssetup_ntlmssp_authenticate:
 
 		/* calculate session key */
 		setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
-		if (first_time) /* should this be moved into common code
-				   with similar ntlmv2 path? */
-		/*   cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
-				response BB FIXME, v2_sess_key); */
-
-		/* copy session key */
-
-	/*	memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE);
-		bcc_ptr += LM2_SESS_KEY_SIZE; */
+		/* FIXME: calculate MAC key */
 		memcpy(bcc_ptr, (char *)v2_sess_key,
 		       sizeof(struct ntlmv2_resp));
 		bcc_ptr += sizeof(struct ntlmv2_resp);
@@ -744,7 +743,7 @@ ssetup_ntlmssp_authenticate:
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-	} else if (type == Kerberos || type == MSKerberos) {
+	} else if (type == Kerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
 		spnego_key = cifs_get_spnego_key(ses);
@@ -758,17 +757,17 @@ ssetup_ntlmssp_authenticate:
 		/* check version field to make sure that cifs.upcall is
 		   sending us a response in an expected form */
 		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
-			cERROR(1, ("incorrect version of cifs.upcall (expected"
+			cERROR(1, "incorrect version of cifs.upcall (expected"
 				   " %d but got %d)",
-				      CIFS_SPNEGO_UPCALL_VERSION, msg->version));
+				      CIFS_SPNEGO_UPCALL_VERSION, msg->version);
 			rc = -EKEYREJECTED;
 			goto ssetup_exit;
 		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
 		    sizeof(ses->server->mac_signing_key.data.krb5)) {
-			cERROR(1, ("Kerberos signing key too long (%u bytes)",
-				msg->sesskey_len));
+			cERROR(1, "Kerberos signing key too long (%u bytes)",
+				msg->sesskey_len);
 			rc = -EOVERFLOW;
 			goto ssetup_exit;
 		}
@@ -796,7 +795,7 @@ ssetup_ntlmssp_authenticate:
 		/* BB: is this right? */
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #else /* ! CONFIG_CIFS_UPCALL */
-		cERROR(1, ("Kerberos negotiated but upcall support disabled!"));
+		cERROR(1, "Kerberos negotiated but upcall support disabled!");
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
@@ -804,12 +803,12 @@ ssetup_ntlmssp_authenticate:
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	if (type == RawNTLMSSP) {
 		if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-			cERROR(1, ("NTLMSSP requires Unicode support"));
+			cERROR(1, "NTLMSSP requires Unicode support");
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
 
-		cFYI(1, ("ntlmssp session setup phase %d", phase));
+		cFYI(1, "ntlmssp session setup phase %d", phase);
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -827,7 +826,7 @@ ssetup_ntlmssp_authenticate:
 			   on the response (challenge) */
 			smb_buf->Uid = ses->Suid;
 		} else {
-			cERROR(1, ("invalid phase %d", phase));
+			cERROR(1, "invalid phase %d", phase);
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
@@ -839,12 +838,12 @@ ssetup_ntlmssp_authenticate:
 		}
 		unicode_oslm_strings(&bcc_ptr, nls_cp);
 	} else {
-		cERROR(1, ("secType %d not supported!", type));
+		cERROR(1, "secType %d not supported!", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
 	}
 #else
-	cERROR(1, ("secType %d not supported!", type));
+	cERROR(1, "secType %d not supported!", type);
 	rc = -ENOSYS;
 	goto ssetup_exit;
 #endif
@@ -862,7 +861,7 @@ ssetup_ntlmssp_authenticate:
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
-	cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
+	cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
 
 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -870,7 +869,7 @@ ssetup_ntlmssp_authenticate:
 	if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
 			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
 		if (phase != NtLmNegotiate) {
-			cERROR(1, ("Unexpected more processing error"));
+			cERROR(1, "Unexpected more processing error");
 			goto ssetup_exit;
 		}
 		/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -882,14 +881,14 @@ ssetup_ntlmssp_authenticate:
 
 	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 		rc = -EIO;
-		cERROR(1, ("bad word count %d", smb_buf->WordCount));
+		cERROR(1, "bad word count %d", smb_buf->WordCount);
 		goto ssetup_exit;
 	}
 	action = le16_to_cpu(pSMB->resp.Action);
 	if (action & GUEST_LOGIN)
-		cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
+		cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
 	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
-	cFYI(1, ("UID = %d ", ses->Suid));
+	cFYI(1, "UID = %d ", ses->Suid);
 	/* response can have either 3 or 4 word count - Samba sends 3 */
 	/* and lanman response is 3 */
 	bytes_remaining = BCC(smb_buf);
@@ -899,7 +898,7 @@ ssetup_ntlmssp_authenticate:
 		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
-			cERROR(1, ("bad security blob length %d", blob_len));
+			cERROR(1, "bad security blob length %d", blob_len);
 			rc = -EINVAL;
 			goto ssetup_exit;
 		}
@@ -933,7 +932,7 @@ ssetup_exit:
 	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
-		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
+		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
 	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 		cifs_buf_release(iov[0].iov_base);
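
Note on the sess.c hunks: besides the cFYI/cERROR cleanup, they fold the old first_time parameter into CIFS_SessSetup() itself; the flag is now computed once, under cifs_tcp_ses_lock, via is_first_ses_reconnect(), so callers no longer have to thread it through. A toy illustration of the pattern, using a pthread rwlock as a stand-in for the kernel's read lock (all names here are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the session list and the lock that protects it. */
static pthread_rwlock_t ses_lock = PTHREAD_RWLOCK_INITIALIZER;
static int sessions_established;	/* toy substitute for the list walk */

/* Must be called with ses_lock held for reading; must not block. */
static bool is_first_reconnect(void)
{
	return sessions_established == 0;
}

static int sess_setup(void)
{
	bool first_time;

	/* Take the snapshot once, under the lock, instead of asking
	 * every caller to pass the flag in. */
	pthread_rwlock_rdlock(&ses_lock);
	first_time = is_first_reconnect();
	pthread_rwlock_unlock(&ses_lock);

	printf("first_time=%d\n", first_time);
	return 0;
}

int main(void)
{
	return sess_setup();
}

Pushing the computation into the callee removes one parameter from a hot public entry point and guarantees the lock is actually held when the list is inspected.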
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index c5084d27db7c..7f16cb825fe5 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -76,6 +76,7 @@
 #define ERRnofiles		18	/* A File Search command can find no
 					   more files matching the specified
 					   criteria. */
+#define ERRwriteprot		19	/* media is write protected */
 #define ERRgeneral		31
 #define ERRbadshare		32	/* The sharing mode specified for an
 					   Open conflicts with existing FIDs on
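
Note on the smberr.h hunk: ERRwriteprot (19) fills a gap in the DOS-class error space between ERRnofiles and ERRgeneral. Codes like these ultimately get mapped to errno values; a hypothetical sketch of such a table (the mapping below is illustrative, not lifted from the driver):

#include <errno.h>
#include <stdio.h>

#define ERRnofiles	18	/* no more files match the search */
#define ERRwriteprot	19	/* media is write protected */
#define ERRgeneral	31

/* Toy mapping from SMB DOS-class error codes to errno values. */
static int smb_to_errno(int smb_err)
{
	switch (smb_err) {
	case ERRnofiles:	return ENOENT;
	case ERRwriteprot:	return EROFS;
	default:		return EIO;
	}
}

int main(void)
{
	printf("ERRwriteprot -> errno %d\n", smb_to_errno(ERRwriteprot));
	return 0;
}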
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ad081fe7eb18..82f78c4d6978 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -35,7 +35,6 @@
35#include "cifs_debug.h" 35#include "cifs_debug.h"
36 36
37extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
38extern struct kmem_cache *cifs_oplock_cachep;
39 38
40static struct mid_q_entry * 39static struct mid_q_entry *
41AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
@@ -43,7 +42,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
43 struct mid_q_entry *temp; 42 struct mid_q_entry *temp;
44 43
45 if (server == NULL) { 44 if (server == NULL) {
46 cERROR(1, ("Null TCP session in AllocMidQEntry")); 45 cERROR(1, "Null TCP session in AllocMidQEntry");
47 return NULL; 46 return NULL;
48 } 47 }
49 48
@@ -55,7 +54,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
55 temp->mid = smb_buffer->Mid; /* always LE */ 54 temp->mid = smb_buffer->Mid; /* always LE */
56 temp->pid = current->pid; 55 temp->pid = current->pid;
57 temp->command = smb_buffer->Command; 56 temp->command = smb_buffer->Command;
58 cFYI(1, ("For smb_command %d", temp->command)); 57 cFYI(1, "For smb_command %d", temp->command);
59 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
60 /* when mid allocated can be before when sent */ 59 /* when mid allocated can be before when sent */
61 temp->when_alloc = jiffies; 60 temp->when_alloc = jiffies;
@@ -140,7 +139,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
140 total_len += iov[i].iov_len; 139 total_len += iov[i].iov_len;
141 140
142 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length); 141 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
143 cFYI(1, ("Sending smb: total_len %d", total_len)); 142 cFYI(1, "Sending smb: total_len %d", total_len);
144 dump_smb(smb_buffer, len); 143 dump_smb(smb_buffer, len);
145 144
146 i = 0; 145 i = 0;
@@ -168,9 +167,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
168 reconnect which may clear the network problem. 167 reconnect which may clear the network problem.
169 */ 168 */
170 if ((i >= 14) || (!server->noblocksnd && (i > 2))) { 169 if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
171 cERROR(1, 170 cERROR(1, "sends on sock %p stuck for 15 seconds",
172 ("sends on sock %p stuck for 15 seconds", 171 ssocket);
173 ssocket));
174 rc = -EAGAIN; 172 rc = -EAGAIN;
175 break; 173 break;
176 } 174 }
@@ -184,13 +182,13 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
184 total_len = 0; 182 total_len = 0;
185 break; 183 break;
186 } else if (rc > total_len) { 184 } else if (rc > total_len) {
187 cERROR(1, ("sent %d requested %d", rc, total_len)); 185 cERROR(1, "sent %d requested %d", rc, total_len);
188 break; 186 break;
189 } 187 }
190 if (rc == 0) { 188 if (rc == 0) {
191 /* should never happen, letting socket clear before 189 /* should never happen, letting socket clear before
192 retrying is our only obvious option here */ 190 retrying is our only obvious option here */
193 cERROR(1, ("tcp sent no data")); 191 cERROR(1, "tcp sent no data");
194 msleep(500); 192 msleep(500);
195 continue; 193 continue;
196 } 194 }
@@ -213,8 +211,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
213 } 211 }
214 212
215 if ((total_len > 0) && (total_len != smb_buf_length + 4)) { 213 if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
216 cFYI(1, ("partial send (%d remaining), terminating session", 214 cFYI(1, "partial send (%d remaining), terminating session",
217 total_len)); 215 total_len);
218 /* If we have only sent part of an SMB then the next SMB 216 /* If we have only sent part of an SMB then the next SMB
219 could be taken as the remainder of this one. We need 217 could be taken as the remainder of this one. We need
220 to kill the socket so the server throws away the partial 218 to kill the socket so the server throws away the partial
@@ -223,7 +221,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
223 } 221 }
224 222
225 if (rc < 0) { 223 if (rc < 0) {
226 cERROR(1, ("Error %d sending data on socket to server", rc)); 224 cERROR(1, "Error %d sending data on socket to server", rc);
227 } else 225 } else
228 rc = 0; 226 rc = 0;
229 227
@@ -296,7 +294,7 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
296 } 294 }
297 295
298 if (ses->server->tcpStatus == CifsNeedReconnect) { 296 if (ses->server->tcpStatus == CifsNeedReconnect) {
299 cFYI(1, ("tcp session dead - return to caller to retry")); 297 cFYI(1, "tcp session dead - return to caller to retry");
300 return -EAGAIN; 298 return -EAGAIN;
301 } 299 }
302 300
@@ -348,7 +346,7 @@ static int wait_for_response(struct cifsSesInfo *ses,
348 lrt += time_to_wait; 346 lrt += time_to_wait;
349 if (time_after(jiffies, lrt)) { 347 if (time_after(jiffies, lrt)) {
350 /* No replies for time_to_wait. */ 348 /* No replies for time_to_wait. */
351 cERROR(1, ("server not responding")); 349 cERROR(1, "server not responding");
352 return -1; 350 return -1;
353 } 351 }
354 } else { 352 } else {
@@ -379,7 +377,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
379 iov[0].iov_len = in_buf->smb_buf_length + 4; 377 iov[0].iov_len = in_buf->smb_buf_length + 4;
380 flags |= CIFS_NO_RESP; 378 flags |= CIFS_NO_RESP;
381 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); 379 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
382 cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc)); 380 cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
383 381
384 return rc; 382 return rc;
385} 383}
@@ -402,7 +400,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
402 400
403 if ((ses == NULL) || (ses->server == NULL)) { 401 if ((ses == NULL) || (ses->server == NULL)) {
404 cifs_small_buf_release(in_buf); 402 cifs_small_buf_release(in_buf);
405 cERROR(1, ("Null session")); 403 cERROR(1, "Null session");
406 return -EIO; 404 return -EIO;
407 } 405 }
408 406
@@ -471,7 +469,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
471 else if (long_op == CIFS_BLOCKING_OP) 469 else if (long_op == CIFS_BLOCKING_OP)
472 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */ 470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
473 else { 471 else {
474 cERROR(1, ("unknown timeout flag %d", long_op)); 472 cERROR(1, "unknown timeout flag %d", long_op);
475 rc = -EIO; 473 rc = -EIO;
476 goto out; 474 goto out;
477 } 475 }
@@ -490,8 +488,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
490 spin_lock(&GlobalMid_Lock); 488 spin_lock(&GlobalMid_Lock);
491 489
492 if (midQ->resp_buf == NULL) { 490 if (midQ->resp_buf == NULL) {
493 cERROR(1, ("No response to cmd %d mid %d", 491 cERROR(1, "No response to cmd %d mid %d",
494 midQ->command, midQ->mid)); 492 midQ->command, midQ->mid);
495 if (midQ->midState == MID_REQUEST_SUBMITTED) { 493 if (midQ->midState == MID_REQUEST_SUBMITTED) {
496 if (ses->server->tcpStatus == CifsExiting) 494 if (ses->server->tcpStatus == CifsExiting)
497 rc = -EHOSTDOWN; 495 rc = -EHOSTDOWN;
@@ -504,7 +502,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
504 if (rc != -EHOSTDOWN) { 502 if (rc != -EHOSTDOWN) {
505 if (midQ->midState == MID_RETRY_NEEDED) { 503 if (midQ->midState == MID_RETRY_NEEDED) {
506 rc = -EAGAIN; 504 rc = -EAGAIN;
507 cFYI(1, ("marking request for retry")); 505 cFYI(1, "marking request for retry");
508 } else { 506 } else {
509 rc = -EIO; 507 rc = -EIO;
510 } 508 }
@@ -521,8 +519,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
521 receive_len = midQ->resp_buf->smb_buf_length; 519 receive_len = midQ->resp_buf->smb_buf_length;
522 520
523 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
524 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 522 cERROR(1, "Frame too large received. Length: %d Xid: %d",
525 receive_len, xid)); 523 receive_len, xid);
526 rc = -EIO; 524 rc = -EIO;
527 goto out; 525 goto out;
528 } 526 }
@@ -548,7 +546,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
548 &ses->server->mac_signing_key, 546 &ses->server->mac_signing_key,
549 midQ->sequence_number+1); 547 midQ->sequence_number+1);
550 if (rc) { 548 if (rc) {
551 cERROR(1, ("Unexpected SMB signature")); 549 cERROR(1, "Unexpected SMB signature");
552 /* BB FIXME add code to kill session */ 550 /* BB FIXME add code to kill session */
553 } 551 }
554 } 552 }
@@ -569,7 +567,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
569 DeleteMidQEntry */ 567 DeleteMidQEntry */
570 } else { 568 } else {
571 rc = -EIO; 569 rc = -EIO;
572 cFYI(1, ("Bad MID state?")); 570 cFYI(1, "Bad MID state?");
573 } 571 }
574 572
575out: 573out:
@@ -591,11 +589,11 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
591 struct mid_q_entry *midQ; 589 struct mid_q_entry *midQ;
592 590
593 if (ses == NULL) { 591 if (ses == NULL) {
594 cERROR(1, ("Null smb session")); 592 cERROR(1, "Null smb session");
595 return -EIO; 593 return -EIO;
596 } 594 }
597 if (ses->server == NULL) { 595 if (ses->server == NULL) {
598 cERROR(1, ("Null tcp session")); 596 cERROR(1, "Null tcp session");
599 return -EIO; 597 return -EIO;
600 } 598 }
601 599
@@ -607,8 +605,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
607 use ses->maxReq */ 605 use ses->maxReq */
608 606
609 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 607 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
610 cERROR(1, ("Illegal length, greater than maximum frame, %d", 608 cERROR(1, "Illegal length, greater than maximum frame, %d",
611 in_buf->smb_buf_length)); 609 in_buf->smb_buf_length);
612 return -EIO; 610 return -EIO;
613 } 611 }
614 612
@@ -665,7 +663,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
665 else if (long_op == CIFS_BLOCKING_OP) 663 else if (long_op == CIFS_BLOCKING_OP)
666 timeout = 0x7FFFFFFF; /* large but no so large as to wrap */ 664 timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
667 else { 665 else {
668 cERROR(1, ("unknown timeout flag %d", long_op)); 666 cERROR(1, "unknown timeout flag %d", long_op);
669 rc = -EIO; 667 rc = -EIO;
670 goto out; 668 goto out;
671 } 669 }
@@ -681,8 +679,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
681 679
682 spin_lock(&GlobalMid_Lock); 680 spin_lock(&GlobalMid_Lock);
683 if (midQ->resp_buf == NULL) { 681 if (midQ->resp_buf == NULL) {
684 cERROR(1, ("No response for cmd %d mid %d", 682 cERROR(1, "No response for cmd %d mid %d",
685 midQ->command, midQ->mid)); 683 midQ->command, midQ->mid);
686 if (midQ->midState == MID_REQUEST_SUBMITTED) { 684 if (midQ->midState == MID_REQUEST_SUBMITTED) {
687 if (ses->server->tcpStatus == CifsExiting) 685 if (ses->server->tcpStatus == CifsExiting)
688 rc = -EHOSTDOWN; 686 rc = -EHOSTDOWN;
@@ -695,7 +693,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
695 if (rc != -EHOSTDOWN) { 693 if (rc != -EHOSTDOWN) {
696 if (midQ->midState == MID_RETRY_NEEDED) { 694 if (midQ->midState == MID_RETRY_NEEDED) {
697 rc = -EAGAIN; 695 rc = -EAGAIN;
698 cFYI(1, ("marking request for retry")); 696 cFYI(1, "marking request for retry");
699 } else { 697 } else {
700 rc = -EIO; 698 rc = -EIO;
701 } 699 }
@@ -712,8 +710,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
712 receive_len = midQ->resp_buf->smb_buf_length; 710 receive_len = midQ->resp_buf->smb_buf_length;
713 711
714 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
715 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 713 cERROR(1, "Frame too large received. Length: %d Xid: %d",
716 receive_len, xid)); 714 receive_len, xid);
717 rc = -EIO; 715 rc = -EIO;
718 goto out; 716 goto out;
719 } 717 }
@@ -736,7 +734,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
736 &ses->server->mac_signing_key, 734 &ses->server->mac_signing_key,
737 midQ->sequence_number+1); 735 midQ->sequence_number+1);
738 if (rc) { 736 if (rc) {
739 cERROR(1, ("Unexpected SMB signature")); 737 cERROR(1, "Unexpected SMB signature");
740 /* BB FIXME add code to kill session */ 738 /* BB FIXME add code to kill session */
741 } 739 }
742 } 740 }
@@ -753,7 +751,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
753 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
754 } else { 752 } else {
755 rc = -EIO; 753 rc = -EIO;
756 cERROR(1, ("Bad MID state?")); 754 cERROR(1, "Bad MID state?");
757 } 755 }
758 756
759out: 757out:
@@ -824,13 +822,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
824 struct cifsSesInfo *ses; 822 struct cifsSesInfo *ses;
825 823
826 if (tcon == NULL || tcon->ses == NULL) { 824 if (tcon == NULL || tcon->ses == NULL) {
827 cERROR(1, ("Null smb session")); 825 cERROR(1, "Null smb session");
828 return -EIO; 826 return -EIO;
829 } 827 }
830 ses = tcon->ses; 828 ses = tcon->ses;
831 829
832 if (ses->server == NULL) { 830 if (ses->server == NULL) {
833 cERROR(1, ("Null tcp session")); 831 cERROR(1, "Null tcp session");
834 return -EIO; 832 return -EIO;
835 } 833 }
836 834
@@ -842,8 +840,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
842 use ses->maxReq */ 840 use ses->maxReq */
843 841
844 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 842 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
845 cERROR(1, ("Illegal length, greater than maximum frame, %d", 843 cERROR(1, "Illegal length, greater than maximum frame, %d",
846 in_buf->smb_buf_length)); 844 in_buf->smb_buf_length);
847 return -EIO; 845 return -EIO;
848 } 846 }
849 847
@@ -933,8 +931,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
933 spin_unlock(&GlobalMid_Lock); 931 spin_unlock(&GlobalMid_Lock);
934 receive_len = midQ->resp_buf->smb_buf_length; 932 receive_len = midQ->resp_buf->smb_buf_length;
935 } else { 933 } else {
936 cERROR(1, ("No response for cmd %d mid %d", 934 cERROR(1, "No response for cmd %d mid %d",
937 midQ->command, midQ->mid)); 935 midQ->command, midQ->mid);
938 if (midQ->midState == MID_REQUEST_SUBMITTED) { 936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
939 if (ses->server->tcpStatus == CifsExiting) 937 if (ses->server->tcpStatus == CifsExiting)
940 rc = -EHOSTDOWN; 938 rc = -EHOSTDOWN;
@@ -947,7 +945,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
947 if (rc != -EHOSTDOWN) { 945 if (rc != -EHOSTDOWN) {
948 if (midQ->midState == MID_RETRY_NEEDED) { 946 if (midQ->midState == MID_RETRY_NEEDED) {
949 rc = -EAGAIN; 947 rc = -EAGAIN;
950 cFYI(1, ("marking request for retry")); 948 cFYI(1, "marking request for retry");
951 } else { 949 } else {
952 rc = -EIO; 950 rc = -EIO;
953 } 951 }
@@ -958,8 +956,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
958 } 956 }
959 957
960 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
961 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 959 cERROR(1, "Frame too large received. Length: %d Xid: %d",
962 receive_len, xid)); 960 receive_len, xid);
963 rc = -EIO; 961 rc = -EIO;
964 goto out; 962 goto out;
965 } 963 }
@@ -968,7 +966,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
968 966
969 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) { 967 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
970 rc = -EIO; 968 rc = -EIO;
971 cERROR(1, ("Bad MID state?")); 969 cERROR(1, "Bad MID state?");
972 goto out; 970 goto out;
973 } 971 }
974 972
@@ -986,7 +984,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
986 &ses->server->mac_signing_key, 984 &ses->server->mac_signing_key,
987 midQ->sequence_number+1); 985 midQ->sequence_number+1);
988 if (rc) { 986 if (rc) {
989 cERROR(1, ("Unexpected SMB signature")); 987 cERROR(1, "Unexpected SMB signature");
990 /* BB FIXME add code to kill session */ 988 /* BB FIXME add code to kill session */
991 } 989 }
992 } 990 }
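
Note on the transport.c hunks: most of the churn is the same cFYI/cERROR conversion (plus dropping the unused cifs_oplock_cachep extern), but the smb_sendv() hunks also show its bounded-stall policy: a blocked socket is retried only a limited number of times before the send is declared stuck and the caller is told to retry at a higher level. A standalone sketch of that policy under stated assumptions (plain POSIX write(), illustrative retry bound):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Push a whole buffer out, tolerating short writes and transient
 * EAGAIN/EINTR, but give up after a bounded number of stalls, much
 * as the SMB send path above gives up after ~15 seconds. */
static int send_all(int fd, const char *buf, size_t len)
{
	int stalls = 0;

	while (len > 0) {
		ssize_t n = write(fd, buf, len);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			if (errno == EAGAIN && ++stalls < 15) {
				usleep(1000);	/* let the socket drain */
				continue;
			}
			return -1;	/* stuck: report the partial send */
		}
		buf += n;
		len -= (size_t)n;
	}
	return 0;
}

int main(void)
{
	const char msg[] = "hello\n";

	return send_all(STDOUT_FILENO, msg, strlen(msg)) ? 1 : 0;
}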
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index f555ce077d4f..a1509207bfa6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -70,12 +70,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		return rc;
 	}
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
 		&& (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
 		cFYI(1,
-		     ("illegal xattr request %s (only user namespace supported)",
-			ea_name));
+		     "illegal xattr request %s (only user namespace supported)",
+			ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		   system and perhaps security prefixes? */
@@ -131,19 +131,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 	   search server for EAs or streams to
 	   returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
-		cFYI(1, ("size of EA value too large"));
+		cFYI(1, "size of EA value too large");
 		kfree(full_path);
 		FreeXid(xid);
 		return -EOPNOTSUPP;
 	}
 
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
-			cFYI(1, ("attempt to set cifs inode metadata"));
+			cFYI(1, "attempt to set cifs inode metadata");
 
 		ea_name += 5; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -169,9 +169,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 				ACL_TYPE_ACCESS, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("set POSIX ACL rc %d", rc));
+		cFYI(1, "set POSIX ACL rc %d", rc);
 #else
-		cFYI(1, ("set POSIX ACL not supported"));
+		cFYI(1, "set POSIX ACL not supported");
 #endif
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 		   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -182,13 +182,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 				ACL_TYPE_DEFAULT, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("set POSIX default ACL rc %d", rc));
+		cFYI(1, "set POSIX default ACL rc %d", rc);
 #else
-		cFYI(1, ("set default POSIX ACL not supported"));
+		cFYI(1, "set default POSIX ACL not supported");
 #endif
 	} else {
-		cFYI(1, ("illegal xattr request %s (only user namespace"
-			" supported)", ea_name));
+		cFYI(1, "illegal xattr request %s (only user namespace"
+			" supported)", ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		   system and perhaps security prefixes? */
@@ -235,13 +235,13 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
-			cFYI(1, ("attempt to query cifs inode metadata"));
+			cFYI(1, "attempt to query cifs inode metadata");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += 5; /* skip past user. prefix */
@@ -287,7 +287,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		}
 #endif /* EXPERIMENTAL */
 #else
-		cFYI(1, ("query POSIX ACL not supported yet"));
+		cFYI(1, "query POSIX ACL not supported yet");
 #endif /* CONFIG_CIFS_POSIX */
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -299,18 +299,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-		cFYI(1, ("query POSIX default ACL not supported yet"));
+		cFYI(1, "query POSIX default ACL not supported yet");
 #endif
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
-		cFYI(1, ("Trusted xattr namespace not supported yet"));
+		cFYI(1, "Trusted xattr namespace not supported yet");
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
-		cFYI(1, ("Security xattr namespace not supported yet"));
+		cFYI(1, "Security xattr namespace not supported yet");
 	} else
 		cFYI(1,
-		  ("illegal xattr request %s (only user namespace supported)",
-			ea_name));
+		  "illegal xattr request %s (only user namespace supported)",
+			ea_name);
 
 	/* We could add an additional check for streams ie
 	   if proc/fs/cifs/streamstoxattr is set then
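
Note on the xattr.c hunks: the policy they preserve is that only the user. (and os2.) namespaces are passed through to the server; trusted., security., and unprefixed names are logged and refused. A minimal userspace version of that prefix check (macro names are illustrative stand-ins for the driver's):

#include <stdio.h>
#include <string.h>

#define XATTR_USER_PREFIX	"user."
#define XATTR_OS2_PREFIX	"os2."

/* Accept only the namespaces the code above handles; anything else
 * is rejected the way cifs_setxattr logs an illegal request. */
static int xattr_name_ok(const char *name)
{
	if (name == NULL)
		return 0;
	return strncmp(name, XATTR_USER_PREFIX, strlen(XATTR_USER_PREFIX)) == 0 ||
	       strncmp(name, XATTR_OS2_PREFIX, strlen(XATTR_OS2_PREFIX)) == 0;
}

int main(void)
{
	printf("user.comment: %d\n", xattr_name_ok("user.comment"));
	printf("trusted.foo:  %d\n", xattr_name_ok("trusted.foo"));
	return 0;
}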
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
 
 void coda_destroy_inodecache(void);
 int coda_init_inodecache(void);
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry,
-	       int datasync);
+int coda_fsync(struct file *coda_file, int datasync);
 void coda_sysctl_init(void);
 void coda_sysctl_clean(void);
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 4c813f2cdc52..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	return 0;
 }
 
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
+int coda_fsync(struct file *coda_file, int datasync)
 {
 	struct file *host_file;
-	struct inode *coda_inode = coda_dentry->d_inode;
+	struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
 	struct coda_file_info *cfi;
 	int err = 0;
 
@@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
+	err = vfs_fsync(host_file, datasync);
 	if ( !err && !datasync ) {
 		lock_kernel();
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
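
Note on the coda fsync hunks: with the dentry argument dropped from the fsync path, coda_fsync() recovers the inode from the file itself via file->f_path.dentry->d_inode. A toy model of that derivation using stand-in structs (not the real VFS types):

#include <stdio.h>

/* Minimal stand-ins for the VFS objects involved. */
struct inode { long i_ino; };
struct dentry { struct inode *d_inode; };
struct path { struct dentry *dentry; };
struct file { struct path f_path; };

/* After the change, fsync no longer receives a dentry: the one
 * interesting object, the inode, is reachable from the file. */
static int toy_fsync(struct file *file, int datasync)
{
	struct inode *inode = file->f_path.dentry->d_inode;

	printf("fsync inode %ld datasync=%d\n", inode->i_ino, datasync);
	return 0;
}

int main(void)
{
	struct inode i = { .i_ino = 42 };
	struct dentry d = { .d_inode = &i };
	struct file f = { .f_path = { .dentry = &d } };

	return toy_fsync(&f, 0);
}

Narrowing the signature removes a redundant parameter that every caller had to derive from the file anyway.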
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d97f9935a028..6526e6f21ecf 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -35,7 +35,7 @@
35#include "coda_int.h" 35#include "coda_int.h"
36 36
37/* VFS super_block ops */ 37/* VFS super_block ops */
38static void coda_clear_inode(struct inode *); 38static void coda_evict_inode(struct inode *);
39static void coda_put_super(struct super_block *); 39static void coda_put_super(struct super_block *);
40static int coda_statfs(struct dentry *dentry, struct kstatfs *buf); 40static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
41 41
@@ -93,7 +93,7 @@ static const struct super_operations coda_super_operations =
93{ 93{
94 .alloc_inode = coda_alloc_inode, 94 .alloc_inode = coda_alloc_inode,
95 .destroy_inode = coda_destroy_inode, 95 .destroy_inode = coda_destroy_inode,
96 .clear_inode = coda_clear_inode, 96 .evict_inode = coda_evict_inode,
97 .put_super = coda_put_super, 97 .put_super = coda_put_super,
98 .statfs = coda_statfs, 98 .statfs = coda_statfs,
99 .remount_fs = coda_remount, 99 .remount_fs = coda_remount,
@@ -224,8 +224,10 @@ static void coda_put_super(struct super_block *sb)
224 printk("Coda: Bye bye.\n"); 224 printk("Coda: Bye bye.\n");
225} 225}
226 226
227static void coda_clear_inode(struct inode *inode) 227static void coda_evict_inode(struct inode *inode)
228{ 228{
229 truncate_inode_pages(&inode->i_data, 0);
230 end_writeback(inode);
229 coda_cache_clear_inode(inode); 231 coda_cache_clear_inode(inode);
230} 232}
231 233
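
Note on the coda inode.c hunks: renaming .clear_inode to .evict_inode also changes the contract, since the filesystem now performs the page-cache truncation and writeback termination itself, in that order, before dropping its own state. A userspace sketch of an ops table with that ordered teardown (stub functions and struct names are illustrative, not the VFS API):

#include <stdio.h>

struct inode { int dirty_pages; int private_state; };

/* Teardown must happen in order: drop cached pages first, then mark
 * writeback finished, and only then free fs-private state. */
static void truncate_pages(struct inode *inode) { inode->dirty_pages = 0; }
static void end_writeback_stub(struct inode *inode) { (void)inode; }
static void clear_private(struct inode *inode) { inode->private_state = 0; }

struct super_ops {
	void (*evict_inode)(struct inode *);
};

static void toy_evict_inode(struct inode *inode)
{
	truncate_pages(inode);
	end_writeback_stub(inode);
	clear_private(inode);
}

/* Designated initializers keep the table readable when a callback is
 * renamed, exactly like .clear_inode becoming .evict_inode above. */
static const struct super_ops ops = {
	.evict_inode = toy_evict_inode,
};

int main(void)
{
	struct inode i = { .dirty_pages = 3, .private_state = 1 };

	ops.evict_inode(&i);
	printf("pages=%d private=%d\n", i.dirty_pages, i.private_state);
	return 0;
}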
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
 /*
  * Pioctl operations for Coda.
  * Original version: (C) 1996 Peter Braam
  * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
  *
  * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
 
+#include <linux/smp_lock.h>
+
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
-static int coda_pioctl(struct inode * inode, struct file * filp,
-		       unsigned int cmd, unsigned long user_data);
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data);
 
 /* exported from this file */
-const struct inode_operations coda_ioctl_inode_operations =
-{
+const struct inode_operations coda_ioctl_inode_operations = {
 	.permission	= coda_ioctl_permission,
 	.setattr	= coda_setattr,
 };
 
 const struct file_operations coda_ioctl_operations = {
 	.owner		= THIS_MODULE,
-	.ioctl		= coda_pioctl,
+	.unlocked_ioctl	= coda_pioctl,
 };
 
 /* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
 	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
 
-static int coda_pioctl(struct inode * inode, struct file * filp,
-		       unsigned int cmd, unsigned long user_data)
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data)
 {
 	struct path path;
 	int error;
 	struct PioctlData data;
-	struct inode *target_inode = NULL;
-	struct coda_inode_info *cnp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *target_inode = NULL;
+	struct coda_inode_info *cnp;
 
-	/* get the Pioctl data arguments from user space */
-	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-		return -EINVAL;
-	}
-
-	/*
-	 * Look up the pathname. Note that the pathname is in
-	 * user memory, and namei takes care of this
-	 */
-	if (data.follow) {
-		error = user_path(data.path, &path);
-	} else {
-		error = user_lpath(data.path, &path);
+	lock_kernel();
+
+	/* get the Pioctl data arguments from user space */
+	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
+		error = -EINVAL;
+		goto out;
 	}
 
-	if ( error ) {
-		return error;
-	} else {
+	/*
+	 * Look up the pathname. Note that the pathname is in
+	 * user memory, and namei takes care of this
	 */
+	if (data.follow)
+		error = user_path(data.path, &path);
+	else
+		error = user_lpath(data.path, &path);
+
+	if (error)
+		goto out;
+	else
 		target_inode = path.dentry->d_inode;
-	}
-
+
 	/* return if it is not a Coda inode */
-	if ( target_inode->i_sb != inode->i_sb ) {
+	if (target_inode->i_sb != inode->i_sb) {
 		path_put(&path);
-		return -EINVAL;
+		error = -EINVAL;
+		goto out;
 	}
 
 	/* now proceed to make the upcall */
 	cnp = ITOC(target_inode);
 
 	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
 
 	path_put(&path);
-	return error;
-}
 
+out:
+	unlock_kernel();
+	return error;
+}
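
Note on the pioctl.c hunks: converting coda_pioctl() to .unlocked_ioctl means the handler takes the big kernel lock itself, so every error path has to funnel through a single unlock site; hence the goto out rewrite above. A small sketch of that single-exit shape, with a pthread mutex standing in for lock_kernel() (error values and names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

/* Single-exit error handling: every failure path jumps to "out" so
 * the lock taken at the top is always released, mirroring the
 * converted coda_pioctl above. */
static long toy_ioctl(unsigned int cmd, unsigned long arg)
{
	long error = 0;

	pthread_mutex_lock(&big_lock);

	if (cmd == 0) {
		error = -22;		/* stands in for -EINVAL */
		goto out;
	}
	if (arg == 0) {
		error = -14;		/* stands in for -EFAULT */
		goto out;
	}
	/* ... the actual work would go here ... */
out:
	pthread_mutex_unlock(&big_lock);
	return error;
}

int main(void)
{
	printf("bad cmd -> %ld\n", toy_ioctl(0, 1));
	printf("ok      -> %ld\n", toy_ioctl(1, 1));
	return 0;
}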
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..116af7546cf0 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
-static int coda_psdev_ioctl(struct inode * inode, struct file * filp,
-			    unsigned int cmd, unsigned long arg)
+static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
 {
 	unsigned int data;
 
@@ -178,15 +177,15 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		nbytes = req->uc_outSize; /* don't have more space! */
 	}
 	if (copy_from_user(req->uc_data, buf, nbytes)) {
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 		retval = -EFAULT;
 		goto out;
 	}
 
 	/* adjust outsize. is this useful ?? */
 	req->uc_outSize = nbytes;
-	req->uc_flags |= REQ_WRITE;
+	req->uc_flags |= CODA_REQ_WRITE;
 	count = nbytes;
 
 	/* Convert filedescriptor into a file handle */
@@ -255,8 +254,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 		retval = -EFAULT;
 
 	/* If request was not a signal, enqueue and don't free */
-	if (!(req->uc_flags & REQ_ASYNC)) {
-		req->uc_flags |= REQ_READ;
+	if (!(req->uc_flags & CODA_REQ_ASYNC)) {
+		req->uc_flags |= CODA_REQ_READ;
 		list_add_tail(&(req->uc_chain), &vcp->vc_processing);
 		goto out;
 	}
@@ -316,19 +315,19 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
 		list_del(&req->uc_chain);
 
 		/* Async requests need to be freed here */
-		if (req->uc_flags & REQ_ASYNC) {
+		if (req->uc_flags & CODA_REQ_ASYNC) {
 			CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
 			kfree(req);
 			continue;
 		}
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
 	list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) {
 		list_del(&req->uc_chain);
 
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
 	.read		= coda_psdev_read,
 	.write		= coda_psdev_write,
 	.poll		= coda_psdev_poll,
-	.ioctl		= coda_psdev_ioctl,
+	.unlocked_ioctl	= coda_psdev_ioctl,
 	.open		= coda_psdev_open,
 	.release	= coda_psdev_release,
 };
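
Note on the psdev.c hunks: apart from the unlocked_ioctl conversion, this is a mechanical rename of the upcall flags into a CODA_REQ_* namespace (the bare REQ_* names clash with the block layer's request flags). A self-contained illustration of the flag handling, with illustrative values:

#include <stdio.h>

/* Namespaced request flags; plain REQ_* names collided with other
 * kernel headers, hence the CODA_REQ_* prefix in the hunks above.
 * The values here are illustrative. */
#define CODA_REQ_ASYNC	0x1
#define CODA_REQ_READ	0x2
#define CODA_REQ_WRITE	0x4
#define CODA_REQ_ABORT	0x8

struct upc_req { unsigned int uc_flags; };

int main(void)
{
	struct upc_req req = { 0 };

	req.uc_flags |= CODA_REQ_READ;	/* daemon read the request */
	req.uc_flags |= CODA_REQ_WRITE;	/* daemon wrote the reply */

	if (req.uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT))
		printf("reply or abort seen (flags=0x%x)\n", req.uc_flags);
	return 0;
}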
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index f09c5ed76f6c..b8893ab6f9e6 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -604,7 +604,7 @@ static void coda_unblock_signals(sigset_t *old)
604 (((r)->uc_opcode != CODA_CLOSE && \ 604 (((r)->uc_opcode != CODA_CLOSE && \
605 (r)->uc_opcode != CODA_STORE && \ 605 (r)->uc_opcode != CODA_STORE && \
606 (r)->uc_opcode != CODA_RELEASE) || \ 606 (r)->uc_opcode != CODA_RELEASE) || \
607 (r)->uc_flags & REQ_READ)) 607 (r)->uc_flags & CODA_REQ_READ))
608 608
609static inline void coda_waitfor_upcall(struct upc_req *req) 609static inline void coda_waitfor_upcall(struct upc_req *req)
610{ 610{
@@ -624,7 +624,7 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
624 set_current_state(TASK_UNINTERRUPTIBLE); 624 set_current_state(TASK_UNINTERRUPTIBLE);
625 625
626 /* got a reply */ 626 /* got a reply */
627 if (req->uc_flags & (REQ_WRITE | REQ_ABORT)) 627 if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT))
628 break; 628 break;
629 629
630 if (blocked && time_after(jiffies, timeout) && 630 if (blocked && time_after(jiffies, timeout) &&
@@ -708,7 +708,7 @@ static int coda_upcall(struct venus_comm *vcp,
708 coda_waitfor_upcall(req); 708 coda_waitfor_upcall(req);
709 709
710 /* Op went through, interrupt or not... */ 710 /* Op went through, interrupt or not... */
711 if (req->uc_flags & REQ_WRITE) { 711 if (req->uc_flags & CODA_REQ_WRITE) {
712 out = (union outputArgs *)req->uc_data; 712 out = (union outputArgs *)req->uc_data;
713 /* here we map positive Venus errors to kernel errors */ 713 /* here we map positive Venus errors to kernel errors */
714 error = -out->oh.result; 714 error = -out->oh.result;
@@ -717,13 +717,13 @@ static int coda_upcall(struct venus_comm *vcp,
717 } 717 }
718 718
719 error = -EINTR; 719 error = -EINTR;
720 if ((req->uc_flags & REQ_ABORT) || !signal_pending(current)) { 720 if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) {
721 printk(KERN_WARNING "coda: Unexpected interruption.\n"); 721 printk(KERN_WARNING "coda: Unexpected interruption.\n");
722 goto exit; 722 goto exit;
723 } 723 }
724 724
725 /* Interrupted before venus read it. */ 725 /* Interrupted before venus read it. */
726 if (!(req->uc_flags & REQ_READ)) 726 if (!(req->uc_flags & CODA_REQ_READ))
727 goto exit; 727 goto exit;
728 728
729 /* Venus saw the upcall, make sure we can send interrupt signal */ 729 /* Venus saw the upcall, make sure we can send interrupt signal */
@@ -747,7 +747,7 @@ static int coda_upcall(struct venus_comm *vcp,
747 sig_inputArgs->ih.opcode = CODA_SIGNAL; 747 sig_inputArgs->ih.opcode = CODA_SIGNAL;
748 sig_inputArgs->ih.unique = req->uc_unique; 748 sig_inputArgs->ih.unique = req->uc_unique;
749 749
750 sig_req->uc_flags = REQ_ASYNC; 750 sig_req->uc_flags = CODA_REQ_ASYNC;
751 sig_req->uc_opcode = sig_inputArgs->ih.opcode; 751 sig_req->uc_opcode = sig_inputArgs->ih.opcode;
752 sig_req->uc_unique = sig_inputArgs->ih.unique; 752 sig_req->uc_unique = sig_inputArgs->ih.unique;
753 sig_req->uc_inSize = sizeof(struct coda_in_hdr); 753 sig_req->uc_inSize = sizeof(struct coda_in_hdr);
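
The REQ_* to CODA_REQ_* renames in these hunks track the header change that prefixed Coda's upcall flags so they no longer collide with the block layer's REQ_* request flags; only the names change, not the semantics. For orientation, the flag set looks roughly like this (values as believed defined in include/linux/coda_psdev.h; treat them as illustrative):

	#define CODA_REQ_ASYNC	0x1	/* fire-and-forget upcall, freed by psdev */
	#define CODA_REQ_READ	0x2	/* venus has read the request */
	#define CODA_REQ_WRITE	0x4	/* venus has written the reply */
	#define CODA_REQ_ABORT	0x8	/* request aborted, waiter must bail out */
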
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..0644a154672b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -8,13 +8,14 @@
8 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) 8 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
9 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) 9 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
10 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs 10 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
11 * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) 11 * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
12 * 12 *
13 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License version 2 as 14 * it under the terms of the GNU General Public License version 2 as
15 * published by the Free Software Foundation. 15 * published by the Free Software Foundation.
16 */ 16 */
17 17
18#include <linux/stddef.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/linkage.h> 20#include <linux/linkage.h>
20#include <linux/compat.h> 21#include <linux/compat.h>
@@ -76,7 +77,8 @@ int compat_printk(const char *fmt, ...)
76 * Not all architectures have sys_utime, so implement this in terms 77 * Not all architectures have sys_utime, so implement this in terms
77 * of sys_utimes. 78 * of sys_utimes.
78 */ 79 */
79asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __user *t) 80asmlinkage long compat_sys_utime(const char __user *filename,
81 struct compat_utimbuf __user *t)
80{ 82{
81 struct timespec tv[2]; 83 struct timespec tv[2];
82 84
@@ -90,7 +92,7 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
90 return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); 92 return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
91} 93}
92 94
93asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags) 95asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
94{ 96{
95 struct timespec tv[2]; 97 struct timespec tv[2];
96 98
@@ -105,7 +107,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
105 return do_utimes(dfd, filename, t ? tv : NULL, flags); 107 return do_utimes(dfd, filename, t ? tv : NULL, flags);
106} 108}
107 109
108asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t) 110asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
109{ 111{
110 struct timespec tv[2]; 112 struct timespec tv[2];
111 113
@@ -124,7 +126,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, st
124 return do_utimes(dfd, filename, t ? tv : NULL, 0); 126 return do_utimes(dfd, filename, t ? tv : NULL, 0);
125} 127}
126 128
127asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t) 129asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
128{ 130{
129 return compat_sys_futimesat(AT_FDCWD, filename, t); 131 return compat_sys_futimesat(AT_FDCWD, filename, t);
130} 132}
@@ -168,7 +170,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
168 return err; 170 return err;
169} 171}
170 172
171asmlinkage long compat_sys_newstat(char __user * filename, 173asmlinkage long compat_sys_newstat(const char __user * filename,
172 struct compat_stat __user *statbuf) 174 struct compat_stat __user *statbuf)
173{ 175{
174 struct kstat stat; 176 struct kstat stat;
@@ -180,7 +182,7 @@ asmlinkage long compat_sys_newstat(char __user * filename,
180 return cp_compat_stat(&stat, statbuf); 182 return cp_compat_stat(&stat, statbuf);
181} 183}
182 184
183asmlinkage long compat_sys_newlstat(char __user * filename, 185asmlinkage long compat_sys_newlstat(const char __user * filename,
184 struct compat_stat __user *statbuf) 186 struct compat_stat __user *statbuf)
185{ 187{
186 struct kstat stat; 188 struct kstat stat;
@@ -193,7 +195,8 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
193} 195}
194 196
195#ifndef __ARCH_WANT_STAT64 197#ifndef __ARCH_WANT_STAT64
196asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, 198asmlinkage long compat_sys_newfstatat(unsigned int dfd,
199 const char __user *filename,
197 struct compat_stat __user *statbuf, int flag) 200 struct compat_stat __user *statbuf, int flag)
198{ 201{
199 struct kstat stat; 202 struct kstat stat;
@@ -266,7 +269,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
266 error = user_path(pathname, &path); 269 error = user_path(pathname, &path);
267 if (!error) { 270 if (!error) {
268 struct kstatfs tmp; 271 struct kstatfs tmp;
269 error = vfs_statfs(path.dentry, &tmp); 272 error = vfs_statfs(&path, &tmp);
270 if (!error) 273 if (!error)
271 error = put_compat_statfs(buf, &tmp); 274 error = put_compat_statfs(buf, &tmp);
272 path_put(&path); 275 path_put(&path);
@@ -284,7 +287,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
284 file = fget(fd); 287 file = fget(fd);
285 if (!file) 288 if (!file)
286 goto out; 289 goto out;
287 error = vfs_statfs(file->f_path.dentry, &tmp); 290 error = vfs_statfs(&file->f_path, &tmp);
288 if (!error) 291 if (!error)
289 error = put_compat_statfs(buf, &tmp); 292 error = put_compat_statfs(buf, &tmp);
290 fput(file); 293 fput(file);
@@ -334,7 +337,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
334 error = user_path(pathname, &path); 337 error = user_path(pathname, &path);
335 if (!error) { 338 if (!error) {
336 struct kstatfs tmp; 339 struct kstatfs tmp;
337 error = vfs_statfs(path.dentry, &tmp); 340 error = vfs_statfs(&path, &tmp);
338 if (!error) 341 if (!error)
339 error = put_compat_statfs64(buf, &tmp); 342 error = put_compat_statfs64(buf, &tmp);
340 path_put(&path); 343 path_put(&path);
@@ -355,7 +358,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
355 file = fget(fd); 358 file = fget(fd);
356 if (!file) 359 if (!file)
357 goto out; 360 goto out;
358 error = vfs_statfs(file->f_path.dentry, &tmp); 361 error = vfs_statfs(&file->f_path, &tmp);
359 if (!error) 362 if (!error)
360 error = put_compat_statfs64(buf, &tmp); 363 error = put_compat_statfs64(buf, &tmp);
361 fput(file); 364 fput(file);
@@ -378,7 +381,7 @@ asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
378 sb = user_get_super(new_decode_dev(dev)); 381 sb = user_get_super(new_decode_dev(dev));
379 if (!sb) 382 if (!sb)
380 return -EINVAL; 383 return -EINVAL;
381 err = vfs_statfs(sb->s_root, &sbuf); 384 err = statfs_by_dentry(sb->s_root, &sbuf);
382 drop_super(sb); 385 drop_super(sb);
383 if (err) 386 if (err)
384 return err; 387 return err;
@@ -568,6 +571,79 @@ out:
568 return ret; 571 return ret;
569} 572}
570 573
574/* A write operation does a read from user space and vice versa */
575#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
576
577ssize_t compat_rw_copy_check_uvector(int type,
578 const struct compat_iovec __user *uvector, unsigned long nr_segs,
579 unsigned long fast_segs, struct iovec *fast_pointer,
580 struct iovec **ret_pointer)
581{
582 compat_ssize_t tot_len;
583 struct iovec *iov = *ret_pointer = fast_pointer;
584 ssize_t ret = 0;
585 int seg;
586
587 /*
588 * SuS says "The readv() function *may* fail if the iovcnt argument
589 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has
590 * traditionally returned zero for zero segments, so...
591 */
592 if (nr_segs == 0)
593 goto out;
594
595 ret = -EINVAL;
596 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
597 goto out;
598 if (nr_segs > fast_segs) {
599 ret = -ENOMEM;
600 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
601 if (iov == NULL) {
602 *ret_pointer = fast_pointer;
603 goto out;
604 }
605 }
606 *ret_pointer = iov;
607
608 /*
609 * Single unix specification:
610 * We should return -EINVAL if an element length is < 0 or does not
611 * fit in an ssize_t, and the total length must also fit in an ssize_t.
612 *
613 * Be careful here because iov_len is a size_t not an ssize_t
614 */
615 tot_len = 0;
616 ret = -EINVAL;
617 for (seg = 0; seg < nr_segs; seg++) {
618 compat_ssize_t tmp = tot_len;
619 compat_uptr_t buf;
620 compat_ssize_t len;
621
622 if (__get_user(len, &uvector->iov_len) ||
623 __get_user(buf, &uvector->iov_base)) {
624 ret = -EFAULT;
625 goto out;
626 }
627 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
628 goto out;
629 tot_len += len;
630 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
631 goto out;
632 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
633 ret = -EFAULT;
634 goto out;
635 }
636 iov->iov_base = compat_ptr(buf);
637 iov->iov_len = (compat_size_t) len;
638 uvector++;
639 iov++;
640 }
641 ret = tot_len;
642
643out:
644 return ret;
645}
646
571static inline long 647static inline long
572copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 648copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
573{ 649{
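
compat_rw_copy_check_uvector() factors the 32-bit iovec import and validation out of compat_do_readv_writev() (see the deletions further down) so other compat paths can share it. A hedged usage sketch mirroring the call site later in this file; the wrapper name is hypothetical:

	static ssize_t example_import(const struct compat_iovec __user *uvector,
				      unsigned long nr_segs)
	{
		struct iovec iovstack[UIO_FASTIOV];
		struct iovec *iov = iovstack;
		ssize_t tot_len;

		tot_len = compat_rw_copy_check_uvector(READ, uvector, nr_segs,
						       UIO_FASTIOV, iovstack, &iov);
		if (tot_len > 0) {
			/* ... drive the I/O over iov[0..nr_segs) ... */
		}
		if (iov != iovstack)
			kfree(iov);	/* helper fell back to kmalloc() */
		return tot_len;
	}
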
@@ -600,7 +676,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
600 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 676 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
601 ret = copy_iocb(nr, iocb, iocb64); 677 ret = copy_iocb(nr, iocb, iocb64);
602 if (!ret) 678 if (!ret)
603 ret = sys_io_submit(ctx_id, nr, iocb64); 679 ret = do_io_submit(ctx_id, nr, iocb64, 1);
604 return ret; 680 return ret;
605} 681}
606 682
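
The call change above routes compat io_submit through do_io_submit() with a compat flag (the trailing 1), letting fs/aio.c apply the 32-bit iovec conversion for vectored iocbs. The signature this assumes, per the matching fs/aio.c change in this merge:

	long do_io_submit(aio_context_t ctx_id, long nr,
			  struct iocb __user * __user *iocbpp, bool compat);
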
@@ -763,9 +839,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
763#define NCPFS_NAME "ncpfs" 839#define NCPFS_NAME "ncpfs"
764#define NFS4_NAME "nfs4" 840#define NFS4_NAME "nfs4"
765 841
766asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, 842asmlinkage long compat_sys_mount(const char __user * dev_name,
767 char __user * type, unsigned long flags, 843 const char __user * dir_name,
768 void __user * data) 844 const char __user * type, unsigned long flags,
845 const void __user * data)
769{ 846{
770 char *kernel_type; 847 char *kernel_type;
771 unsigned long data_page; 848 unsigned long data_page;
@@ -818,8 +895,6 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
818 return retval; 895 return retval;
819} 896}
820 897
821#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
822
823struct compat_old_linux_dirent { 898struct compat_old_linux_dirent {
824 compat_ulong_t d_ino; 899 compat_ulong_t d_ino;
825 compat_ulong_t d_offset; 900 compat_ulong_t d_offset;
@@ -908,7 +983,8 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
908 struct compat_linux_dirent __user * dirent; 983 struct compat_linux_dirent __user * dirent;
909 struct compat_getdents_callback *buf = __buf; 984 struct compat_getdents_callback *buf = __buf;
910 compat_ulong_t d_ino; 985 compat_ulong_t d_ino;
911 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t)); 986 int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
987 namlen + 2, sizeof(compat_long_t));
912 988
913 buf->error = -EINVAL; /* only used if we fail.. */ 989 buf->error = -EINVAL; /* only used if we fail.. */
914 if (reclen > buf->count) 990 if (reclen > buf->count)
@@ -995,8 +1071,8 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t
995{ 1071{
996 struct linux_dirent64 __user *dirent; 1072 struct linux_dirent64 __user *dirent;
997 struct compat_getdents_callback64 *buf = __buf; 1073 struct compat_getdents_callback64 *buf = __buf;
998 int jj = NAME_OFFSET(dirent); 1074 int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
999 int reclen = ALIGN(jj + namlen + 1, sizeof(u64)); 1075 sizeof(u64));
1000 u64 off; 1076 u64 off;
1001 1077
1002 buf->error = -EINVAL; /* only used if we fail.. */ 1078 buf->error = -EINVAL; /* only used if we fail.. */
@@ -1077,70 +1153,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1077{ 1153{
1078 compat_ssize_t tot_len; 1154 compat_ssize_t tot_len;
1079 struct iovec iovstack[UIO_FASTIOV]; 1155 struct iovec iovstack[UIO_FASTIOV];
1080 struct iovec *iov=iovstack, *vector; 1156 struct iovec *iov = iovstack;
1081 ssize_t ret; 1157 ssize_t ret;
1082 int seg;
1083 io_fn_t fn; 1158 io_fn_t fn;
1084 iov_fn_t fnv; 1159 iov_fn_t fnv;
1085 1160
1086 /*
1087 * SuS says "The readv() function *may* fail if the iovcnt argument
1088 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1089 * traditionally returned zero for zero segments, so...
1090 */
1091 ret = 0;
1092 if (nr_segs == 0)
1093 goto out;
1094
1095 /*
1096 * First get the "struct iovec" from user memory and
1097 * verify all the pointers
1098 */
1099 ret = -EINVAL; 1161 ret = -EINVAL;
1100 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1101 goto out;
1102 if (!file->f_op) 1162 if (!file->f_op)
1103 goto out; 1163 goto out;
1104 if (nr_segs > UIO_FASTIOV) { 1164
1105 ret = -ENOMEM;
1106 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1107 if (!iov)
1108 goto out;
1109 }
1110 ret = -EFAULT; 1165 ret = -EFAULT;
1111 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1166 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1112 goto out; 1167 goto out;
1113 1168
1114 /* 1169 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1115 * Single unix specification: 1170 UIO_FASTIOV, iovstack, &iov);
1116 * We should -EINVAL if an element length is not >= 0 and fitting an
1117 * ssize_t. The total length is fitting an ssize_t
1118 *
1119 * Be careful here because iov_len is a size_t not an ssize_t
1120 */
1121 tot_len = 0;
1122 vector = iov;
1123 ret = -EINVAL;
1124 for (seg = 0 ; seg < nr_segs; seg++) {
1125 compat_ssize_t tmp = tot_len;
1126 compat_ssize_t len;
1127 compat_uptr_t buf;
1128
1129 if (__get_user(len, &uvector->iov_len) ||
1130 __get_user(buf, &uvector->iov_base)) {
1131 ret = -EFAULT;
1132 goto out;
1133 }
1134 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1135 goto out;
1136 tot_len += len;
1137 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1138 goto out;
1139 vector->iov_base = compat_ptr(buf);
1140 vector->iov_len = (compat_size_t) len;
1141 uvector++;
1142 vector++;
1143 }
1144 if (tot_len == 0) { 1171 if (tot_len == 0) {
1145 ret = 0; 1172 ret = 0;
1146 goto out; 1173 goto out;
@@ -1169,11 +1196,10 @@ out:
1169 if (iov != iovstack) 1196 if (iov != iovstack)
1170 kfree(iov); 1197 kfree(iov);
1171 if ((ret + (type == READ)) > 0) { 1198 if ((ret + (type == READ)) > 0) {
1172 struct dentry *dentry = file->f_path.dentry;
1173 if (type == READ) 1199 if (type == READ)
1174 fsnotify_access(dentry); 1200 fsnotify_access(file);
1175 else 1201 else
1176 fsnotify_modify(dentry); 1202 fsnotify_modify(file);
1177 } 1203 }
1178 return ret; 1204 return ret;
1179} 1205}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 641640dc7ae5..03e59aa318eb 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) 4 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
5 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) 5 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
6 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs 6 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
7 * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) 7 * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
8 * 8 *
9 * These routines maintain argument size conversion between 32bit and 64bit 9 * These routines maintain argument size conversion between 32bit and 64bit
10 * ioctls. 10 * ioctls.
@@ -131,23 +131,6 @@ static int w_long(unsigned int fd, unsigned int cmd,
131 return err; 131 return err;
132} 132}
133 133
134static int rw_long(unsigned int fd, unsigned int cmd,
135 compat_ulong_t __user *argp)
136{
137 mm_segment_t old_fs = get_fs();
138 int err;
139 unsigned long val;
140
141 if(get_user(val, argp))
142 return -EFAULT;
143 set_fs (KERNEL_DS);
144 err = sys_ioctl(fd, cmd, (unsigned long)&val);
145 set_fs (old_fs);
146 if (!err && put_user(val, argp))
147 return -EFAULT;
148 return err;
149}
150
151struct compat_video_event { 134struct compat_video_event {
152 int32_t type; 135 int32_t type;
153 compat_time_t timestamp; 136 compat_time_t timestamp;
@@ -594,15 +577,12 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
594 return err; 577 return err;
595} 578}
596 579
597static int ioc_settimeout(unsigned int fd, unsigned int cmd,
598 compat_ulong_t __user *argp)
599{
600 return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp);
601}
602
603/* Bluetooth ioctls */ 580/* Bluetooth ioctls */
604#define HCIUARTSETPROTO _IOW('U', 200, int) 581#define HCIUARTSETPROTO _IOW('U', 200, int)
605#define HCIUARTGETPROTO _IOR('U', 201, int) 582#define HCIUARTGETPROTO _IOR('U', 201, int)
583#define HCIUARTGETDEVICE _IOR('U', 202, int)
584#define HCIUARTSETFLAGS _IOW('U', 203, int)
585#define HCIUARTGETFLAGS _IOR('U', 204, int)
606 586
607#define BNEPCONNADD _IOW('B', 200, int) 587#define BNEPCONNADD _IOW('B', 200, int)
608#define BNEPCONNDEL _IOW('B', 201, int) 588#define BNEPCONNDEL _IOW('B', 201, int)
@@ -966,6 +946,7 @@ COMPATIBLE_IOCTL(TIOCGPGRP)
966COMPATIBLE_IOCTL(TIOCGPTN) 946COMPATIBLE_IOCTL(TIOCGPTN)
967COMPATIBLE_IOCTL(TIOCSPTLCK) 947COMPATIBLE_IOCTL(TIOCSPTLCK)
968COMPATIBLE_IOCTL(TIOCSERGETLSR) 948COMPATIBLE_IOCTL(TIOCSERGETLSR)
949COMPATIBLE_IOCTL(TIOCSIG)
969#ifdef TCGETS2 950#ifdef TCGETS2
970COMPATIBLE_IOCTL(TCGETS2) 951COMPATIBLE_IOCTL(TCGETS2)
971COMPATIBLE_IOCTL(TCSETS2) 952COMPATIBLE_IOCTL(TCSETS2)
@@ -1281,13 +1262,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
1281COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) 1262COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
1282COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) 1263COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
1283COMPATIBLE_IOCTL(OSS_GETVERSION) 1264COMPATIBLE_IOCTL(OSS_GETVERSION)
1284/* AUTOFS */
1285COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
1286COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
1287COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
1288COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
1289COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
1290COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
1291/* Raw devices */ 1265/* Raw devices */
1292COMPATIBLE_IOCTL(RAW_SETBIND) 1266COMPATIBLE_IOCTL(RAW_SETBIND)
1293COMPATIBLE_IOCTL(RAW_GETBIND) 1267COMPATIBLE_IOCTL(RAW_GETBIND)
@@ -1328,6 +1302,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL)
1328COMPATIBLE_IOCTL(HCISETLINKMODE) 1302COMPATIBLE_IOCTL(HCISETLINKMODE)
1329COMPATIBLE_IOCTL(HCISETACLMTU) 1303COMPATIBLE_IOCTL(HCISETACLMTU)
1330COMPATIBLE_IOCTL(HCISETSCOMTU) 1304COMPATIBLE_IOCTL(HCISETSCOMTU)
1305COMPATIBLE_IOCTL(HCIBLOCKADDR)
1306COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
1331COMPATIBLE_IOCTL(HCIINQUIRY) 1307COMPATIBLE_IOCTL(HCIINQUIRY)
1332COMPATIBLE_IOCTL(HCIUARTSETPROTO) 1308COMPATIBLE_IOCTL(HCIUARTSETPROTO)
1333COMPATIBLE_IOCTL(HCIUARTGETPROTO) 1309COMPATIBLE_IOCTL(HCIUARTGETPROTO)
@@ -1552,9 +1528,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1552 case RAW_GETBIND: 1528 case RAW_GETBIND:
1553 return raw_ioctl(fd, cmd, argp); 1529 return raw_ioctl(fd, cmd, argp);
1554#endif 1530#endif
1555#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
1556 case AUTOFS_IOC_SETTIMEOUT32:
1557 return ioc_settimeout(fd, cmd, argp);
1558 /* One SMB ioctl needs translations. */ 1531 /* One SMB ioctl needs translations. */
1559#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) 1532#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
1560 case SMB_IOC_GETMOUNTUID_32: 1533 case SMB_IOC_GETMOUNTUID_32:
@@ -1609,9 +1582,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1609 case KDSKBMETA: 1582 case KDSKBMETA:
1610 case KDSKBLED: 1583 case KDSKBLED:
1611 case KDSETLED: 1584 case KDSETLED:
1612 /* AUTOFS */
1613 case AUTOFS_IOC_READY:
1614 case AUTOFS_IOC_FAIL:
1615 /* NBD */ 1585 /* NBD */
1616 case NBD_SET_SOCK: 1586 case NBD_SET_SOCK:
1617 case NBD_SET_BLKSIZE: 1587 case NBD_SET_BLKSIZE:
@@ -1729,8 +1699,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
1729 goto out_fput; 1699 goto out_fput;
1730 } 1700 }
1731 1701
1732 if (!filp->f_op || 1702 if (!filp->f_op || !filp->f_op->unlocked_ioctl)
1733 (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl))
1734 goto do_ioctl; 1703 goto do_ioctl;
1735 break; 1704 break;
1736 } 1705 }
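
With file_operations::ioctl gone, compat_sys_ioctl() above only needs to probe ->unlocked_ioctl before falling back to the translation table; the AUTOFS entries removed from that table correspond to autofs providing its own ->compat_ioctl. Illustrative wiring for a driver whose ioctl commands are already 32/64-bit clean (names hypothetical):

	static const struct file_operations example_fops = {
		.owner		= THIS_MODULE,
		.unlocked_ioctl	= example_unlocked_ioctl,
	#ifdef CONFIG_COMPAT
		.compat_ioctl	= example_unlocked_ioctl,	/* no translation needed */
	#endif
	};
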
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c8af2d91174b..cf78d44a8d6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -73,15 +73,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
73 return -EINVAL; 73 return -EINVAL;
74 74
75 sd_iattr = sd->s_iattr; 75 sd_iattr = sd->s_iattr;
76
77 error = inode_change_ok(inode, iattr);
78 if (error)
79 return error;
80
81 error = inode_setattr(inode, iattr);
82 if (error)
83 return error;
84
85 if (!sd_iattr) { 76 if (!sd_iattr) {
86 /* setting attributes for the first time, allocate now */ 77 /* setting attributes for the first time, allocate now */
87 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 78 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
@@ -94,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
94 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 85 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
95 sd->s_iattr = sd_iattr; 86 sd->s_iattr = sd_iattr;
96 } 87 }
97
98 /* attributes were changed at least once in the past */ 88
99 89
90 error = simple_setattr(dentry, iattr);
91 if (error)
92 return error;
93
100 if (ia_valid & ATTR_UID) 94 if (ia_valid & ATTR_UID)
101 sd_iattr->ia_uid = iattr->ia_uid; 95 sd_iattr->ia_uid = iattr->ia_uid;
102 if (ia_valid & ATTR_GID) 96 if (ia_valid & ATTR_GID)
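
The configfs_setattr() reordering replaces the open-coded inode_change_ok()/inode_setattr() pair with simple_setattr() and, more importantly, moves the attribute application after the sd_iattr allocation, so an allocation failure no longer leaves the in-core inode updated while the backing iattr is lost. The resulting shape, in outline:

	/* 1. allocate/prepare private state; failure here is side-effect free */
	/* 2. let the VFS check and apply the change to the in-core inode */
	error = simple_setattr(dentry, iattr);
	if (error)
		return error;
	/* 3. mirror the now-accepted change into sd->s_iattr */
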
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index dd3634e4c967..1e7a33028d33 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -39,66 +39,55 @@ static DEFINE_MUTEX(read_mutex);
39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1) 39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
40#define OFFSET(x) ((x)->i_ino) 40#define OFFSET(x) ((x)->i_ino)
41 41
42 42static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode)
43static int cramfs_iget5_test(struct inode *inode, void *opaque)
44{
45 struct cramfs_inode *cramfs_inode = opaque;
46 return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
47}
48
49static int cramfs_iget5_set(struct inode *inode, void *opaque)
50{ 43{
51 struct cramfs_inode *cramfs_inode = opaque; 44 static struct timespec zerotime;
52 inode->i_ino = CRAMINO(cramfs_inode); 45 inode->i_mode = cramfs_inode->mode;
53 return 0; 46 inode->i_uid = cramfs_inode->uid;
47 inode->i_size = cramfs_inode->size;
48 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
49 inode->i_gid = cramfs_inode->gid;
50 /* Struct copy intentional */
51 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
52 /* inode->i_nlink is left 1 - arguably wrong for directories,
53 but it's the best we can do without reading the directory
54 contents. 1 yields the right result in GNU find, even
55 without the -noleaf option. */
56 if (S_ISREG(inode->i_mode)) {
57 inode->i_fop = &generic_ro_fops;
58 inode->i_data.a_ops = &cramfs_aops;
59 } else if (S_ISDIR(inode->i_mode)) {
60 inode->i_op = &cramfs_dir_inode_operations;
61 inode->i_fop = &cramfs_directory_operations;
62 } else if (S_ISLNK(inode->i_mode)) {
63 inode->i_op = &page_symlink_inode_operations;
64 inode->i_data.a_ops = &cramfs_aops;
65 } else {
66 init_special_inode(inode, inode->i_mode,
67 old_decode_dev(cramfs_inode->size));
68 }
54} 69}
55 70
56static struct inode *get_cramfs_inode(struct super_block *sb, 71static struct inode *get_cramfs_inode(struct super_block *sb,
57 struct cramfs_inode * cramfs_inode) 72 struct cramfs_inode * cramfs_inode)
58{ 73{
59 struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), 74 struct inode *inode;
60 cramfs_iget5_test, cramfs_iget5_set, 75 if (CRAMINO(cramfs_inode) == 1) {
61 cramfs_inode); 76 inode = new_inode(sb);
62 static struct timespec zerotime; 77 if (inode) {
63 78 inode->i_ino = 1;
64 if (inode && (inode->i_state & I_NEW)) { 79 setup_inode(inode, cramfs_inode);
65 inode->i_mode = cramfs_inode->mode; 80 }
66 inode->i_uid = cramfs_inode->uid; 81 } else {
67 inode->i_size = cramfs_inode->size; 82 inode = iget_locked(sb, CRAMINO(cramfs_inode));
68 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; 83 if (inode && (inode->i_state & I_NEW)) {
69 inode->i_gid = cramfs_inode->gid; 84 setup_inode(inode, cramfs_inode);
70 /* Struct copy intentional */ 85 unlock_new_inode(inode);
71 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
72 /* inode->i_nlink is left 1 - arguably wrong for directories,
73 but it's the best we can do without reading the directory
74 contents. 1 yields the right result in GNU find, even
75 without -noleaf option. */
76 if (S_ISREG(inode->i_mode)) {
77 inode->i_fop = &generic_ro_fops;
78 inode->i_data.a_ops = &cramfs_aops;
79 } else if (S_ISDIR(inode->i_mode)) {
80 inode->i_op = &cramfs_dir_inode_operations;
81 inode->i_fop = &cramfs_directory_operations;
82 } else if (S_ISLNK(inode->i_mode)) {
83 inode->i_op = &page_symlink_inode_operations;
84 inode->i_data.a_ops = &cramfs_aops;
85 } else {
86 init_special_inode(inode, inode->i_mode,
87 old_decode_dev(cramfs_inode->size));
88 } 86 }
89 unlock_new_inode(inode);
90 } 87 }
91 return inode; 88 return inode;
92} 89}
93 90
94static void cramfs_drop_inode(struct inode *inode)
95{
96 if (inode->i_ino == 1)
97 generic_delete_inode(inode);
98 else
99 generic_drop_inode(inode);
100}
101
102/* 91/*
103 * We have our own block cache: don't fill up the buffer cache 92 * We have our own block cache: don't fill up the buffer cache
104 * with the rom-image, because the way the filesystem is set 93 * with the rom-image, because the way the filesystem is set
@@ -542,7 +531,6 @@ static const struct super_operations cramfs_ops = {
542 .put_super = cramfs_put_super, 531 .put_super = cramfs_put_super,
543 .remount_fs = cramfs_remount, 532 .remount_fs = cramfs_remount,
544 .statfs = cramfs_statfs, 533 .statfs = cramfs_statfs,
545 .drop_inode = cramfs_drop_inode,
546}; 534};
547 535
548static int cramfs_get_sb(struct file_system_type *fs_type, 536static int cramfs_get_sb(struct file_system_type *fs_type,
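
Why the inode-1 special case in get_cramfs_inode() above: CRAMINO() collapses every inode without data to ino 1, so distinct empty files would alias one hash entry. Keeping those inodes out of the hash via new_inode() avoids the aliasing and makes the deleted cramfs_drop_inode() hook unnecessary, since unhashed inodes are already destroyed on final iput(). The collision in miniature:

	/* as defined at the top of this file */
	#define CRAMINO(x) (((x)->offset && (x)->size) ? (x)->offset << 2 : 1)
	/* offset == 0 or size == 0  =>  ino 1 for *every* such inode */
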
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a59..83293be48149 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
536 */ 536 */
537static void prune_dcache(int count) 537static void prune_dcache(int count)
538{ 538{
539 struct super_block *sb; 539 struct super_block *sb, *p = NULL;
540 int w_count; 540 int w_count;
541 int unused = dentry_stat.nr_unused; 541 int unused = dentry_stat.nr_unused;
542 int prune_ratio; 542 int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
545 if (unused == 0 || count == 0) 545 if (unused == 0 || count == 0)
546 return; 546 return;
547 spin_lock(&dcache_lock); 547 spin_lock(&dcache_lock);
548restart:
549 if (count >= unused) 548 if (count >= unused)
550 prune_ratio = 1; 549 prune_ratio = 1;
551 else 550 else
552 prune_ratio = unused / count; 551 prune_ratio = unused / count;
553 spin_lock(&sb_lock); 552 spin_lock(&sb_lock);
554 list_for_each_entry(sb, &super_blocks, s_list) { 553 list_for_each_entry(sb, &super_blocks, s_list) {
554 if (list_empty(&sb->s_instances))
555 continue;
555 if (sb->s_nr_dentry_unused == 0) 556 if (sb->s_nr_dentry_unused == 0)
556 continue; 557 continue;
557 sb->s_count++; 558 sb->s_count++;
@@ -589,16 +590,16 @@ restart:
589 up_read(&sb->s_umount); 590 up_read(&sb->s_umount);
590 } 591 }
591 spin_lock(&sb_lock); 592 spin_lock(&sb_lock);
593 if (p)
594 __put_super(p);
592 count -= pruned; 595 count -= pruned;
593 /* 596 p = sb;
594 * restart only when sb is no longer on the list and 597 /* more work left to do? */
595 * we have more work to do. 598 if (count <= 0)
596 */ 599 break;
597 if (__put_super_and_need_restart(sb) && count > 0) {
598 spin_unlock(&sb_lock);
599 goto restart;
600 }
601 } 600 }
601 if (p)
602 __put_super(p);
602 spin_unlock(&sb_lock); 603 spin_unlock(&sb_lock);
603 spin_unlock(&dcache_lock); 604 spin_unlock(&dcache_lock);
604} 605}
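
The prune_dcache() rework drops the restart dance in favour of a defer-one-put iteration: keep the previous superblock pinned (s_count) until the cursor has advanced, then release it while still holding sb_lock, so list_for_each_entry() never walks a freed entry. A sketch of the pattern, as a fragment under the same locking assumptions:

	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (list_empty(&sb->s_instances))
			continue;	/* skip dying superblocks */
		sb->s_count++;
		/* ... drop sb_lock, do per-sb work, retake sb_lock ... */
		if (p)
			__put_super(p);	/* safe: cursor already moved past p */
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
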
@@ -897,7 +898,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
897 * 898 *
898 * In this case we return -1 to tell the caller that we baled. 899 * In this case we return -1 to tell the caller that we baled.
899 */ 900 */
900static int shrink_dcache_memory(int nr, gfp_t gfp_mask) 901static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
901{ 902{
902 if (nr) { 903 if (nr) {
903 if (!(gfp_mask & __GFP_FS)) 904 if (!(gfp_mask & __GFP_FS))
@@ -1331,31 +1332,13 @@ EXPORT_SYMBOL(d_add_ci);
1331 * d_lookup - search for a dentry 1332 * d_lookup - search for a dentry
1332 * @parent: parent dentry 1333 * @parent: parent dentry
1333 * @name: qstr of name we wish to find 1334 * @name: qstr of name we wish to find
1335 * Returns: dentry, or NULL
1334 * 1336 *
1335 * Searches the children of the parent dentry for the name in question. If 1337 * d_lookup searches the children of the parent dentry for the name in
1336 * the dentry is found its reference count is incremented and the dentry 1338 * question. If the dentry is found its reference count is incremented and the
1337 * is returned. The caller must use dput to free the entry when it has 1339 * dentry is returned. The caller must use dput to free the entry when it has
1338 * finished using it. %NULL is returned on failure. 1340 * finished using it. %NULL is returned if the dentry does not exist.
1339 *
1340 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
1341 * Memory barriers are used while updating and doing lockless traversal.
1342 * To avoid races with d_move while rename is happening, d_lock is used.
1343 *
1344 * Overflows in memcmp(), while d_move, are avoided by keeping the length
1345 * and name pointer in one structure pointed by d_qstr.
1346 *
1347 * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
1348 * lookup is going on.
1349 *
1350 * The dentry unused LRU is not updated even if lookup finds the required dentry
1351 * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
1352 * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
1353 * acquisition.
1354 *
1355 * d_lookup() is protected against the concurrent renames in some unrelated
1356 * directory using the seqlockt_t rename_lock.
1357 */ 1341 */
1358
1359struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1342struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1360{ 1343{
1361 struct dentry * dentry = NULL; 1344 struct dentry * dentry = NULL;
@@ -1371,6 +1354,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1371} 1354}
1372EXPORT_SYMBOL(d_lookup); 1355EXPORT_SYMBOL(d_lookup);
1373 1356
1357/*
1358 * __d_lookup - search for a dentry (racy)
1359 * @parent: parent dentry
1360 * @name: qstr of name we wish to find
1361 * Returns: dentry, or NULL
1362 *
1363 * __d_lookup is like d_lookup, however it may (rarely) return a
1364 * false-negative result due to unrelated rename activity.
1365 *
1366 * __d_lookup is slightly faster by avoiding the rename_lock read seqlock;
1367 * however, it must be used carefully, e.g. with a following d_lookup in
1368 * the case of failure.
1369 *
1370 * __d_lookup callers must be commented.
1371 */
1374struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1372struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1375{ 1373{
1376 unsigned int len = name->len; 1374 unsigned int len = name->len;
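
The comment added above spells out the contract; the canonical caller pattern is a fast racy probe followed by a seqlock-protected retry, roughly:

	dentry = __d_lookup(parent, name);	/* fast; may falsely miss */
	if (!dentry)
		dentry = d_lookup(parent, name);	/* rename-safe slow path */
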
@@ -1381,6 +1379,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1381 struct hlist_node *node; 1379 struct hlist_node *node;
1382 struct dentry *dentry; 1380 struct dentry *dentry;
1383 1381
1382 /*
1383 * The hash list is protected using RCU.
1384 *
1385 * Take d_lock when comparing a candidate dentry, to avoid races
1386 * with d_move().
1387 *
1388 * It is possible that concurrent renames can mess up our list
1389 * walk here and result in missing our dentry, resulting in the
1390 * false-negative result. d_lookup() protects against concurrent
1391 * renames using rename_lock seqlock.
1392 *
1393 * See Documentation/vfs/dcache-locking.txt for more details.
1394 */
1384 rcu_read_lock(); 1395 rcu_read_lock();
1385 1396
1386 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1397 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1395,8 +1406,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1395 1406
1396 /* 1407 /*
1397 * Recheck the dentry after taking the lock - d_move may have 1408 * Recheck the dentry after taking the lock - d_move may have
1398 * changed things. Don't bother checking the hash because we're 1409 * changed things. Don't bother checking the hash because
1399 * about to compare the whole name anyway. 1410 * we're about to compare the whole name anyway.
1400 */ 1411 */
1401 if (dentry->d_parent != parent) 1412 if (dentry->d_parent != parent)
1402 goto next; 1413 goto next;
@@ -1529,6 +1540,7 @@ void d_delete(struct dentry * dentry)
1529 spin_lock(&dentry->d_lock); 1540 spin_lock(&dentry->d_lock);
1530 isdir = S_ISDIR(dentry->d_inode->i_mode); 1541 isdir = S_ISDIR(dentry->d_inode->i_mode);
1531 if (atomic_read(&dentry->d_count) == 1) { 1542 if (atomic_read(&dentry->d_count) == 1) {
1543 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1532 dentry_iput(dentry); 1544 dentry_iput(dentry);
1533 fsnotify_nameremove(dentry, isdir); 1545 fsnotify_nameremove(dentry, isdir);
1534 return; 1546 return;
@@ -1903,48 +1915,30 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1903} 1915}
1904 1916
1905/** 1917/**
1906 * __d_path - return the path of a dentry 1918 * prepend_path - Prepend path string to a buffer
1919 *
1907 * @path: the dentry/vfsmount to report 1920 * @path: the dentry/vfsmount to report
1908 * @root: root vfsmnt/dentry (may be modified by this function) 1921 * @root: root vfsmnt/dentry (may be modified by this function)
1909 * @buffer: buffer to return value in 1922 * @buffer: pointer to the end of the buffer
1910 * @buflen: buffer length 1923 * @buflen: pointer to buffer length
1911 *
1912 * Convert a dentry into an ASCII path name. If the entry has been deleted
1913 * the string " (deleted)" is appended. Note that this is ambiguous.
1914 * 1924 *
1915 * Returns a pointer into the buffer or an error code if the 1925 * Caller holds the dcache_lock.
1916 * path was too long.
1917 *
1918 * "buflen" should be positive. Caller holds the dcache_lock.
1919 * 1926 *
1920 * If path is not reachable from the supplied root, then the value of 1927 * If path is not reachable from the supplied root, then the value of
1921 * root is changed (without modifying refcounts). 1928 * root is changed (without modifying refcounts).
1922 */ 1929 */
1923char *__d_path(const struct path *path, struct path *root, 1930static int prepend_path(const struct path *path, struct path *root,
1924 char *buffer, int buflen) 1931 char **buffer, int *buflen)
1925{ 1932{
1926 struct dentry *dentry = path->dentry; 1933 struct dentry *dentry = path->dentry;
1927 struct vfsmount *vfsmnt = path->mnt; 1934 struct vfsmount *vfsmnt = path->mnt;
1928 char *end = buffer + buflen; 1935 bool slash = false;
1929 char *retval; 1936 int error = 0;
1930
1931 spin_lock(&vfsmount_lock);
1932 prepend(&end, &buflen, "\0", 1);
1933 if (d_unlinked(dentry) &&
1934 (prepend(&end, &buflen, " (deleted)", 10) != 0))
1935 goto Elong;
1936 1937
1937 if (buflen < 1) 1938 br_read_lock(vfsmount_lock);
1938 goto Elong; 1939 while (dentry != root->dentry || vfsmnt != root->mnt) {
1939 /* Get '/' right */
1940 retval = end-1;
1941 *retval = '/';
1942
1943 for (;;) {
1944 struct dentry * parent; 1940 struct dentry * parent;
1945 1941
1946 if (dentry == root->dentry && vfsmnt == root->mnt)
1947 break;
1948 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { 1942 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
1949 /* Global root? */ 1943 /* Global root? */
1950 if (vfsmnt->mnt_parent == vfsmnt) { 1944 if (vfsmnt->mnt_parent == vfsmnt) {
@@ -1956,28 +1950,88 @@ char *__d_path(const struct path *path, struct path *root,
1956 } 1950 }
1957 parent = dentry->d_parent; 1951 parent = dentry->d_parent;
1958 prefetch(parent); 1952 prefetch(parent);
1959 if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || 1953 error = prepend_name(buffer, buflen, &dentry->d_name);
1960 (prepend(&end, &buflen, "/", 1) != 0)) 1954 if (!error)
1961 goto Elong; 1955 error = prepend(buffer, buflen, "/", 1);
1962 retval = end; 1956 if (error)
1957 break;
1958
1959 slash = true;
1963 dentry = parent; 1960 dentry = parent;
1964 } 1961 }
1965 1962
1966out: 1963out:
1967 spin_unlock(&vfsmount_lock); 1964 if (!error && !slash)
1968 return retval; 1965 error = prepend(buffer, buflen, "/", 1);
1966
1967 br_read_unlock(vfsmount_lock);
1968 return error;
1969 1969
1970global_root: 1970global_root:
1971 retval += 1; /* hit the slash */ 1971 /*
1972 if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) 1972 * Filesystems needing to implement special "root names"
1973 goto Elong; 1973 * should do so with ->d_dname()
1974 */
1975 if (IS_ROOT(dentry) &&
1976 (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
1977 WARN(1, "Root dentry has weird name <%.*s>\n",
1978 (int) dentry->d_name.len, dentry->d_name.name);
1979 }
1974 root->mnt = vfsmnt; 1980 root->mnt = vfsmnt;
1975 root->dentry = dentry; 1981 root->dentry = dentry;
1976 goto out; 1982 goto out;
1983}
1977 1984
1978Elong: 1985/**
1979 retval = ERR_PTR(-ENAMETOOLONG); 1986 * __d_path - return the path of a dentry
1980 goto out; 1987 * @path: the dentry/vfsmount to report
1988 * @root: root vfsmnt/dentry (may be modified by this function)
1989 * @buf: buffer to return value in
1990 * @buflen: buffer length
1991 *
1992 * Convert a dentry into an ASCII path name.
1993 *
1994 * Returns a pointer into the buffer or an error code if the
1995 * path was too long.
1996 *
1997 * "buflen" should be positive. Caller holds the dcache_lock.
1998 *
1999 * If path is not reachable from the supplied root, then the value of
2000 * root is changed (without modifying refcounts).
2001 */
2002char *__d_path(const struct path *path, struct path *root,
2003 char *buf, int buflen)
2004{
2005 char *res = buf + buflen;
2006 int error;
2007
2008 prepend(&res, &buflen, "\0", 1);
2009 error = prepend_path(path, root, &res, &buflen);
2010 if (error)
2011 return ERR_PTR(error);
2012
2013 return res;
2014}
2015
2016/*
2017 * same as __d_path but appends "(deleted)" for unlinked files.
2018 */
2019static int path_with_deleted(const struct path *path, struct path *root,
2020 char **buf, int *buflen)
2021{
2022 prepend(buf, buflen, "\0", 1);
2023 if (d_unlinked(path->dentry)) {
2024 int error = prepend(buf, buflen, " (deleted)", 10);
2025 if (error)
2026 return error;
2027 }
2028
2029 return prepend_path(path, root, buf, buflen);
2030}
2031
2032static int prepend_unreachable(char **buffer, int *buflen)
2033{
2034 return prepend(buffer, buflen, "(unreachable)", 13);
1981} 2035}
1982 2036
1983/** 2037/**
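
All of these helpers share one convention: the buffer fills from the end backwards, with *buffer and *buflen stepping left as each component is prepended, which is why callers pass buf + buflen and read the result out of the updated pointer. For reference, the prepend() primitive they build on is essentially this (reconstructed from the unchanged code earlier in this file):

	static int prepend(char **buffer, int *buflen, const char *str, int namelen)
	{
		*buflen -= namelen;
		if (*buflen < 0)
			return -ENAMETOOLONG;
		*buffer -= namelen;
		memcpy(*buffer, str, namelen);
		return 0;
	}
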
@@ -1998,9 +2052,10 @@ Elong:
1998 */ 2052 */
1999char *d_path(const struct path *path, char *buf, int buflen) 2053char *d_path(const struct path *path, char *buf, int buflen)
2000{ 2054{
2001 char *res; 2055 char *res = buf + buflen;
2002 struct path root; 2056 struct path root;
2003 struct path tmp; 2057 struct path tmp;
2058 int error;
2004 2059
2005 /* 2060 /*
2006 * We have various synthetic filesystems that never get mounted. On 2061 * We have various synthetic filesystems that never get mounted. On
@@ -2012,19 +2067,51 @@ char *d_path(const struct path *path, char *buf, int buflen)
2012 if (path->dentry->d_op && path->dentry->d_op->d_dname) 2067 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2013 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2068 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2014 2069
2015 read_lock(&current->fs->lock); 2070 get_fs_root(current->fs, &root);
2016 root = current->fs->root;
2017 path_get(&root);
2018 read_unlock(&current->fs->lock);
2019 spin_lock(&dcache_lock); 2071 spin_lock(&dcache_lock);
2020 tmp = root; 2072 tmp = root;
2021 res = __d_path(path, &tmp, buf, buflen); 2073 error = path_with_deleted(path, &tmp, &res, &buflen);
2074 if (error)
2075 res = ERR_PTR(error);
2022 spin_unlock(&dcache_lock); 2076 spin_unlock(&dcache_lock);
2023 path_put(&root); 2077 path_put(&root);
2024 return res; 2078 return res;
2025} 2079}
2026EXPORT_SYMBOL(d_path); 2080EXPORT_SYMBOL(d_path);
2027 2081
2082/**
2083 * d_path_with_unreachable - return the path of a dentry
2084 * @path: path to report
2085 * @buf: buffer to return value in
2086 * @buflen: buffer length
2087 *
2088 * The difference from d_path() is that this prepends "(unreachable)"
2089 * to paths which are unreachable from the current process' root.
2090 */
2091char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2092{
2093 char *res = buf + buflen;
2094 struct path root;
2095 struct path tmp;
2096 int error;
2097
2098 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2099 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2100
2101 get_fs_root(current->fs, &root);
2102 spin_lock(&dcache_lock);
2103 tmp = root;
2104 error = path_with_deleted(path, &tmp, &res, &buflen);
2105 if (!error && !path_equal(&tmp, &root))
2106 error = prepend_unreachable(&res, &buflen);
2107 spin_unlock(&dcache_lock);
2108 path_put(&root);
2109 if (error)
2110 res = ERR_PTR(error);
2111
2112 return res;
2113}
2114
2028/* 2115/*
2029 * Helper function for dentry_operations.d_dname() members 2116 * Helper function for dentry_operations.d_dname() members
2030 */ 2117 */
@@ -2049,16 +2136,12 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
2049/* 2136/*
2050 * Write full pathname from the root of the filesystem into the buffer. 2137 * Write full pathname from the root of the filesystem into the buffer.
2051 */ 2138 */
2052char *dentry_path(struct dentry *dentry, char *buf, int buflen) 2139char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2053{ 2140{
2054 char *end = buf + buflen; 2141 char *end = buf + buflen;
2055 char *retval; 2142 char *retval;
2056 2143
2057 spin_lock(&dcache_lock);
2058 prepend(&end, &buflen, "\0", 1); 2144 prepend(&end, &buflen, "\0", 1);
2059 if (d_unlinked(dentry) &&
2060 (prepend(&end, &buflen, "//deleted", 9) != 0))
2061 goto Elong;
2062 if (buflen < 1) 2145 if (buflen < 1)
2063 goto Elong; 2146 goto Elong;
2064 /* Get '/' right */ 2147 /* Get '/' right */
@@ -2076,7 +2159,28 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2076 retval = end; 2159 retval = end;
2077 dentry = parent; 2160 dentry = parent;
2078 } 2161 }
2162 return retval;
2163Elong:
2164 return ERR_PTR(-ENAMETOOLONG);
2165}
2166EXPORT_SYMBOL(__dentry_path);
2167
2168char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2169{
2170 char *p = NULL;
2171 char *retval;
2172
2173 spin_lock(&dcache_lock);
2174 if (d_unlinked(dentry)) {
2175 p = buf + buflen;
2176 if (prepend(&p, &buflen, "//deleted", 10) != 0)
2177 goto Elong;
2178 buflen++;
2179 }
2180 retval = __dentry_path(dentry, buf, buflen);
2079 spin_unlock(&dcache_lock); 2181 spin_unlock(&dcache_lock);
2182 if (!IS_ERR(retval) && p)
2183 *p = '/'; /* restore '/' overwritten with '\0' */
2080 return retval; 2184 return retval;
2081Elong: 2185Elong:
2082 spin_unlock(&dcache_lock); 2186 spin_unlock(&dcache_lock);
@@ -2110,27 +2214,30 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2110 if (!page) 2214 if (!page)
2111 return -ENOMEM; 2215 return -ENOMEM;
2112 2216
2113 read_lock(&current->fs->lock); 2217 get_fs_root_and_pwd(current->fs, &root, &pwd);
2114 pwd = current->fs->pwd;
2115 path_get(&pwd);
2116 root = current->fs->root;
2117 path_get(&root);
2118 read_unlock(&current->fs->lock);
2119 2218
2120 error = -ENOENT; 2219 error = -ENOENT;
2121 spin_lock(&dcache_lock); 2220 spin_lock(&dcache_lock);
2122 if (!d_unlinked(pwd.dentry)) { 2221 if (!d_unlinked(pwd.dentry)) {
2123 unsigned long len; 2222 unsigned long len;
2124 struct path tmp = root; 2223 struct path tmp = root;
2125 char * cwd; 2224 char *cwd = page + PAGE_SIZE;
2225 int buflen = PAGE_SIZE;
2126 2226
2127 cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE); 2227 prepend(&cwd, &buflen, "\0", 1);
2228 error = prepend_path(&pwd, &tmp, &cwd, &buflen);
2128 spin_unlock(&dcache_lock); 2229 spin_unlock(&dcache_lock);
2129 2230
2130 error = PTR_ERR(cwd); 2231 if (error)
2131 if (IS_ERR(cwd))
2132 goto out; 2232 goto out;
2133 2233
2234 /* Unreachable from current root */
2235 if (!path_equal(&tmp, &root)) {
2236 error = prepend_unreachable(&cwd, &buflen);
2237 if (error)
2238 goto out;
2239 }
2240
2134 error = -ERANGE; 2241 error = -ERANGE;
2135 len = PAGE_SIZE + page - cwd; 2242 len = PAGE_SIZE + page - cwd;
2136 if (len <= size) { 2243 if (len <= size) {
@@ -2195,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2)
2195 struct vfsmount *mnt = path1->mnt; 2302 struct vfsmount *mnt = path1->mnt;
2196 struct dentry *dentry = path1->dentry; 2303 struct dentry *dentry = path1->dentry;
2197 int res; 2304 int res;
2198 spin_lock(&vfsmount_lock); 2305
2306 br_read_lock(vfsmount_lock);
2199 if (mnt != path2->mnt) { 2307 if (mnt != path2->mnt) {
2200 for (;;) { 2308 for (;;) {
2201 if (mnt->mnt_parent == mnt) { 2309 if (mnt->mnt_parent == mnt) {
2202 spin_unlock(&vfsmount_lock); 2310 br_read_unlock(vfsmount_lock);
2203 return 0; 2311 return 0;
2204 } 2312 }
2205 if (mnt->mnt_parent == path2->mnt) 2313 if (mnt->mnt_parent == path2->mnt)
@@ -2209,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2)
2209 dentry = mnt->mnt_mountpoint; 2317 dentry = mnt->mnt_mountpoint;
2210 } 2318 }
2211 res = is_subdir(dentry, path2->dentry); 2319 res = is_subdir(dentry, path2->dentry);
2212 spin_unlock(&vfsmount_lock); 2320 br_read_unlock(vfsmount_lock);
2213 return res; 2321 return res;
2214} 2322}
2215EXPORT_SYMBOL(path_is_under); 2323EXPORT_SYMBOL(path_is_under);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); 277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); 278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
279 279
280DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
281
280/* 282/*
281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 283 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
282 * 284 *
283 * These functions are exactly the same as the above functions (but use a hex 285 * These functions are exactly the same as the above functions (but use a hex
284 * output for the decimal challenged). For details look at the above unsigned 286 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
357} 359}
358EXPORT_SYMBOL_GPL(debugfs_create_x32); 360EXPORT_SYMBOL_GPL(debugfs_create_x32);
359 361
362/**
363 * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
364 * @name: a pointer to a string containing the name of the file to create.
365 * @mode: the permission that the file should have
366 * @parent: a pointer to the parent dentry for this file. This should be a
367 * directory dentry if set. If this parameter is %NULL, then the
368 * file will be created in the root of the debugfs filesystem.
369 * @value: a pointer to the variable that the file should read to and write
370 * from.
371 */
372struct dentry *debugfs_create_x64(const char *name, mode_t mode,
373 struct dentry *parent, u64 *value)
374{
375 return debugfs_create_file(name, mode, parent, value, &fops_x64);
376}
377EXPORT_SYMBOL_GPL(debugfs_create_x64);
378
360 379
361static int debugfs_size_t_set(void *data, u64 val) 380static int debugfs_size_t_set(void *data, u64 val)
362{ 381{
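
Using the new debugfs_create_x64() mirrors its 8/16/32-bit siblings; a minimal sketch with hypothetical names:

	#include <linux/debugfs.h>

	static u64 example_stat;

	static void example_debugfs_init(struct dentry *parent)
	{
		/* reads/writes example_stat as 0x%016llx */
		debugfs_create_x64("stat", 0644, parent, &example_stat);
	}
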
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0120247b41c0..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
384 s->s_flags |= MS_ACTIVE; 384 s->s_flags |= MS_ACTIVE;
385 } 385 }
386 386
387 simple_set_mnt(mnt, s);
388
389 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); 387 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
390 388
391 error = mknod_ptmx(s); 389 error = mknod_ptmx(s);
392 if (error) 390 if (error)
393 goto out_dput; 391 goto out_undo_sget;
394 392
395 return 0; 393 simple_set_mnt(mnt, s);
396 394
397out_dput: 395 return 0;
398 dput(s->s_root); /* undo dget() in simple_set_mnt() */
399 396
400out_undo_sget: 397out_undo_sget:
401 deactivate_locked_super(s); 398 deactivate_locked_super(s);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..48d74c7391d1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
85 dio_submit_t *submit_io; /* IO submission function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -215,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
215 * filesystems can use it to hold additional state between get_block calls and 218 * filesystems can use it to hold additional state between get_block calls and
216 * dio_complete. 219 * dio_complete.
217 */ 220 */
218static int dio_complete(struct dio *dio, loff_t offset, int ret) 221static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
219{ 222{
220 ssize_t transferred = 0; 223 ssize_t transferred = 0;
221 224
@@ -236,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
236 transferred = dio->i_size - offset; 239 transferred = dio->i_size - offset;
237 } 240 }
238 241
239 if (dio->end_io && dio->result)
240 dio->end_io(dio->iocb, offset, transferred,
241 dio->map_bh.b_private);
242
243 if (dio->flags & DIO_LOCKING)
244 /* lockdep: non-owner release */
245 up_read_non_owner(&dio->inode->i_alloc_sem);
246
247 if (ret == 0) 242 if (ret == 0)
248 ret = dio->page_errors; 243 ret = dio->page_errors;
249 if (ret == 0) 244 if (ret == 0)
@@ -251,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
251 if (ret == 0) 246 if (ret == 0)
252 ret = transferred; 247 ret = transferred;
253 248
249 if (dio->end_io && dio->result) {
250 dio->end_io(dio->iocb, offset, transferred,
251 dio->map_bh.b_private, ret, is_async);
252 } else if (is_async) {
253 aio_complete(dio->iocb, ret, 0);
254 }
255
256 if (dio->flags & DIO_LOCKING)
257 /* lockdep: non-owner release */
258 up_read_non_owner(&dio->inode->i_alloc_sem);
259
254 return ret; 260 return ret;
255} 261}
256 262
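
dio_complete() now hands ->end_io the final return code plus an is_async flag, and the callback, not generic code, completes async iocbs; dio_bio_end_aio() below loses its aio_complete() accordingly. The callback shape this implies (body illustrative):

	static void example_end_io(struct kiocb *iocb, loff_t offset,
				   ssize_t size, void *private,
				   int ret, bool is_async)
	{
		/* ... fs work, e.g. convert extents in [offset, offset + size) ... */
		if (is_async)
			aio_complete(iocb, ret, 0);
	}
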
@@ -274,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
274 spin_unlock_irqrestore(&dio->bio_lock, flags); 280 spin_unlock_irqrestore(&dio->bio_lock, flags);
275 281
276 if (remaining == 0) { 282 if (remaining == 0) {
277 int ret = dio_complete(dio, dio->iocb->ki_pos, 0); 283 dio_complete(dio, dio->iocb->ki_pos, 0, true);
278 aio_complete(dio->iocb, ret, 0);
279 kfree(dio); 284 kfree(dio);
280 } 285 }
281} 286}
@@ -300,6 +305,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 305 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 306}
302 307
308/**
309 * dio_end_io - handle the end io action for the given bio
310 * @bio: The direct io bio that's being completed
311 * @error: Error if there was one
312 *
313 * This is meant to be called by any filesystem that uses its own dio_submit_t
314 * so that the DIO specific endio actions are dealt with after the filesystem
315 * has done its completion work.
316 */
317void dio_end_io(struct bio *bio, int error)
318{
319 struct dio *dio = bio->bi_private;
320
321 if (dio->is_async)
322 dio_bio_end_aio(bio, error);
323 else
324 dio_bio_end_io(bio, error);
325}
326EXPORT_SYMBOL_GPL(dio_end_io);
327
303static int 328static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 329dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 330 sector_t first_sector, int nr_vecs)
@@ -316,6 +341,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 341 bio->bi_end_io = dio_bio_end_io;
317 342
318 dio->bio = bio; 343 dio->bio = bio;
344 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 345 return 0;
320} 346}
321 347
@@ -340,10 +366,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 366 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 367 bio_set_pages_dirty(bio);
342 368
343 submit_bio(dio->rw, bio); 369 if (dio->submit_io)
370 dio->submit_io(dio->rw, bio, dio->inode,
371 dio->logical_offset_in_bio);
372 else
373 submit_bio(dio->rw, bio);
344 374
345 dio->bio = NULL; 375 dio->bio = NULL;
346 dio->boundary = 0; 376 dio->boundary = 0;
377 dio->logical_offset_in_bio = 0;
347} 378}
348 379
349/* 380/*
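
A filesystem that sets the new submit_io hook takes over bio submission but
must still hand each bio back to the generic completion path via
dio_end_io(). A minimal sketch of such a hook, with hypothetical my_fs_*
names that are not part of this commit:

	/* Matches the new dio_submit_t signature. */
	static void my_fs_submit_io(int rw, struct bio *bio,
				    struct inode *inode, loff_t file_offset)
	{
		/*
		 * bio->bi_private still points at the struct dio set up by
		 * dio_bio_alloc(). A filesystem that overrides bi_end_io
		 * must eventually call dio_end_io(bio, error) so that
		 * dio_bio_end_aio()/dio_bio_end_io() still run.
		 */
		/* ... remap the bio for (inode, file_offset) as needed ... */
		submit_bio(rw, bio);
	}
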
@@ -603,16 +634,32 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 634 int ret = 0;
604 635
605 if (dio->bio) { 636 if (dio->bio) {
637 loff_t cur_offset = dio->cur_page_fs_offset;
638 loff_t bio_next_offset = dio->logical_offset_in_bio +
639 dio->bio->bi_size;
640
606 /* 641 /*
607 * See whether this new request is contiguous with the old 642 * See whether this new request is contiguous with the old.
643 *
 644 * Btrfs cannot handle having logically non-contiguous requests
 645 * submitted. For example, if you have
646 *
647 * Logical: [0-4095][HOLE][8192-12287]
 648 * Physical: [0-4095] [4096-8191]
649 *
650 * We cannot submit those pages together as one BIO. So if our
651 * current logical offset in the file does not equal what would
652 * be the next logical offset in the bio, submit the bio we
653 * have.
608 */ 654 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 655 if (dio->final_block_in_bio != dio->cur_page_block ||
656 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 657 dio_bio_submit(dio);
611 /* 658 /*
612 * Submit now if the underlying fs is about to perform a 659 * Submit now if the underlying fs is about to perform a
613 * metadata read 660 * metadata read
614 */ 661 */
615 if (dio->boundary) 662 else if (dio->boundary)
616 dio_bio_submit(dio); 663 dio_bio_submit(dio);
617 } 664 }
618 665
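
A worked instance of the new test for the hole example above, assuming
512-byte blocks (blkbits = 9); the concrete values are illustrative and not
part of the commit:

	/* After the [0-4095] page was added to an empty bio at offset 0: */
	loff_t logical_offset_in_bio = 0;
	unsigned int bi_size = 4096;			/* one page in the bio */
	loff_t bio_next_offset = logical_offset_in_bio + bi_size;  /* 4096 */
	loff_t cur_offset = 8192;			/* first page after the hole */
	sector_t final_block_in_bio = 4096 >> 9;	/* next physical block: 8 */
	sector_t cur_page_block = 4096 >> 9;		/* also physical block 8 */

	/* The physical test alone would merge; the logical test submits: */
	if (final_block_in_bio != cur_page_block ||
	    cur_offset != bio_next_offset)
		dio_bio_submit(dio);			/* taken: 8192 != 4096 */
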
@@ -701,6 +748,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 748 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 749 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 750 dio->cur_page_block = blocknr;
751 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 752out:
705 return ret; 753 return ret;
706} 754}
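
cur_page_fs_offset records the deferred page's file position in bytes; the
shift is the usual block-to-byte conversion. With illustrative values:

	/* blkbits = 12 (4096-byte blocks), block_in_file = 3: */
	loff_t cur_page_fs_offset = (loff_t)3 << 12;	/* 12288, byte offset of block 3 */
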
@@ -935,7 +983,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 983direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 984 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 985 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 986 dio_submit_t submit_io, struct dio *dio)
939{ 987{
940 unsigned long user_addr; 988 unsigned long user_addr;
941 unsigned long flags; 989 unsigned long flags;
@@ -952,6 +1000,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 1000
953 dio->get_block = get_block; 1001 dio->get_block = get_block;
954 dio->end_io = end_io; 1002 dio->end_io = end_io;
1003 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1004 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1005 dio->next_block_for_io = -1;
957 1006
@@ -1008,7 +1057,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1057 }
1009 } /* end iovec loop */ 1058 } /* end iovec loop */
1010 1059
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1060 if (ret == -ENOTBLK) {
1012 /* 1061 /*
1013 * The remaining part of the request will be 1062 * The remaining part of the request will be
1014 * handled by buffered I/O when we return 1063 * handled by buffered I/O when we return
@@ -1079,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1079 spin_unlock_irqrestore(&dio->bio_lock, flags); 1128 spin_unlock_irqrestore(&dio->bio_lock, flags);
1080 1129
1081 if (ret2 == 0) { 1130 if (ret2 == 0) {
1082 ret = dio_complete(dio, offset, ret); 1131 ret = dio_complete(dio, offset, ret, false);
1083 kfree(dio); 1132 kfree(dio);
1084 } else 1133 } else
1085 BUG_ON(ret != -EIOCBQUEUED); 1134 BUG_ON(ret != -EIOCBQUEUED);
@@ -1110,7 +1159,7 @@ ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1159__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1160 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1161 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1162 dio_submit_t submit_io, int flags)
1114{ 1163{
1115 int seg; 1164 int seg;
1116 size_t size; 1165 size_t size;
@@ -1197,22 +1246,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1246 (end > i_size_read(inode)));
1198 1247
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1248 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1249 nr_segs, blkbits, get_block, end_io,
1201 1250 submit_io, dio);
1202 /*
1203 * In case of error extending write may have instantiated a few
1204 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1205 *
1206 * NOTE: filesystems with their own locking have to handle this
1207 * on their own.
1208 */
1209 if (flags & DIO_LOCKING) {
1210 if (unlikely((rw & WRITE) && retval < 0)) {
1211 loff_t isize = i_size_read(inode);
1212 if (end > isize)
1213 vmtruncate(inode, isize);
1214 }
1215 }
1216 1251
1217out: 1252out:
1218 return retval; 1253 return retval;
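
Every caller of __blockdev_direct_IO() grows the extra dio_submit_t
argument; a filesystem that wants the stock submission path simply passes
NULL. A sketch of a typical call site after this change (my_get_block is a
placeholder):

	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
				   iov, offset, nr_segs, my_get_block,
				   NULL,	/* end_io */
				   NULL,	/* submit_io: plain submit_bio() */
				   DIO_LOCKING | DIO_SKIP_HOLES);
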
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 17903b491298..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -733,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
733 if (lkb->lkb_rqmode < mode) 733 if (lkb->lkb_rqmode < mode)
734 break; 734 break;
735 735
736 if (!lkb) 736 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
737 list_add_tail(new, head);
738 else
739 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
740} 737}
741 738
742/* add/remove lkb to rsb's grant/convert/wait queue */ 739/* add/remove lkb to rsb's grant/convert/wait queue */
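
The deleted !lkb branch was dead code: when list_for_each_entry() runs off
the end of the list, the cursor's embedded list_head is the list head
itself, so the remaining __list_add() degenerates into list_add_tail().
In sketch form (illustration only):

	/* After the loop terminates without hitting the break: */
	/*	&lkb->lkb_statequeue == head                       */
	__list_add(new, head->prev, head);	/* == list_add_tail(new, head) */
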
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index c0d35c620526..37a34c2c622a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id)
248 248
249 for (i = 0 ; i < CONN_HASH_SIZE; i++) { 249 for (i = 0 ; i < CONN_HASH_SIZE; i++) {
250 hlist_for_each_entry(con, h, &connection_hash[i], list) { 250 hlist_for_each_entry(con, h, &connection_hash[i], list) {
251 if (con && con->sctp_assoc == assoc_id) { 251 if (con->sctp_assoc == assoc_id) {
252 mutex_unlock(&connections_lock); 252 mutex_unlock(&connections_lock);
253 return con; 253 return con;
254 } 254 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 2c6ad518100d..ef17e0169da1 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = {
81 81
82int __init dlm_netlink_init(void) 82int __init dlm_netlink_init(void)
83{ 83{
84 int rv; 84 return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
85
86 rv = genl_register_family(&family);
87 if (rv)
88 return rv;
89
90 rv = genl_register_ops(&family, &dlm_nl_ops);
91 if (rv < 0)
92 goto err;
93 return 0;
94 err:
95 genl_unregister_family(&family);
96 return rv;
97} 85}
98 86
99void dlm_netlink_exit(void) 87void dlm_netlink_exit(void)
100{ 88{
101 genl_unregister_ops(&family, &dlm_nl_ops);
102 genl_unregister_family(&family); 89 genl_unregister_family(&family);
103} 90}
104 91
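
genl_register_family_with_ops() bundles the register-family/register-ops
pair, including the unwind on failure, and genl_unregister_family() already
drops any attached ops. The helper behaves roughly like this (a sketch, not
the exact net/netlink/genetlink.c source):

	int genl_register_family_with_ops(struct genl_family *family,
					  struct genl_ops *ops, size_t n_ops)
	{
		int err, i;

		err = genl_register_family(family);
		if (err)
			return err;
		for (i = 0; i < n_ops; i++) {
			err = genl_register_ops(family, &ops[i]);
			if (err)
				goto err_out;
		}
		return 0;
	err_out:
		genl_unregister_family(family);
		return err;
	}
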
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 8b6e73c47435..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -215,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
215 if (!ast_type) { 215 if (!ast_type) {
216 kref_get(&lkb->lkb_ref); 216 kref_get(&lkb->lkb_ref);
217 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 217 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
218 lkb->lkb_ast_first = type;
218 wake_up_interruptible(&proc->wait); 219 wake_up_interruptible(&proc->wait);
219 } 220 }
220 if (type == AST_COMP && (ast_type & AST_COMP)) 221 if (type == AST_COMP && (ast_type & AST_COMP))
@@ -223,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
223 224
224 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type); 225 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
225 if (eol) { 226 if (eol) {
226 lkb->lkb_ast_type &= ~AST_BAST;
227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; 227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
228 } 228 }
229 229
@@ -706,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
706} 706}
707 707
708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, 708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
709 int bmode, char __user *buf, size_t count) 709 int mode, char __user *buf, size_t count)
710{ 710{
711#ifdef CONFIG_COMPAT 711#ifdef CONFIG_COMPAT
712 struct dlm_lock_result32 result32; 712 struct dlm_lock_result32 result32;
@@ -733,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
733 if (type == AST_BAST) { 733 if (type == AST_BAST) {
734 result.user_astaddr = ua->bastaddr; 734 result.user_astaddr = ua->bastaddr;
735 result.user_astparam = ua->bastparam; 735 result.user_astparam = ua->bastparam;
736 result.bast_mode = bmode; 736 result.bast_mode = mode;
737 } else { 737 } else {
738 result.user_astaddr = ua->castaddr; 738 result.user_astaddr = ua->castaddr;
739 result.user_astparam = ua->castparam; 739 result.user_astparam = ua->castparam;
@@ -801,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
801 struct dlm_user_proc *proc = file->private_data; 801 struct dlm_user_proc *proc = file->private_data;
802 struct dlm_lkb *lkb; 802 struct dlm_lkb *lkb;
803 DECLARE_WAITQUEUE(wait, current); 803 DECLARE_WAITQUEUE(wait, current);
804 int error, type=0, bmode=0, removed = 0; 804 int error = 0, removed;
805 int ret_type, ret_mode;
806 int bastmode, castmode, do_bast, do_cast;
805 807
806 if (count == sizeof(struct dlm_device_version)) { 808 if (count == sizeof(struct dlm_device_version)) {
807 error = copy_version_to_user(buf, count); 809 error = copy_version_to_user(buf, count);
@@ -820,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
820#endif 822#endif
821 return -EINVAL; 823 return -EINVAL;
822 824
825 try_another:
826
823 /* do we really need this? can a read happen after a close? */ 827 /* do we really need this? can a read happen after a close? */
824 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) 828 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
825 return -EINVAL; 829 return -EINVAL;
@@ -855,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
855 859
856 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); 860 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
857 861
858 if (lkb->lkb_ast_type & AST_COMP) { 862 removed = 0;
859 lkb->lkb_ast_type &= ~AST_COMP; 863 ret_type = 0;
860 type = AST_COMP; 864 ret_mode = 0;
861 } else if (lkb->lkb_ast_type & AST_BAST) { 865 do_bast = lkb->lkb_ast_type & AST_BAST;
862 lkb->lkb_ast_type &= ~AST_BAST; 866 do_cast = lkb->lkb_ast_type & AST_COMP;
863 type = AST_BAST; 867 bastmode = lkb->lkb_bastmode;
864 bmode = lkb->lkb_bastmode; 868 castmode = lkb->lkb_castmode;
869
 870 /* when both are queued, figure out which to do first and
871 switch first so the other goes in the next read */
872
873 if (do_cast && do_bast) {
874 if (lkb->lkb_ast_first == AST_COMP) {
875 ret_type = AST_COMP;
876 ret_mode = castmode;
877 lkb->lkb_ast_type &= ~AST_COMP;
878 lkb->lkb_ast_first = AST_BAST;
879 } else {
880 ret_type = AST_BAST;
881 ret_mode = bastmode;
882 lkb->lkb_ast_type &= ~AST_BAST;
883 lkb->lkb_ast_first = AST_COMP;
884 }
885 } else {
886 ret_type = lkb->lkb_ast_first;
887 ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
888 lkb->lkb_ast_type &= ~ret_type;
889 lkb->lkb_ast_first = 0;
890 }
891
892 /* if we're doing a bast but the bast is unnecessary, then
893 switch to do nothing or do a cast if that was needed next */
894
895 if ((ret_type == AST_BAST) &&
896 dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
897 ret_type = 0;
898 ret_mode = 0;
899
900 if (do_cast) {
901 ret_type = AST_COMP;
902 ret_mode = castmode;
903 lkb->lkb_ast_type &= ~AST_COMP;
904 lkb->lkb_ast_first = 0;
905 }
906 }
907
908 if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
909 log_print("device_read %x ast_first %x ast_type %x",
910 lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
865 } 911 }
866 912
867 if (!lkb->lkb_ast_type) { 913 if (!lkb->lkb_ast_type) {
@@ -870,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
870 } 916 }
871 spin_unlock(&proc->asts_spin); 917 spin_unlock(&proc->asts_spin);
872 918
873 error = copy_result_to_user(lkb->lkb_ua, 919 if (ret_type) {
874 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), 920 error = copy_result_to_user(lkb->lkb_ua,
875 type, bmode, buf, count); 921 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
922 ret_type, ret_mode, buf, count);
923
924 if (ret_type == AST_COMP)
925 lkb->lkb_castmode_done = castmode;
926 if (ret_type == AST_BAST)
927 lkb->lkb_bastmode_done = bastmode;
928 }
876 929
877 /* removes reference for the proc->asts lists added by 930 /* removes reference for the proc->asts lists added by
878 dlm_user_add_ast() and may result in the lkb being freed */ 931 dlm_user_add_ast() and may result in the lkb being freed */
932
879 if (removed) 933 if (removed)
880 dlm_put_lkb(lkb); 934 dlm_put_lkb(lkb);
881 935
936 /* the bast that was queued was eliminated (see unnecessary above),
937 leaving nothing to return */
938
939 if (!ret_type)
940 goto try_another;
941
882 return error; 942 return error;
883} 943}
884 944
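
The rework keys off the new lkb_ast_first field: dlm_user_add_ast() records
which AST type was queued first, and device_read() now delivers one AST per
read in that order. A hypothetical sequence, assuming a bast is queued
before a cast:

	/* Queue order: AST_BAST first, then AST_COMP.
	 *
	 * read 1: do_cast && do_bast, lkb_ast_first == AST_BAST
	 *         -> returns the bast, lkb_ast_first becomes AST_COMP
	 * read 2: only AST_COMP left
	 *         -> returns the cast, lkb_ast_first cleared to 0
	 *
	 * If dlm_modes_compat(bastmode, lkb_castmode_done) makes the bast
	 * unnecessary, read 1 instead delivers the pending cast directly,
	 * or loops via "goto try_another" when nothing is left to return.
	 */
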
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..2195c213ab2f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,13 +12,13 @@
12/* A global variable is a bit ugly, but it keeps the code simple */ 12/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 13int sysctl_drop_caches;
14 14
15static void drop_pagecache_sb(struct super_block *sb) 15static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 16{
17 struct inode *inode, *toput_inode = NULL; 17 struct inode *inode, *toput_inode = NULL;
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
33 iput(toput_inode); 33 iput(toput_inode);
34} 34}
35 35
36static void drop_pagecache(void)
37{
38 struct super_block *sb;
39
40 spin_lock(&sb_lock);
41restart:
42 list_for_each_entry(sb, &super_blocks, s_list) {
43 sb->s_count++;
44 spin_unlock(&sb_lock);
45 down_read(&sb->s_umount);
46 if (sb->s_root)
47 drop_pagecache_sb(sb);
48 up_read(&sb->s_umount);
49 spin_lock(&sb_lock);
50 if (__put_super_and_need_restart(sb))
51 goto restart;
52 }
53 spin_unlock(&sb_lock);
54}
55
56static void drop_slab(void) 36static void drop_slab(void)
57{ 37{
58 int nr_objects; 38 int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
68 proc_dointvec_minmax(table, write, buffer, length, ppos); 48 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 49 if (write) {
70 if (sysctl_drop_caches & 1) 50 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 51 iterate_supers(drop_pagecache_sb, NULL);
72 if (sysctl_drop_caches & 2) 52 if (sysctl_drop_caches & 2)
73 drop_slab(); 53 drop_slab();
74 } 54 }
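
iterate_supers() (fs/super.c) factors out exactly the walk that
drop_pagecache() open-coded: pin each superblock under sb_lock, take
s_umount, and invoke the callback only when sb->s_root is set. Its
signature at this point in the tree:

	void iterate_supers(void (*f)(struct super_block *, void *), void *arg);
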
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 1cc087635a5e..cbadc1bee6e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -762,7 +762,7 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
762 762
763/** 763/**
764 * ecryptfs_init_crypt_ctx 764 * ecryptfs_init_crypt_ctx
765 * @crypt_stat: Uninitilized crypt stats structure 765 * @crypt_stat: Uninitialized crypt stats structure
766 * 766 *
767 * Initialize the crypto context. 767 * Initialize the crypto context.
768 * 768 *
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
1793static struct list_head key_tfm_list; 1793static struct list_head key_tfm_list;
1794struct mutex key_tfm_list_mutex; 1794struct mutex key_tfm_list_mutex;
1795 1795
1796int ecryptfs_init_crypto(void) 1796int __init ecryptfs_init_crypto(void)
1797{ 1797{
1798 mutex_init(&key_tfm_list_mutex); 1798 mutex_init(&key_tfm_list_mutex);
1799 INIT_LIST_HEAD(&key_tfm_list); 1799 INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2170 + encoded_name_no_prefix_size); 2170 + encoded_name_no_prefix_size);
2171 (*encoded_name)[(*encoded_name_size)] = '\0'; 2171 (*encoded_name)[(*encoded_name_size)] = '\0';
2172 (*encoded_name_size)++;
2173 } else { 2172 } else {
2174 rc = -EOPNOTSUPP; 2173 rc = -EOPNOTSUPP;
2175 } 2174 }
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bfc2e0f78f00..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, 731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
732 struct page *page_for_lower, 732 struct page *page_for_lower,
733 size_t offset_in_page, size_t size); 733 size_t offset_in_page, size_t size);
734int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 734int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
735 size_t size);
736int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 735int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
737 struct inode *ecryptfs_inode); 736 struct inode *ecryptfs_inode);
738int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, 737int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
739 pgoff_t page_index, 738 pgoff_t page_index,
740 size_t offset_in_page, size_t size, 739 size_t offset_in_page, size_t size,
741 struct inode *ecryptfs_inode); 740 struct inode *ecryptfs_inode);
742struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); 741struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
743int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); 742int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
744int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, 743int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
745 struct user_namespace *user_ns); 744 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e7440a6f5ebf..622c95140802 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -199,7 +199,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
199 "the persistent file for the dentry with name " 199 "the persistent file for the dentry with name "
200 "[%s]; rc = [%d]\n", __func__, 200 "[%s]; rc = [%d]\n", __func__,
201 ecryptfs_dentry->d_name.name, rc); 201 ecryptfs_dentry->d_name.name, rc);
202 goto out; 202 goto out_free;
203 } 203 }
204 } 204 }
205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) 205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
@@ -207,7 +207,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
207 rc = -EPERM; 207 rc = -EPERM;
208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " 208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
209 "file must hence be opened RO\n", __func__); 209 "file must hence be opened RO\n", __func__);
210 goto out; 210 goto out_free;
211 } 211 }
212 ecryptfs_set_file_lower( 212 ecryptfs_set_file_lower(
213 file, ecryptfs_inode_to_private(inode)->lower_file); 213 file, ecryptfs_inode_to_private(inode)->lower_file);
@@ -274,11 +274,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
274} 274}
275 275
276static int 276static int
277ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, int datasync)
278{ 278{
279 return vfs_fsync(ecryptfs_file_to_lower(file), 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
280 ecryptfs_dentry_to_lower(dentry),
281 datasync);
282} 280}
283 281
284static int ecryptfs_fasync(int fd, struct file *file, int flag) 282static int ecryptfs_fasync(int fd, struct file *file, int flag)
@@ -294,12 +292,40 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
294 return rc; 292 return rc;
295} 293}
296 294
297static int ecryptfs_ioctl(struct inode *inode, struct file *file, 295static long
298 unsigned int cmd, unsigned long arg); 296ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
297{
298 struct file *lower_file = NULL;
299 long rc = -ENOTTY;
300
301 if (ecryptfs_file_to_private(file))
302 lower_file = ecryptfs_file_to_lower(file);
303 if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl)
304 rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
305 return rc;
306}
307
308#ifdef CONFIG_COMPAT
309static long
310ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
311{
312 struct file *lower_file = NULL;
313 long rc = -ENOIOCTLCMD;
314
315 if (ecryptfs_file_to_private(file))
316 lower_file = ecryptfs_file_to_lower(file);
317 if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl)
318 rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
319 return rc;
320}
321#endif
299 322
300const struct file_operations ecryptfs_dir_fops = { 323const struct file_operations ecryptfs_dir_fops = {
301 .readdir = ecryptfs_readdir, 324 .readdir = ecryptfs_readdir,
302 .ioctl = ecryptfs_ioctl, 325 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
326#ifdef CONFIG_COMPAT
327 .compat_ioctl = ecryptfs_compat_ioctl,
328#endif
303 .open = ecryptfs_open, 329 .open = ecryptfs_open,
304 .flush = ecryptfs_flush, 330 .flush = ecryptfs_flush,
305 .release = ecryptfs_release, 331 .release = ecryptfs_release,
@@ -315,7 +341,10 @@ const struct file_operations ecryptfs_main_fops = {
315 .write = do_sync_write, 341 .write = do_sync_write,
316 .aio_write = generic_file_aio_write, 342 .aio_write = generic_file_aio_write,
317 .readdir = ecryptfs_readdir, 343 .readdir = ecryptfs_readdir,
318 .ioctl = ecryptfs_ioctl, 344 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
345#ifdef CONFIG_COMPAT
346 .compat_ioctl = ecryptfs_compat_ioctl,
347#endif
319 .mmap = generic_file_mmap, 348 .mmap = generic_file_mmap,
320 .open = ecryptfs_open, 349 .open = ecryptfs_open,
321 .flush = ecryptfs_flush, 350 .flush = ecryptfs_flush,
@@ -324,20 +353,3 @@ const struct file_operations ecryptfs_main_fops = {
324 .fasync = ecryptfs_fasync, 353 .fasync = ecryptfs_fasync,
325 .splice_read = generic_file_splice_read, 354 .splice_read = generic_file_splice_read,
326}; 355};
327
328static int
329ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
330 unsigned long arg)
331{
332 int rc = 0;
333 struct file *lower_file = NULL;
334
335 if (ecryptfs_file_to_private(file))
336 lower_file = ecryptfs_file_to_lower(file);
337 if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
338 rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
339 lower_file, cmd, arg);
340 else
341 rc = -ENOTTY;
342 return rc;
343}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2d4418affac..3fbc94203380 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -142,19 +142,10 @@ out:
142static int grow_file(struct dentry *ecryptfs_dentry) 142static int grow_file(struct dentry *ecryptfs_dentry)
143{ 143{
144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
145 struct file fake_file;
146 struct ecryptfs_file_info tmp_file_info;
147 char zero_virt[] = { 0x00 }; 145 char zero_virt[] = { 0x00 };
148 int rc = 0; 146 int rc = 0;
149 147
150 memset(&fake_file, 0, sizeof(fake_file)); 148 rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
151 fake_file.f_path.dentry = ecryptfs_dentry;
152 memset(&tmp_file_info, 0, sizeof(tmp_file_info));
153 ecryptfs_set_file_private(&fake_file, &tmp_file_info);
154 ecryptfs_set_file_lower(
155 &fake_file,
156 ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
157 rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
158 i_size_write(ecryptfs_inode, 0); 149 i_size_write(ecryptfs_inode, 0);
159 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 150 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
160 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= 151 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -273,7 +264,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
273 printk(KERN_ERR "%s: Out of memory whilst attempting " 264 printk(KERN_ERR "%s: Out of memory whilst attempting "
274 "to allocate ecryptfs_dentry_info struct\n", 265 "to allocate ecryptfs_dentry_info struct\n",
275 __func__); 266 __func__);
276 goto out_dput; 267 goto out_put;
277 } 268 }
278 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); 269 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
279 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); 270 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
@@ -348,14 +339,84 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
348out_free_kmem: 339out_free_kmem:
349 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 340 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
350 goto out; 341 goto out;
351out_dput: 342out_put:
352 dput(lower_dentry); 343 dput(lower_dentry);
344 mntput(lower_mnt);
353 d_drop(ecryptfs_dentry); 345 d_drop(ecryptfs_dentry);
354out: 346out:
355 return rc; 347 return rc;
356} 348}
357 349
358/** 350/**
351 * ecryptfs_new_lower_dentry
352 * @name: The name of the new dentry.
353 * @lower_dir_dentry: Parent directory of the new dentry.
354 * @nd: nameidata from last lookup.
355 *
 356 * Create a new dentry or get it from the lower parent dir.
357 */
358static struct dentry *
359ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
360 struct nameidata *nd)
361{
362 struct dentry *new_dentry;
363 struct dentry *tmp;
364 struct inode *lower_dir_inode;
365
366 lower_dir_inode = lower_dir_dentry->d_inode;
367
368 tmp = d_alloc(lower_dir_dentry, name);
369 if (!tmp)
370 return ERR_PTR(-ENOMEM);
371
372 mutex_lock(&lower_dir_inode->i_mutex);
373 new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
374 mutex_unlock(&lower_dir_inode->i_mutex);
375
376 if (!new_dentry)
377 new_dentry = tmp;
378 else
379 dput(tmp);
380
381 return new_dentry;
382}
383
384
385/**
386 * ecryptfs_lookup_one_lower
387 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
388 * @lower_dir_dentry: lower parent directory
389 * @name: lower file name
390 *
 391 * Get the lower dentry from the VFS. If the lower dentry does not exist yet,
392 * create it.
393 */
394static struct dentry *
395ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
396 struct dentry *lower_dir_dentry, struct qstr *name)
397{
398 struct nameidata nd;
399 struct vfsmount *lower_mnt;
400 int err;
401
402 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
403 ecryptfs_dentry->d_parent));
 404 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name, 0, &nd);
405 mntput(lower_mnt);
406
407 if (!err) {
 408 /* we don't need the mount */
409 mntput(nd.path.mnt);
410 return nd.path.dentry;
411 }
412 if (err != -ENOENT)
413 return ERR_PTR(err);
414
415 /* create a new lower dentry */
416 return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
417}
418
419/**
359 * ecryptfs_lookup 420 * ecryptfs_lookup
360 * @ecryptfs_dir_inode: The eCryptfs directory inode 421 * @ecryptfs_dir_inode: The eCryptfs directory inode
361 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 422 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -372,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
372 size_t encrypted_and_encoded_name_size; 433 size_t encrypted_and_encoded_name_size;
373 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 434 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
374 struct dentry *lower_dir_dentry, *lower_dentry; 435 struct dentry *lower_dir_dentry, *lower_dentry;
436 struct qstr lower_name;
375 int rc = 0; 437 int rc = 0;
376 438
377 ecryptfs_dentry->d_op = &ecryptfs_dops; 439 ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -382,14 +444,20 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
382 goto out_d_drop; 444 goto out_d_drop;
383 } 445 }
384 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
385 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 447 lower_name.name = ecryptfs_dentry->d_name.name;
386 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, 448 lower_name.len = ecryptfs_dentry->d_name.len;
387 lower_dir_dentry, 449 lower_name.hash = ecryptfs_dentry->d_name.hash;
388 ecryptfs_dentry->d_name.len); 450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
389 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
452 &lower_name);
453 if (rc < 0)
454 goto out_d_drop;
455 }
456 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
457 lower_dir_dentry, &lower_name);
390 if (IS_ERR(lower_dentry)) { 458 if (IS_ERR(lower_dentry)) {
391 rc = PTR_ERR(lower_dentry); 459 rc = PTR_ERR(lower_dentry);
392 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 460 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
393 "[%d] on lower_dentry = [%s]\n", __func__, rc, 461 "[%d] on lower_dentry = [%s]\n", __func__, rc,
394 encrypted_and_encoded_name); 462 encrypted_and_encoded_name);
395 goto out_d_drop; 463 goto out_d_drop;
@@ -411,14 +479,20 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
411 "filename; rc = [%d]\n", __func__, rc); 479 "filename; rc = [%d]\n", __func__, rc);
412 goto out_d_drop; 480 goto out_d_drop;
413 } 481 }
414 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 482 lower_name.name = encrypted_and_encoded_name;
415 lower_dentry = lookup_one_len(encrypted_and_encoded_name, 483 lower_name.len = encrypted_and_encoded_name_size;
416 lower_dir_dentry, 484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
417 encrypted_and_encoded_name_size - 1); 485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
418 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
487 &lower_name);
488 if (rc < 0)
489 goto out_d_drop;
490 }
491 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
492 lower_dir_dentry, &lower_name);
419 if (IS_ERR(lower_dentry)) { 493 if (IS_ERR(lower_dentry)) {
420 rc = PTR_ERR(lower_dentry); 494 rc = PTR_ERR(lower_dentry);
421 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 495 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
422 "[%d] on lower_dentry = [%s]\n", __func__, rc, 496 "[%d] on lower_dentry = [%s]\n", __func__, rc,
423 encrypted_and_encoded_name); 497 encrypted_and_encoded_name);
424 goto out_d_drop; 498 goto out_d_drop;
@@ -784,8 +858,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
784{ 858{
785 int rc = 0; 859 int rc = 0;
786 struct inode *inode = dentry->d_inode; 860 struct inode *inode = dentry->d_inode;
787 struct dentry *lower_dentry;
788 struct file fake_ecryptfs_file;
789 struct ecryptfs_crypt_stat *crypt_stat; 861 struct ecryptfs_crypt_stat *crypt_stat;
790 loff_t i_size = i_size_read(inode); 862 loff_t i_size = i_size_read(inode);
791 loff_t lower_size_before_truncate; 863 loff_t lower_size_before_truncate;
@@ -796,23 +868,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
796 goto out; 868 goto out;
797 } 869 }
798 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 870 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
799 /* Set up a fake ecryptfs file, this is used to interface with
800 * the file in the underlying filesystem so that the
801 * truncation has an effect there as well. */
802 memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
803 fake_ecryptfs_file.f_path.dentry = dentry;
804 /* Released at out_free: label */
805 ecryptfs_set_file_private(&fake_ecryptfs_file,
806 kmem_cache_alloc(ecryptfs_file_info_cache,
807 GFP_KERNEL));
808 if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
809 rc = -ENOMEM;
810 goto out;
811 }
812 lower_dentry = ecryptfs_dentry_to_lower(dentry);
813 ecryptfs_set_file_lower(
814 &fake_ecryptfs_file,
815 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
816 /* Switch on growing or shrinking file */ 871 /* Switch on growing or shrinking file */
817 if (ia->ia_size > i_size) { 872 if (ia->ia_size > i_size) {
818 char zero[] = { 0x00 }; 873 char zero[] = { 0x00 };
@@ -822,7 +877,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
822 * this triggers code that will fill in 0's throughout 877 * this triggers code that will fill in 0's throughout
823 * the intermediate portion of the previous end of the 878 * the intermediate portion of the previous end of the
824 * file and the new end of the file */ 879 * file and the new end of the file */
825 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 880 rc = ecryptfs_write(inode, zero,
826 (ia->ia_size - 1), 1); 881 (ia->ia_size - 1), 1);
827 } else { /* ia->ia_size < i_size_read(inode) */ 882 } else { /* ia->ia_size < i_size_read(inode) */
828 /* We're chopping off all the pages down to the page 883 /* We're chopping off all the pages down to the page
@@ -832,13 +887,23 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
832 size_t num_zeros = (PAGE_CACHE_SIZE 887 size_t num_zeros = (PAGE_CACHE_SIZE
833 - (ia->ia_size & ~PAGE_CACHE_MASK)); 888 - (ia->ia_size & ~PAGE_CACHE_MASK));
834 889
890
891 /*
 892 * XXX(truncate) this should really happen at the beginning
 893 * of ->setattr. But the code is too messy to do that as part
894 * of a larger patch. ecryptfs is also totally missing out
895 * on the inode_change_ok check at the beginning of
 896 * ->setattr which would include this.
897 */
898 rc = inode_newsize_ok(inode, ia->ia_size);
899 if (rc)
900 goto out;
901
835 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 902 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
836 rc = vmtruncate(inode, ia->ia_size); 903 truncate_setsize(inode, ia->ia_size);
837 if (rc)
838 goto out_free;
839 lower_ia->ia_size = ia->ia_size; 904 lower_ia->ia_size = ia->ia_size;
840 lower_ia->ia_valid |= ATTR_SIZE; 905 lower_ia->ia_valid |= ATTR_SIZE;
841 goto out_free; 906 goto out;
842 } 907 }
843 if (num_zeros) { 908 if (num_zeros) {
844 char *zeros_virt; 909 char *zeros_virt;
@@ -846,25 +911,25 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
846 zeros_virt = kzalloc(num_zeros, GFP_KERNEL); 911 zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
847 if (!zeros_virt) { 912 if (!zeros_virt) {
848 rc = -ENOMEM; 913 rc = -ENOMEM;
849 goto out_free; 914 goto out;
850 } 915 }
851 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 916 rc = ecryptfs_write(inode, zeros_virt,
852 ia->ia_size, num_zeros); 917 ia->ia_size, num_zeros);
853 kfree(zeros_virt); 918 kfree(zeros_virt);
854 if (rc) { 919 if (rc) {
855 printk(KERN_ERR "Error attempting to zero out " 920 printk(KERN_ERR "Error attempting to zero out "
856 "the remainder of the end page on " 921 "the remainder of the end page on "
857 "reducing truncate; rc = [%d]\n", rc); 922 "reducing truncate; rc = [%d]\n", rc);
858 goto out_free; 923 goto out;
859 } 924 }
860 } 925 }
861 vmtruncate(inode, ia->ia_size); 926 truncate_setsize(inode, ia->ia_size);
862 rc = ecryptfs_write_inode_size_to_metadata(inode); 927 rc = ecryptfs_write_inode_size_to_metadata(inode);
863 if (rc) { 928 if (rc) {
864 printk(KERN_ERR "Problem with " 929 printk(KERN_ERR "Problem with "
865 "ecryptfs_write_inode_size_to_metadata; " 930 "ecryptfs_write_inode_size_to_metadata; "
866 "rc = [%d]\n", rc); 931 "rc = [%d]\n", rc);
867 goto out_free; 932 goto out;
868 } 933 }
869 /* We are reducing the size of the ecryptfs file, and need to 934 /* We are reducing the size of the ecryptfs file, and need to
870 * know if we need to reduce the size of the lower file. */ 935 * know if we need to reduce the size of the lower file. */
@@ -878,10 +943,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
878 } else 943 } else
879 lower_ia->ia_valid &= ~ATTR_SIZE; 944 lower_ia->ia_valid &= ~ATTR_SIZE;
880 } 945 }
881out_free:
882 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
883 kmem_cache_free(ecryptfs_file_info_cache,
884 ecryptfs_file_to_private(&fake_ecryptfs_file));
885out: 946out:
886 return rc; 947 return rc;
887} 948}
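
vmtruncate() is being phased out: the new sequence validates the size
change up front with inode_newsize_ok() and then applies it with
truncate_setsize(), which cannot fail. The shape ->setattr implementations
are converging toward looks roughly like this (a sketch, not ecryptfs's
final code; my_fs_setattr is a placeholder):

	static int my_fs_setattr(struct dentry *dentry, struct iattr *ia)
	{
		struct inode *inode = dentry->d_inode;
		int rc;

		rc = inode_change_ok(inode, ia);   /* includes inode_newsize_ok() */
		if (rc)
			return rc;

		if (ia->ia_valid & ATTR_SIZE)
			truncate_setsize(inode, ia->ia_size);   /* void, cannot fail */

		/* ... apply the remaining attributes, mark the inode dirty ... */
		return 0;
	}
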
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506ef..73811cfa2ea4 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
515 if (!s) { 515 if (!s) {
516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
518 rc = -ENOMEM;
518 goto out; 519 goto out;
519 } 520 }
520 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 521 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
806 if (!s) { 807 if (!s) {
807 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 808 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
808 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 809 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
810 rc = -ENOMEM;
809 goto out; 811 goto out;
810 } 812 }
811 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 813 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aafa..0851ab6980f5 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
86 return 0; 86 return 0;
87} 87}
88 88
89int ecryptfs_init_kthread(void) 89int __init ecryptfs_init_kthread(void)
90{ 90{
91 int rc = 0; 91 int rc = 0;
92 92
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 760983d0f25e..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
281 * 281 *
282 * Returns zero on success; non-zero on error 282 * Returns zero on success; non-zero on error
283 */ 283 */
284static int ecryptfs_parse_options(struct super_block *sb, char *options) 284static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
285{ 285{
286 char *p; 286 char *p;
287 int rc = 0; 287 int rc = 0;
@@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
293 int fn_cipher_key_bytes; 293 int fn_cipher_key_bytes;
294 int fn_cipher_key_bytes_set = 0; 294 int fn_cipher_key_bytes_set = 0;
295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
296 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 296 &sbi->mount_crypt_stat;
297 substring_t args[MAX_OPT_ARGS]; 297 substring_t args[MAX_OPT_ARGS];
298 int token; 298 int token;
299 char *sig_src; 299 char *sig_src;
@@ -483,68 +483,7 @@ out:
483} 483}
484 484
485struct kmem_cache *ecryptfs_sb_info_cache; 485struct kmem_cache *ecryptfs_sb_info_cache;
486 486static struct file_system_type ecryptfs_fs_type;
487/**
488 * ecryptfs_fill_super
489 * @sb: The ecryptfs super block
490 * @raw_data: The options passed to mount
491 * @silent: Not used but required by function prototype
492 *
493 * Sets up what we can of the sb, rest is done in ecryptfs_read_super
494 *
495 * Returns zero on success; non-zero otherwise
496 */
497static int
498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
499{
500 struct ecryptfs_sb_info *esi;
501 int rc = 0;
502
503 /* Released in ecryptfs_put_super() */
504 ecryptfs_set_superblock_private(sb,
505 kmem_cache_zalloc(ecryptfs_sb_info_cache,
506 GFP_KERNEL));
507 esi = ecryptfs_superblock_to_private(sb);
508 if (!esi) {
509 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
510 rc = -ENOMEM;
511 goto out;
512 }
513
514 rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
515 if (rc)
516 goto out;
517
518 sb->s_bdi = &esi->bdi;
519 sb->s_op = &ecryptfs_sops;
520 /* Released through deactivate_super(sb) from get_sb_nodev */
521 sb->s_root = d_alloc(NULL, &(const struct qstr) {
522 .hash = 0,.name = "/",.len = 1});
523 if (!sb->s_root) {
524 ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
525 rc = -ENOMEM;
526 goto out;
527 }
528 sb->s_root->d_op = &ecryptfs_dops;
529 sb->s_root->d_sb = sb;
530 sb->s_root->d_parent = sb->s_root;
531 /* Released in d_release when dput(sb->s_root) is called */
532 /* through deactivate_super(sb) from get_sb_nodev() */
533 ecryptfs_set_dentry_private(sb->s_root,
534 kmem_cache_zalloc(ecryptfs_dentry_info_cache,
535 GFP_KERNEL));
536 if (!ecryptfs_dentry_to_private(sb->s_root)) {
537 ecryptfs_printk(KERN_ERR,
538 "dentry_info_cache alloc failed\n");
539 rc = -ENOMEM;
540 goto out;
541 }
542 rc = 0;
543out:
544 /* Should be able to rely on deactivate_super called from
545 * get_sb_nodev */
546 return rc;
547}
548 487
549/** 488/**
550 * ecryptfs_read_super 489 * ecryptfs_read_super
@@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
565 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); 504 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
566 goto out; 505 goto out;
567 } 506 }
507 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
508 rc = -EINVAL;
509 printk(KERN_ERR "Mount on filesystem of type "
510 "eCryptfs explicitly disallowed due to "
511 "known incompatibilities\n");
512 goto out_free;
513 }
568 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); 514 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
569 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; 515 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
570 sb->s_blocksize = path.dentry->d_sb->s_blocksize; 516 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -588,11 +534,8 @@ out:
588 * @dev_name: The path to mount over 534 * @dev_name: The path to mount over
589 * @raw_data: The options passed into the kernel 535 * @raw_data: The options passed into the kernel
590 * 536 *
591 * The whole ecryptfs_get_sb process is broken into 4 functions: 537 * The whole ecryptfs_get_sb process is broken into 3 functions:
592 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any 538 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
593 * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
594 * with as much information as it can before needing
595 * the lower filesystem.
596 * ecryptfs_read_super(): this accesses the lower filesystem and uses 539 * ecryptfs_read_super(): this accesses the lower filesystem and uses
597 * ecryptfs_interpose to perform most of the linking 540 * ecryptfs_interpose to perform most of the linking
598 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) 541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
601 const char *dev_name, void *raw_data, 544 const char *dev_name, void *raw_data,
602 struct vfsmount *mnt) 545 struct vfsmount *mnt)
603{ 546{
547 struct super_block *s;
548 struct ecryptfs_sb_info *sbi;
549 struct ecryptfs_dentry_info *root_info;
550 const char *err = "Getting sb failed";
604 int rc; 551 int rc;
605 struct super_block *sb;
606 552
607 rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); 553 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
608 if (rc < 0) { 554 if (!sbi) {
609 printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); 555 rc = -ENOMEM;
610 goto out; 556 goto out;
611 } 557 }
612 sb = mnt->mnt_sb; 558
613 rc = ecryptfs_parse_options(sb, raw_data); 559 rc = ecryptfs_parse_options(sbi, raw_data);
614 if (rc) { 560 if (rc) {
615 printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); 561 err = "Error parsing options";
616 goto out_abort; 562 goto out;
563 }
564
565 s = sget(fs_type, NULL, set_anon_super, NULL);
566 if (IS_ERR(s)) {
567 rc = PTR_ERR(s);
568 goto out;
617 } 569 }
618 rc = ecryptfs_read_super(sb, dev_name); 570
571 s->s_flags = flags;
572 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
619 if (rc) { 573 if (rc) {
620 printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); 574 deactivate_locked_super(s);
621 goto out_abort; 575 goto out;
622 } 576 }
623 goto out; 577
624out_abort: 578 ecryptfs_set_superblock_private(s, sbi);
625 dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ 579 s->s_bdi = &sbi->bdi;
626 deactivate_locked_super(sb); 580
581 /* ->kill_sb() will take care of sbi after that point */
582 sbi = NULL;
583 s->s_op = &ecryptfs_sops;
584
585 rc = -ENOMEM;
586 s->s_root = d_alloc(NULL, &(const struct qstr) {
587 .hash = 0,.name = "/",.len = 1});
588 if (!s->s_root) {
589 deactivate_locked_super(s);
590 goto out;
591 }
592 s->s_root->d_op = &ecryptfs_dops;
593 s->s_root->d_sb = s;
594 s->s_root->d_parent = s->s_root;
595
596 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
597 if (!root_info) {
598 deactivate_locked_super(s);
599 goto out;
600 }
601 /* ->kill_sb() will take care of root_info */
602 ecryptfs_set_dentry_private(s->s_root, root_info);
603 s->s_flags |= MS_ACTIVE;
604 rc = ecryptfs_read_super(s, dev_name);
605 if (rc) {
606 deactivate_locked_super(s);
607 err = "Reading sb failed";
608 goto out;
609 }
610 simple_set_mnt(mnt, s);
611 return 0;
612
627out: 613out:
614 if (sbi) {
615 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
628 return rc; 619 return rc;
629} 620}
630 621
@@ -633,11 +624,16 @@ out:
633 * @sb: The ecryptfs super block 624 * @sb: The ecryptfs super block
634 * 625 *
635 * Used to bring the superblock down and free the private data. 626 * Used to bring the superblock down and free the private data.
636 * Private data is free'd in ecryptfs_put_super()
637 */ 627 */
638static void ecryptfs_kill_block_super(struct super_block *sb) 628static void ecryptfs_kill_block_super(struct super_block *sb)
639{ 629{
640 generic_shutdown_super(sb); 630 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
631 kill_anon_super(sb);
632 if (!sb_info)
633 return;
634 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
635 bdi_destroy(&sb_info->bdi);
636 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
641} 637}
642 638
643static struct file_system_type ecryptfs_fs_type = { 639static struct file_system_type ecryptfs_fs_type = {
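
The error-handling convention after sget() is worth spelling out: once
ecryptfs_set_superblock_private(s, sbi) has run, the superblock owns sbi,
every later failure goes through deactivate_locked_super(s), and that lands
in the rewritten ecryptfs_kill_block_super() above, which frees everything
exactly once. In sketch form (illustration only):

	/* Ownership handoff in ecryptfs_get_sb():
	 *
	 *   sbi = kmem_cache_zalloc(...);            caller owns sbi
	 *   ecryptfs_set_superblock_private(s, sbi);
	 *   sbi = NULL;                              ->kill_sb() owns it now
	 *   ...
	 *   deactivate_locked_super(s);              on any later error
	 *     -> ecryptfs_kill_block_super(s)        frees crypt_stat, bdi, sbi
	 *
	 * The "if (sbi)" in the out: label therefore only fires for failures
	 * that happen before the handoff.
	 */
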
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 2d8dbce9d485..ab2248090515 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
31 31
32static struct hlist_head *ecryptfs_daemon_hash; 32static struct hlist_head *ecryptfs_daemon_hash;
33struct mutex ecryptfs_daemon_hash_mux; 33struct mutex ecryptfs_daemon_hash_mux;
34static int ecryptfs_hash_buckets; 34static int ecryptfs_hash_bits;
35#define ecryptfs_uid_hash(uid) \ 35#define ecryptfs_uid_hash(uid) \
36 hash_long((unsigned long)uid, ecryptfs_hash_buckets) 36 hash_long((unsigned long)uid, ecryptfs_hash_bits)
37 37
38static u32 ecryptfs_msg_counter; 38static u32 ecryptfs_msg_counter;
39static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; 39static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -274,7 +274,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
274 struct user_namespace *user_ns, struct pid *pid, 274 struct user_namespace *user_ns, struct pid *pid,
275 u32 seq) 275 u32 seq)
276{ 276{
277 struct ecryptfs_daemon *daemon; 277 struct ecryptfs_daemon *uninitialized_var(daemon);
278 struct ecryptfs_msg_ctx *msg_ctx; 278 struct ecryptfs_msg_ctx *msg_ctx;
279 size_t msg_size; 279 size_t msg_size;
280 struct nsproxy *nsproxy; 280 struct nsproxy *nsproxy;
@@ -473,7 +473,7 @@ sleep:
473 return rc; 473 return rc;
474} 474}
475 475
476int ecryptfs_init_messaging(void) 476int __init ecryptfs_init_messaging(void)
477{ 477{
478 int i; 478 int i;
479 int rc = 0; 479 int rc = 0;
@@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void)
486 } 486 }
487 mutex_init(&ecryptfs_daemon_hash_mux); 487 mutex_init(&ecryptfs_daemon_hash_mux);
488 mutex_lock(&ecryptfs_daemon_hash_mux); 488 mutex_lock(&ecryptfs_daemon_hash_mux);
489 ecryptfs_hash_buckets = 1; 489 ecryptfs_hash_bits = 1;
490 while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) 490 while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
491 ecryptfs_hash_buckets++; 491 ecryptfs_hash_bits++;
492 ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) 492 ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
493 * ecryptfs_hash_buckets), GFP_KERNEL); 493 * (1 << ecryptfs_hash_bits)),
494 GFP_KERNEL);
494 if (!ecryptfs_daemon_hash) { 495 if (!ecryptfs_daemon_hash) {
495 rc = -ENOMEM; 496 rc = -ENOMEM;
496 printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); 497 printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
497 mutex_unlock(&ecryptfs_daemon_hash_mux); 498 mutex_unlock(&ecryptfs_daemon_hash_mux);
498 goto out; 499 goto out;
499 } 500 }
500 for (i = 0; i < ecryptfs_hash_buckets; i++) 501 for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
501 INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); 502 INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
502 mutex_unlock(&ecryptfs_daemon_hash_mux); 503 mutex_unlock(&ecryptfs_daemon_hash_mux);
503 ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) 504 ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
@@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void)
554 int i; 555 int i;
555 556
556 mutex_lock(&ecryptfs_daemon_hash_mux); 557 mutex_lock(&ecryptfs_daemon_hash_mux);
557 for (i = 0; i < ecryptfs_hash_buckets; i++) { 558 for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
558 int rc; 559 int rc;
559 560
560 hlist_for_each_entry(daemon, elem, 561 hlist_for_each_entry(daemon, elem,
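
The rename from ecryptfs_hash_buckets to ecryptfs_hash_bits is not
cosmetic: hash_long(x, bits) returns a value in [0, 1 << bits), so the
table needs 1 << bits buckets, while the old code sized the array with the
bit count itself. With illustrative numbers:

	/* ecryptfs_number_of_users = 4:
	 *
	 *   bits = 1; while (4 >> bits) bits++;         ->  bits == 3
	 *   hash_long(uid, 3) returns 0..7              ->  needs 8 buckets
	 *
	 * old: kmalloc(sizeof(struct hlist_head) * 3)         -> under-allocated
	 * new: kmalloc(sizeof(struct hlist_head) * (1 << 3))  -> 8 buckets
	 */
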
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd4..00208c3d7e92 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = {
500 * 500 *
501 * Returns zero on success; non-zero otherwise 501 * Returns zero on success; non-zero otherwise
502 */ 502 */
503int ecryptfs_init_ecryptfs_miscdev(void) 503int __init ecryptfs_init_ecryptfs_miscdev(void)
504{ 504{
505 int rc; 505 int rc;
506 506
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2ee9a3a7b68c..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -44,17 +44,9 @@
44 * Returns locked and up-to-date page (if ok), with increased 44 * Returns locked and up-to-date page (if ok), with increased
45 * refcnt. 45 * refcnt.
46 */ 46 */
47struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) 47struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
48{ 48{
49 struct dentry *dentry; 49 struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
50 struct inode *inode;
51 struct address_space *mapping;
52 struct page *page;
53
54 dentry = file->f_path.dentry;
55 inode = dentry->d_inode;
56 mapping = inode->i_mapping;
57 page = read_mapping_page(mapping, index, (void *)file);
58 if (!IS_ERR(page)) 50 if (!IS_ERR(page))
59 lock_page(page); 51 lock_page(page);
60 return page; 52 return page;
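
This hunk and the ecryptfs_readpage() change below are linked:
read_mapping_page(..., NULL) ends up invoking ->readpage() with a NULL file
pointer, so the read path must pull crypt_stat from the page's mapping
rather than from file->f_path.dentry. The dependency in sketch form:

	/* ecryptfs_get_locked_page(inode, idx)
	 *   -> read_mapping_page(inode->i_mapping, idx, NULL)
	 *      -> a_ops->readpage(filp = NULL, page)
	 *         == ecryptfs_readpage(NULL, page)
	 *            old: file->f_path.dentry->d_inode   -> NULL dereference
	 *            new: page->mapping->host            -> always valid
	 */
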
@@ -198,7 +190,7 @@ out:
198static int ecryptfs_readpage(struct file *file, struct page *page) 190static int ecryptfs_readpage(struct file *file, struct page *page)
199{ 191{
200 struct ecryptfs_crypt_stat *crypt_stat = 192 struct ecryptfs_crypt_stat *crypt_stat =
201 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 193 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
202 int rc = 0; 194 int rc = 0;
203 195
204 if (!crypt_stat 196 if (!crypt_stat
@@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
300 292
301 if (!PageUptodate(page)) { 293 if (!PageUptodate(page)) {
302 struct ecryptfs_crypt_stat *crypt_stat = 294 struct ecryptfs_crypt_stat *crypt_stat =
303 &ecryptfs_inode_to_private( 295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
304 file->f_path.dentry->d_inode)->crypt_stat;
305 296
306 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 297 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
307 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { 298 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
487 unsigned to = from + copied; 478 unsigned to = from + copied;
488 struct inode *ecryptfs_inode = mapping->host; 479 struct inode *ecryptfs_inode = mapping->host;
489 struct ecryptfs_crypt_stat *crypt_stat = 480 struct ecryptfs_crypt_stat *crypt_stat =
490 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 481 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
491 int rc; 482 int rc;
492 483
493 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { 484 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
93 93
94/** 94/**
95 * ecryptfs_write 95 * ecryptfs_write
96 * @ecryptfs_file: The eCryptfs file into which to write 96 * @ecryptfs_inode: The eCryptfs file into which to write
97 * @data: Virtual address where data to write is located 97 * @data: Virtual address where data to write is located
98 * @offset: Offset in the eCryptfs file at which to begin writing the 98 * @offset: Offset in the eCryptfs file at which to begin writing the
99 * data from @data 99 * data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
109 * 109 *
110 * Returns zero on success; non-zero otherwise 110 * Returns zero on success; non-zero otherwise
111 */ 111 */
112int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 112int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
113 size_t size) 113 size_t size)
114{ 114{
115 struct page *ecryptfs_page; 115 struct page *ecryptfs_page;
116 struct ecryptfs_crypt_stat *crypt_stat; 116 struct ecryptfs_crypt_stat *crypt_stat;
117 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
118 char *ecryptfs_page_virt; 117 char *ecryptfs_page_virt;
119 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); 118 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
120 loff_t data_offset = 0; 119 loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
145 if (num_bytes > total_remaining_zeros) 144 if (num_bytes > total_remaining_zeros)
146 num_bytes = total_remaining_zeros; 145 num_bytes = total_remaining_zeros;
147 } 146 }
148 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 147 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
149 ecryptfs_page_idx); 148 ecryptfs_page_idx);
150 if (IS_ERR(ecryptfs_page)) { 149 if (IS_ERR(ecryptfs_page)) {
151 rc = PTR_ERR(ecryptfs_page); 150 rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
302int ecryptfs_read(char *data, loff_t offset, size_t size, 301int ecryptfs_read(char *data, loff_t offset, size_t size,
303 struct file *ecryptfs_file) 302 struct file *ecryptfs_file)
304{ 303{
304 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
305 struct page *ecryptfs_page; 305 struct page *ecryptfs_page;
306 char *ecryptfs_page_virt; 306 char *ecryptfs_page_virt;
307 loff_t ecryptfs_file_size = 307 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
308 i_size_read(ecryptfs_file->f_dentry->d_inode);
309 loff_t data_offset = 0; 308 loff_t data_offset = 0;
310 loff_t pos; 309 loff_t pos;
311 int rc = 0; 310 int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
327 326
328 if (num_bytes > total_remaining_bytes) 327 if (num_bytes > total_remaining_bytes)
329 num_bytes = total_remaining_bytes; 328 num_bytes = total_remaining_bytes;
330 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 329 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
331 ecryptfs_page_idx); 330 ecryptfs_page_idx);
332 if (IS_ERR(ecryptfs_page)) { 331 if (IS_ERR(ecryptfs_page)) {
333 rc = PTR_ERR(ecryptfs_page); 332 rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0c0ae491d231..f7fc286a3aa9 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
109} 109}
110 110
111/** 111/**
112 * ecryptfs_put_super
113 * @sb: Pointer to the ecryptfs super block
114 *
115 * Final actions when unmounting a file system.
116 * This will handle deallocation and release of our private data.
117 */
118static void ecryptfs_put_super(struct super_block *sb)
119{
120 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
121
122 lock_kernel();
123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 bdi_destroy(&sb_info->bdi);
126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
127 ecryptfs_set_superblock_private(sb, NULL);
128
129 unlock_kernel();
130}
131
132/**
133 * ecryptfs_statfs 112 * ecryptfs_statfs
134 * @sb: The ecryptfs super block 113 * @sb: The ecryptfs super block
135 * @buf: The struct kstatfs to fill in with stats 114 * @buf: The struct kstatfs to fill in with stats
@@ -139,11 +118,15 @@ static void ecryptfs_put_super(struct super_block *sb)
139 */ 118 */
140static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) 119static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
141{ 120{
142 return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf); 121 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
122
123 if (!lower_dentry->d_sb->s_op->statfs)
124 return -ENOSYS;
125 return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
143} 126}
144 127
145/** 128/**
146 * ecryptfs_clear_inode 129 * ecryptfs_evict_inode
147 * @inode - The ecryptfs inode 130 * @inode - The ecryptfs inode
148 * 131 *
149 * Called by iput() when the inode reference count reaches zero 132
@@ -152,8 +135,10 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
152 * on the inode free list. We use this to drop our reference to the 135 * on the inode free list. We use this to drop our reference to the
153 * lower inode. 136 * lower inode.
154 */ 137 */
155static void ecryptfs_clear_inode(struct inode *inode) 138static void ecryptfs_evict_inode(struct inode *inode)
156{ 139{
140 truncate_inode_pages(&inode->i_data, 0);
141 end_writeback(inode);
157 iput(ecryptfs_inode_to_lower(inode)); 142 iput(ecryptfs_inode_to_lower(inode));
158} 143}
159 144
@@ -203,9 +188,8 @@ const struct super_operations ecryptfs_sops = {
203 .alloc_inode = ecryptfs_alloc_inode, 188 .alloc_inode = ecryptfs_alloc_inode,
204 .destroy_inode = ecryptfs_destroy_inode, 189 .destroy_inode = ecryptfs_destroy_inode,
205 .drop_inode = generic_delete_inode, 190 .drop_inode = generic_delete_inode,
206 .put_super = ecryptfs_put_super,
207 .statfs = ecryptfs_statfs, 191 .statfs = ecryptfs_statfs,
208 .remount_fs = NULL, 192 .remount_fs = NULL,
209 .clear_inode = ecryptfs_clear_inode, 193 .evict_inode = ecryptfs_evict_inode,
210 .show_options = ecryptfs_show_options 194 .show_options = ecryptfs_show_options
211}; 195};
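
The super.c hunks track the 2.6.36 switch from ->clear_inode() to ->evict_inode(): the VFS no longer truncates the page cache or ends writeback on the implementation's behalf, so every ->evict_inode() must do both before releasing private state. A sketch of the contract for a stacking filesystem, with stackfs_evict_inode and lower_inode_of() as hypothetical names standing in for the filesystem's own functions:

        #include <linux/fs.h>
        #include <linux/mm.h>

        static void stackfs_evict_inode(struct inode *inode)
        {
                truncate_inode_pages(&inode->i_data, 0); /* drop all cached pages */
                end_writeback(inode);                    /* mark the inode clean and freeable */
                iput(lower_inode_of(inode));             /* hypothetical accessor: drop our
                                                          * reference to the lower inode */
        }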
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
1140 * ep_poll_callback() when events will become available. 1140 * ep_poll_callback() when events will become available.
1141 */ 1141 */
1142 init_waitqueue_entry(&wait, current); 1142 init_waitqueue_entry(&wait, current);
1143 wait.flags |= WQ_FLAG_EXCLUSIVE; 1143 __add_wait_queue_exclusive(&ep->wq, &wait);
1144 __add_wait_queue(&ep->wq, &wait);
1145 1144
1146 for (;;) { 1145 for (;;) {
1147 /* 1146 /*
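
The eventpoll hunk swaps the open-coded flag-then-enqueue pair for __add_wait_queue_exclusive(). From my reading of <linux/wait.h> in this release, the helper is just those two steps folded together (shown here as an approximation; the caller is still expected to hold the waitqueue lock):

        static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
                                                      wait_queue_t *wait)
        {
                wait->flags |= WQ_FLAG_EXCLUSIVE; /* wake only one exclusive waiter */
                __add_wait_queue(q, wait);
        }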
diff --git a/fs/exec.c b/fs/exec.c
index 029308754eea..56536ad0e7cc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -28,7 +28,6 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/stat.h> 29#include <linux/stat.h>
30#include <linux/fcntl.h> 30#include <linux/fcntl.h>
31#include <linux/smp_lock.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
33#include <linux/string.h> 32#include <linux/string.h>
34#include <linux/init.h> 33#include <linux/init.h>
@@ -131,7 +130,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
131 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) 130 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
132 goto exit; 131 goto exit;
133 132
134 fsnotify_open(file->f_path.dentry); 133 fsnotify_open(file);
135 134
136 error = -ENOEXEC; 135 error = -ENOEXEC;
137 if(file->f_op) { 136 if(file->f_op) {
@@ -244,9 +243,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
244 * use STACK_TOP because that can depend on attributes which aren't 243 * use STACK_TOP because that can depend on attributes which aren't
245 * configured yet. 244 * configured yet.
246 */ 245 */
246 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
247 vma->vm_end = STACK_TOP_MAX; 247 vma->vm_end = STACK_TOP_MAX;
248 vma->vm_start = vma->vm_end - PAGE_SIZE; 248 vma->vm_start = vma->vm_end - PAGE_SIZE;
249 vma->vm_flags = VM_STACK_FLAGS; 249 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
250 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 250 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
251 INIT_LIST_HEAD(&vma->anon_vma_chain); 251 INIT_LIST_HEAD(&vma->anon_vma_chain);
252 err = insert_vm_struct(mm, vma); 252 err = insert_vm_struct(mm, vma);
@@ -363,13 +363,13 @@ err:
363/* 363/*
364 * count() counts the number of strings in array ARGV. 364 * count() counts the number of strings in array ARGV.
365 */ 365 */
366static int count(char __user * __user * argv, int max) 366static int count(const char __user * const __user * argv, int max)
367{ 367{
368 int i = 0; 368 int i = 0;
369 369
370 if (argv != NULL) { 370 if (argv != NULL) {
371 for (;;) { 371 for (;;) {
372 char __user * p; 372 const char __user * p;
373 373
374 if (get_user(p, argv)) 374 if (get_user(p, argv))
375 return -EFAULT; 375 return -EFAULT;
@@ -378,6 +378,9 @@ static int count(char __user * __user * argv, int max)
378 argv++; 378 argv++;
379 if (i++ >= max) 379 if (i++ >= max)
380 return -E2BIG; 380 return -E2BIG;
381
382 if (fatal_signal_pending(current))
383 return -ERESTARTNOHAND;
381 cond_resched(); 384 cond_resched();
382 } 385 }
383 } 386 }
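
This hunk and the matching one in copy_strings() below add the same guard: argv/envp lengths are controlled by userspace, so the walk must abort on a fatal signal and yield the CPU between iterations. The pattern in isolation (a generic illustration with a hypothetical function name, not kernel code):

        #include <linux/sched.h>
        #include <linux/errno.h>

        static int walk_user_controlled_list(unsigned long n)
        {
                while (n--) {
                        if (fatal_signal_pending(current))
                                return -ERESTARTNOHAND; /* dying: stop immediately */
                        cond_resched();                 /* be polite on long walks */
                        /* ... process one user-supplied element ... */
                }
                return 0;
        }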
@@ -389,7 +392,7 @@ static int count(char __user * __user * argv, int max)
389 * process's memory to the new process's stack. The call to get_user_pages() 392
390 * ensures the destination page is created and not swapped out. 393 * ensures the destination page is created and not swapped out.
391 */ 394 */
392static int copy_strings(int argc, char __user * __user * argv, 395static int copy_strings(int argc, const char __user *const __user *argv,
393 struct linux_binprm *bprm) 396 struct linux_binprm *bprm)
394{ 397{
395 struct page *kmapped_page = NULL; 398 struct page *kmapped_page = NULL;
@@ -398,7 +401,7 @@ static int copy_strings(int argc, char __user * __user * argv,
398 int ret; 401 int ret;
399 402
400 while (argc-- > 0) { 403 while (argc-- > 0) {
401 char __user *str; 404 const char __user *str;
402 int len; 405 int len;
403 unsigned long pos; 406 unsigned long pos;
404 407
@@ -421,6 +424,12 @@ static int copy_strings(int argc, char __user * __user * argv,
421 while (len > 0) { 424 while (len > 0) {
422 int offset, bytes_to_copy; 425 int offset, bytes_to_copy;
423 426
427 if (fatal_signal_pending(current)) {
428 ret = -ERESTARTNOHAND;
429 goto out;
430 }
431 cond_resched();
432
424 offset = pos % PAGE_SIZE; 433 offset = pos % PAGE_SIZE;
425 if (offset == 0) 434 if (offset == 0)
426 offset = PAGE_SIZE; 435 offset = PAGE_SIZE;
@@ -472,12 +481,13 @@ out:
472/* 481/*
473 * Like copy_strings, but get argv and its values from kernel memory. 482 * Like copy_strings, but get argv and its values from kernel memory.
474 */ 483 */
475int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) 484int copy_strings_kernel(int argc, const char *const *argv,
485 struct linux_binprm *bprm)
476{ 486{
477 int r; 487 int r;
478 mm_segment_t oldfs = get_fs(); 488 mm_segment_t oldfs = get_fs();
479 set_fs(KERNEL_DS); 489 set_fs(KERNEL_DS);
480 r = copy_strings(argc, (char __user * __user *)argv, bprm); 490 r = copy_strings(argc, (const char __user *const __user *)argv, bprm);
481 set_fs(oldfs); 491 set_fs(oldfs);
482 return r; 492 return r;
483} 493}
@@ -595,6 +605,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
595#else 605#else
596 stack_top = arch_align_stack(stack_top); 606 stack_top = arch_align_stack(stack_top);
597 stack_top = PAGE_ALIGN(stack_top); 607 stack_top = PAGE_ALIGN(stack_top);
608
609 if (unlikely(stack_top < mmap_min_addr) ||
610 unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
611 return -ENOMEM;
612
598 stack_shift = vma->vm_end - stack_top; 613 stack_shift = vma->vm_end - stack_top;
599 614
600 bprm->p -= stack_shift; 615 bprm->p -= stack_shift;
@@ -618,6 +633,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
618 else if (executable_stack == EXSTACK_DISABLE_X) 633 else if (executable_stack == EXSTACK_DISABLE_X)
619 vm_flags &= ~VM_EXEC; 634 vm_flags &= ~VM_EXEC;
620 vm_flags |= mm->def_flags; 635 vm_flags |= mm->def_flags;
636 vm_flags |= VM_STACK_INCOMPLETE_SETUP;
621 637
622 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, 638 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
623 vm_flags); 639 vm_flags);
@@ -632,6 +648,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
632 goto out_unlock; 648 goto out_unlock;
633 } 649 }
634 650
651 /* mprotect_fixup is overkill to remove the temporary stack flags */
652 vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
653
635 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ 654 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
636 stack_size = vma->vm_end - vma->vm_start; 655 stack_size = vma->vm_end - vma->vm_start;
637 /* 656 /*
@@ -650,6 +669,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
650 else 669 else
651 stack_base = vma->vm_start - stack_expand; 670 stack_base = vma->vm_start - stack_expand;
652#endif 671#endif
672 current->mm->start_stack = bprm->p;
653 ret = expand_stack(vma, stack_base); 673 ret = expand_stack(vma, stack_base);
654 if (ret) 674 if (ret)
655 ret = -EFAULT; 675 ret = -EFAULT;
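
Read together, the __bprm_mm_init() and setup_arg_pages() hunks give the temporary VM_STACK_INCOMPLETE_SETUP flag a complete lifecycle; my condensed reading of the flow (a summary, not literal kernel code):

        /*
         * __bprm_mm_init():  vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
         *                      the stack VMA exists, but page faults must treat
         *                      it as not fully set up yet
         * setup_arg_pages(): vm_flags |= VM_STACK_INCOMPLETE_SETUP;
         *                      re-asserted across the mprotect_fixup() call
         *                    vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
         *                      setup complete; the stack VMA is now ordinary
         */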
@@ -680,7 +700,7 @@ struct file *open_exec(const char *name)
680 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) 700 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
681 goto exit; 701 goto exit;
682 702
683 fsnotify_open(file->f_path.dentry); 703 fsnotify_open(file);
684 704
685 err = deny_write_access(file); 705 err = deny_write_access(file);
686 if (err) 706 if (err)
@@ -765,7 +785,6 @@ static int de_thread(struct task_struct *tsk)
765 struct signal_struct *sig = tsk->signal; 785 struct signal_struct *sig = tsk->signal;
766 struct sighand_struct *oldsighand = tsk->sighand; 786 struct sighand_struct *oldsighand = tsk->sighand;
767 spinlock_t *lock = &oldsighand->siglock; 787 spinlock_t *lock = &oldsighand->siglock;
768 int count;
769 788
770 if (thread_group_empty(tsk)) 789 if (thread_group_empty(tsk))
771 goto no_thread_group; 790 goto no_thread_group;
@@ -782,13 +801,13 @@ static int de_thread(struct task_struct *tsk)
782 spin_unlock_irq(lock); 801 spin_unlock_irq(lock);
783 return -EAGAIN; 802 return -EAGAIN;
784 } 803 }
804
785 sig->group_exit_task = tsk; 805 sig->group_exit_task = tsk;
786 zap_other_threads(tsk); 806 sig->notify_count = zap_other_threads(tsk);
807 if (!thread_group_leader(tsk))
808 sig->notify_count--;
787 809
788 /* Account for the thread group leader hanging around: */ 810 while (sig->notify_count) {
789 count = thread_group_leader(tsk) ? 1 : 2;
790 sig->notify_count = count;
791 while (atomic_read(&sig->count) > count) {
792 __set_current_state(TASK_UNINTERRUPTIBLE); 811 __set_current_state(TASK_UNINTERRUPTIBLE);
793 spin_unlock_irq(lock); 812 spin_unlock_irq(lock);
794 schedule(); 813 schedule();
@@ -995,7 +1014,7 @@ EXPORT_SYMBOL(flush_old_exec);
995void setup_new_exec(struct linux_binprm * bprm) 1014void setup_new_exec(struct linux_binprm * bprm)
996{ 1015{
997 int i, ch; 1016 int i, ch;
998 char * name; 1017 const char *name;
999 char tcomm[sizeof(current->comm)]; 1018 char tcomm[sizeof(current->comm)];
1000 1019
1001 arch_pick_mmap_layout(current->mm); 1020 arch_pick_mmap_layout(current->mm);
@@ -1115,7 +1134,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1115 bprm->unsafe = tracehook_unsafe_exec(p); 1134 bprm->unsafe = tracehook_unsafe_exec(p);
1116 1135
1117 n_fs = 1; 1136 n_fs = 1;
1118 write_lock(&p->fs->lock); 1137 spin_lock(&p->fs->lock);
1119 rcu_read_lock(); 1138 rcu_read_lock();
1120 for (t = next_thread(p); t != p; t = next_thread(t)) { 1139 for (t = next_thread(p); t != p; t = next_thread(t)) {
1121 if (t->fs == p->fs) 1140 if (t->fs == p->fs)
@@ -1132,7 +1151,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1132 res = 1; 1151 res = 1;
1133 } 1152 }
1134 } 1153 }
1135 write_unlock(&p->fs->lock); 1154 spin_unlock(&p->fs->lock);
1136 1155
1137 return res; 1156 return res;
1138} 1157}
@@ -1314,9 +1333,9 @@ EXPORT_SYMBOL(search_binary_handler);
1314/* 1333/*
1315 * sys_execve() executes a new program. 1334 * sys_execve() executes a new program.
1316 */ 1335 */
1317int do_execve(char * filename, 1336int do_execve(const char * filename,
1318 char __user *__user *argv, 1337 const char __user *const __user *argv,
1319 char __user *__user *envp, 1338 const char __user *const __user *envp,
1320 struct pt_regs * regs) 1339 struct pt_regs * regs)
1321{ 1340{
1322 struct linux_binprm *bprm; 1341 struct linux_binprm *bprm;
@@ -1660,12 +1679,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1660 struct task_struct *tsk = current; 1679 struct task_struct *tsk = current;
1661 struct mm_struct *mm = tsk->mm; 1680 struct mm_struct *mm = tsk->mm;
1662 struct completion *vfork_done; 1681 struct completion *vfork_done;
1663 int core_waiters; 1682 int core_waiters = -EBUSY;
1664 1683
1665 init_completion(&core_state->startup); 1684 init_completion(&core_state->startup);
1666 core_state->dumper.task = tsk; 1685 core_state->dumper.task = tsk;
1667 core_state->dumper.next = NULL; 1686 core_state->dumper.next = NULL;
1668 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1687
1688 down_write(&mm->mmap_sem);
1689 if (!mm->core_state)
1690 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1669 up_write(&mm->mmap_sem); 1691 up_write(&mm->mmap_sem);
1670 1692
1671 if (unlikely(core_waiters < 0)) 1693 if (unlikely(core_waiters < 0))
@@ -1785,21 +1807,61 @@ static void wait_for_dump_helpers(struct file *file)
1785} 1807}
1786 1808
1787 1809
1810/*
1811 * umh_pipe_setup
1812 * helper function to customize the process used
1813 * to collect the core in userspace. Specifically
1814 * it sets up a pipe and installs it as fd 0 (stdin)
1815 * for the process. Returns 0 on success, or
1816 * PTR_ERR on failure.
1817 * Note that it also sets the core limit to 1. This
1818 * is a special value that we use to trap recursive
1819 * core dumps
1820 */
1821static int umh_pipe_setup(struct subprocess_info *info)
1822{
1823 struct file *rp, *wp;
1824 struct fdtable *fdt;
1825 struct coredump_params *cp = (struct coredump_params *)info->data;
1826 struct files_struct *cf = current->files;
1827
1828 wp = create_write_pipe(0);
1829 if (IS_ERR(wp))
1830 return PTR_ERR(wp);
1831
1832 rp = create_read_pipe(wp, 0);
1833 if (IS_ERR(rp)) {
1834 free_write_pipe(wp);
1835 return PTR_ERR(rp);
1836 }
1837
1838 cp->file = wp;
1839
1840 sys_close(0);
1841 fd_install(0, rp);
1842 spin_lock(&cf->file_lock);
1843 fdt = files_fdtable(cf);
1844 FD_SET(0, fdt->open_fds);
1845 FD_CLR(0, fdt->close_on_exec);
1846 spin_unlock(&cf->file_lock);
1847
1848 /* and disallow core files too */
1849 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1850
1851 return 0;
1852}
1853
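
umh_pipe_setup() exists because call_usermodehelper_fns() lets the caller run an init callback inside the helper process before it execs, with ->data carrying whatever pointer the caller passed. A minimal caller-side sketch (my_setup, my_ctx, and launch_helper are hypothetical names):

        #include <linux/kmod.h>
        #include <linux/errno.h>

        struct my_ctx { int token; };

        static int my_setup(struct subprocess_info *info)
        {
                struct my_ctx *ctx = info->data; /* the ctx pointer passed below */

                /* runs in the helper process, before exec: fds, rlimits, ... */
                return ctx->token ? 0 : -EINVAL;
        }

        static int launch_helper(char **argv, struct my_ctx *ctx)
        {
                return call_usermodehelper_fns(argv[0], argv, NULL, UMH_WAIT_EXEC,
                                               my_setup, NULL /* no cleanup */, ctx);
        }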
1788void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1854void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1789{ 1855{
1790 struct core_state core_state; 1856 struct core_state core_state;
1791 char corename[CORENAME_MAX_SIZE + 1]; 1857 char corename[CORENAME_MAX_SIZE + 1];
1792 struct mm_struct *mm = current->mm; 1858 struct mm_struct *mm = current->mm;
1793 struct linux_binfmt * binfmt; 1859 struct linux_binfmt * binfmt;
1794 struct inode * inode;
1795 const struct cred *old_cred; 1860 const struct cred *old_cred;
1796 struct cred *cred; 1861 struct cred *cred;
1797 int retval = 0; 1862 int retval = 0;
1798 int flag = 0; 1863 int flag = 0;
1799 int ispipe = 0; 1864 int ispipe;
1800 char **helper_argv = NULL;
1801 int helper_argc = 0;
1802 int dump_count = 0;
1803 static atomic_t core_dump_count = ATOMIC_INIT(0); 1865 static atomic_t core_dump_count = ATOMIC_INIT(0);
1804 struct coredump_params cprm = { 1866 struct coredump_params cprm = {
1805 .signr = signr, 1867 .signr = signr,
@@ -1818,23 +1880,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1818 binfmt = mm->binfmt; 1880 binfmt = mm->binfmt;
1819 if (!binfmt || !binfmt->core_dump) 1881 if (!binfmt || !binfmt->core_dump)
1820 goto fail; 1882 goto fail;
1821 1883 if (!__get_dumpable(cprm.mm_flags))
1822 cred = prepare_creds();
1823 if (!cred) {
1824 retval = -ENOMEM;
1825 goto fail; 1884 goto fail;
1826 }
1827 1885
1828 down_write(&mm->mmap_sem); 1886 cred = prepare_creds();
1829 /* 1887 if (!cred)
1830 * If another thread got here first, or we are not dumpable, bail out.
1831 */
1832 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1833 up_write(&mm->mmap_sem);
1834 put_cred(cred);
1835 goto fail; 1888 goto fail;
1836 }
1837
1838 /* 1889 /*
1839 * We cannot trust fsuid as being the "true" uid of the 1890 * We cannot trust fsuid as being the "true" uid of the
1840 * process nor do we know its entire history. We only know it 1891 * process nor do we know its entire history. We only know it
@@ -1847,10 +1898,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1847 } 1898 }
1848 1899
1849 retval = coredump_wait(exit_code, &core_state); 1900 retval = coredump_wait(exit_code, &core_state);
1850 if (retval < 0) { 1901 if (retval < 0)
1851 put_cred(cred); 1902 goto fail_creds;
1852 goto fail;
1853 }
1854 1903
1855 old_cred = override_creds(cred); 1904 old_cred = override_creds(cred);
1856 1905
@@ -1860,27 +1909,21 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1860 */ 1909 */
1861 clear_thread_flag(TIF_SIGPENDING); 1910 clear_thread_flag(TIF_SIGPENDING);
1862 1911
1863 /*
1864 * lock_kernel() because format_corename() is controlled by sysctl, which
1865 * uses lock_kernel()
1866 */
1867 lock_kernel();
1868 ispipe = format_corename(corename, signr); 1912 ispipe = format_corename(corename, signr);
1869 unlock_kernel();
1870
1871 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1872 goto fail_unlock;
1873 1913
1874 if (ispipe) { 1914 if (ispipe) {
1875 if (cprm.limit == 0) { 1915 int dump_count;
1916 char **helper_argv;
1917
1918 if (cprm.limit == 1) {
1876 /* 1919 /*
1877 * Normally core limits are irrelevant to pipes, since 1920 * Normally core limits are irrelevant to pipes, since
1878 * we're not writing to the file system, but we use 1921 * we're not writing to the file system, but we use
1879 * cprm.limit of 0 here as a speacial value. Any 1922 * cprm.limit of 1 here as a speacial value. Any
1880 * non-zero limit gets set to RLIM_INFINITY below, but 1923 * non-1 limit gets set to RLIM_INFINITY below, but
1881 * a limit of 0 skips the dump. This is a consistent 1924 * a limit of 0 skips the dump. This is a consistent
1882 * way to catch recursive crashes. We can still crash 1925 * way to catch recursive crashes. We can still crash
1883 * if the core_pattern binary sets RLIM_CORE = !0 1926 * if the core_pattern binary sets RLIM_CORE = !1
1884 * but it runs as root, and can do lots of stupid things 1927 * but it runs as root, and can do lots of stupid things
1885 * Note that we use task_tgid_vnr here to grab the pid 1928 * Note that we use task_tgid_vnr here to grab the pid
1886 * of the process group leader. That way we get the 1929 * of the process group leader. That way we get the
@@ -1888,11 +1931,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1888 * core_pattern process dies. 1931 * core_pattern process dies.
1889 */ 1932 */
1890 printk(KERN_WARNING 1933 printk(KERN_WARNING
1891 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1934 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1892 task_tgid_vnr(current), current->comm); 1935 task_tgid_vnr(current), current->comm);
1893 printk(KERN_WARNING "Aborting core\n"); 1936 printk(KERN_WARNING "Aborting core\n");
1894 goto fail_unlock; 1937 goto fail_unlock;
1895 } 1938 }
1939 cprm.limit = RLIM_INFINITY;
1896 1940
1897 dump_count = atomic_inc_return(&core_dump_count); 1941 dump_count = atomic_inc_return(&core_dump_count);
1898 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1942 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1902,71 +1946,114 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1902 goto fail_dropcount; 1946 goto fail_dropcount;
1903 } 1947 }
1904 1948
1905 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1949 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1906 if (!helper_argv) { 1950 if (!helper_argv) {
1907 printk(KERN_WARNING "%s failed to allocate memory\n", 1951 printk(KERN_WARNING "%s failed to allocate memory\n",
1908 __func__); 1952 __func__);
1909 goto fail_dropcount; 1953 goto fail_dropcount;
1910 } 1954 }
1911 1955
1912 cprm.limit = RLIM_INFINITY; 1956 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1913 1957 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1914 /* SIGPIPE can happen, but it's just never processed */ 1958 NULL, &cprm);
1915 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1959 argv_free(helper_argv);
1916 &cprm.file)) { 1960 if (retval) {
1917 printk(KERN_INFO "Core dump to %s pipe failed\n", 1961 printk(KERN_INFO "Core dump to %s pipe failed\n",
1918 corename); 1962 corename);
1919 goto fail_dropcount; 1963 goto close_fail;
1920 } 1964 }
1921 } else 1965 } else {
1966 struct inode *inode;
1967
1968 if (cprm.limit < binfmt->min_coredump)
1969 goto fail_unlock;
1970
1922 cprm.file = filp_open(corename, 1971 cprm.file = filp_open(corename,
1923 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1972 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1924 0600); 1973 0600);
1925 if (IS_ERR(cprm.file)) 1974 if (IS_ERR(cprm.file))
1926 goto fail_dropcount; 1975 goto fail_unlock;
1927 inode = cprm.file->f_path.dentry->d_inode;
1928 if (inode->i_nlink > 1)
1929 goto close_fail; /* multiple links - don't dump */
1930 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1931 goto close_fail;
1932
1933 /* AK: actually i see no reason to not allow this for named pipes etc.,
1934 but keep the previous behaviour for now. */
1935 if (!ispipe && !S_ISREG(inode->i_mode))
1936 goto close_fail;
1937 /*
1938 * Don't allow local users to get cute and trick others to coredump
1939 * into their pre-created files:
1940 * Note, this is not relevant for pipes
1941 */
1942 if (!ispipe && (inode->i_uid != current_fsuid()))
1943 goto close_fail;
1944 if (!cprm.file->f_op)
1945 goto close_fail;
1946 if (!cprm.file->f_op->write)
1947 goto close_fail;
1948 if (!ispipe &&
1949 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1950 goto close_fail;
1951 1976
1952 retval = binfmt->core_dump(&cprm); 1977 inode = cprm.file->f_path.dentry->d_inode;
1978 if (inode->i_nlink > 1)
1979 goto close_fail;
1980 if (d_unhashed(cprm.file->f_path.dentry))
1981 goto close_fail;
1982 /*
1983 * AK: actually i see no reason to not allow this for named
1984 * pipes etc, but keep the previous behaviour for now.
1985 */
1986 if (!S_ISREG(inode->i_mode))
1987 goto close_fail;
1988 /*
1989 * Don't allow local users to get cute and trick others to coredump
1990 * into their pre-created files.
1991 */
1992 if (inode->i_uid != current_fsuid())
1993 goto close_fail;
1994 if (!cprm.file->f_op || !cprm.file->f_op->write)
1995 goto close_fail;
1996 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1997 goto close_fail;
1998 }
1953 1999
2000 retval = binfmt->core_dump(&cprm);
1954 if (retval) 2001 if (retval)
1955 current->signal->group_exit_code |= 0x80; 2002 current->signal->group_exit_code |= 0x80;
1956close_fail: 2003
1957 if (ispipe && core_pipe_limit) 2004 if (ispipe && core_pipe_limit)
1958 wait_for_dump_helpers(cprm.file); 2005 wait_for_dump_helpers(cprm.file);
1959 filp_close(cprm.file, NULL); 2006close_fail:
2007 if (cprm.file)
2008 filp_close(cprm.file, NULL);
1960fail_dropcount: 2009fail_dropcount:
1961 if (dump_count) 2010 if (ispipe)
1962 atomic_dec(&core_dump_count); 2011 atomic_dec(&core_dump_count);
1963fail_unlock: 2012fail_unlock:
1964 if (helper_argv) 2013 coredump_finish(mm);
1965 argv_free(helper_argv);
1966
1967 revert_creds(old_cred); 2014 revert_creds(old_cred);
2015fail_creds:
1968 put_cred(cred); 2016 put_cred(cred);
1969 coredump_finish(mm);
1970fail: 2017fail:
1971 return; 2018 return;
1972} 2019}
2020
2021/*
2022 * Core dumping helper functions. These are the only things you should
2023 * do on a core-file: use only these functions to write out all the
2024 * necessary info.
2025 */
2026int dump_write(struct file *file, const void *addr, int nr)
2027{
2028 return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2029}
2030EXPORT_SYMBOL(dump_write);
2031
2032int dump_seek(struct file *file, loff_t off)
2033{
2034 int ret = 1;
2035
2036 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2037 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2038 return 0;
2039 } else {
2040 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2041
2042 if (!buf)
2043 return 0;
2044 while (off > 0) {
2045 unsigned long n = off;
2046
2047 if (n > PAGE_SIZE)
2048 n = PAGE_SIZE;
2049 if (!dump_write(file, buf, n)) {
2050 ret = 0;
2051 break;
2052 }
2053 off -= n;
2054 }
2055 free_page((unsigned long)buf);
2056 }
2057 return ret;
2058}
2059EXPORT_SYMBOL(dump_seek);
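
dump_write() and dump_seek() above become the only sanctioned way for a binfmt ->core_dump() handler to emit core data; both return non-zero on success. A hypothetical consumer (foo_core_dump and the "FOOCORE" header are illustrative, not a real handler):

        #include <linux/fs.h>
        #include <linux/mm.h>
        #include <linux/binfmts.h>
        #include <linux/coredump.h>

        static int foo_core_dump(struct coredump_params *cprm)
        {
                static const char hdr[8] = "FOOCORE";

                if (!dump_write(cprm->file, hdr, sizeof(hdr)))
                        return 0;       /* short write: report failure */
                if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(hdr)))
                        return 0;       /* pad out to a page boundary */
                return 1;               /* non-zero means the dump succeeded */
        }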
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
608 de->inode_no = cpu_to_le64(parent->i_ino); 608 de->inode_no = cpu_to_le64(parent->i_ino);
609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); 609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
610 exofs_set_de_type(de, inode); 610 exofs_set_de_type(de, inode);
611 kunmap_atomic(page, KM_USER0); 611 kunmap_atomic(kaddr, KM_USER0);
612 err = exofs_commit_chunk(page, 0, chunk_size); 612 err = exofs_commit_chunk(page, 0, chunk_size);
613fail: 613fail:
614 page_cache_release(page); 614 page_cache_release(page);
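
The one-character fix above is worth spelling out: kunmap_atomic() must be handed the kernel virtual address returned by kmap_atomic(), never the struct page that was mapped. The pairing rule in isolation (a generic illustration with a hypothetical function name):

        #include <linux/highmem.h>

        static void zero_first_byte(struct page *page)
        {
                char *kaddr = kmap_atomic(page, KM_USER0);

                kaddr[0] = 0;                   /* use the temporary mapping */
                kunmap_atomic(kaddr, KM_USER0); /* pass the address, not the page */
        }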
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 22721b2fd890..2dc925fa1010 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -256,7 +256,6 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
256} 256}
257 257
258/* inode.c */ 258/* inode.c */
259void exofs_truncate(struct inode *inode);
260int exofs_setattr(struct dentry *, struct iattr *); 259int exofs_setattr(struct dentry *, struct iattr *);
261int exofs_write_begin(struct file *file, struct address_space *mapping, 260int exofs_write_begin(struct file *file, struct address_space *mapping,
262 loff_t pos, unsigned len, unsigned flags, 261 loff_t pos, unsigned len, unsigned flags,
@@ -264,7 +263,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
264extern struct inode *exofs_iget(struct super_block *, unsigned long); 263extern struct inode *exofs_iget(struct super_block *, unsigned long);
265struct inode *exofs_new_inode(struct inode *, int); 264struct inode *exofs_new_inode(struct inode *, int);
266extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); 265extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
267extern void exofs_delete_inode(struct inode *); 266extern void exofs_evict_inode(struct inode *);
268 267
269/* dir.c: */ 268/* dir.c: */
270int exofs_add_link(struct dentry *, struct inode *); 269int exofs_add_link(struct dentry *, struct inode *);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..68cb23e3bb98 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -30,9 +30,6 @@
30 * along with exofs; if not, write to the Free Software 30 * along with exofs; if not, write to the Free Software
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33
34#include <linux/buffer_head.h>
35
36#include "exofs.h" 33#include "exofs.h"
37 34
38static int exofs_release_file(struct inode *inode, struct file *filp) 35static int exofs_release_file(struct inode *inode, struct file *filp)
@@ -40,20 +37,27 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 37 return 0;
41} 38}
42 39
43static int exofs_file_fsync(struct file *filp, struct dentry *dentry, 40/* exofs_file_fsync - flush the inode to disk
44 int datasync) 41 *
42 * Note, in exofs all metadata is written as part of inode, regardless.
43 * The writeout is synchronous
44 */
45static int exofs_file_fsync(struct file *filp, int datasync)
45{ 46{
46 int ret; 47 int ret;
47 struct address_space *mapping = filp->f_mapping; 48 struct inode *inode = filp->f_mapping->host;
48 struct inode *inode = dentry->d_inode; 49 struct writeback_control wbc = {
50 .sync_mode = WB_SYNC_ALL,
51 .nr_to_write = 0, /* metadata-only; caller takes care of data */
52 };
49 struct super_block *sb; 53 struct super_block *sb;
50 54
51 ret = filemap_write_and_wait(mapping); 55 if (!(inode->i_state & I_DIRTY))
52 if (ret) 56 return 0;
53 return ret; 57 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
58 return 0;
54 59
55 /* sync the inode attributes */ 60 ret = sync_inode(inode, &wbc);
56 ret = write_inode_now(inode, 1);
57 61
58 /* This is a good place to write the sb */ 62 /* This is a good place to write the sb */
59 /* TODO: Schedule an sb-sync on create */ 63
@@ -66,9 +70,9 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
66 70
67static int exofs_flush(struct file *file, fl_owner_t id) 71static int exofs_flush(struct file *file, fl_owner_t id)
68{ 72{
69 exofs_file_fsync(file, file->f_path.dentry, 1); 73 int ret = vfs_fsync(file, 0);
70 /* TODO: Flush the OSD target */ 74 /* TODO: Flush the OSD target */
71 return 0; 75 return ret;
72} 76}
73 77
74const struct file_operations exofs_file_operations = { 78const struct file_operations exofs_file_operations = {
@@ -87,6 +91,5 @@ const struct file_operations exofs_file_operations = {
87}; 91};
88 92
89const struct inode_operations exofs_file_inode_operations = { 93const struct inode_operations exofs_file_inode_operations = {
90 .truncate = exofs_truncate,
91 .setattr = exofs_setattr, 94 .setattr = exofs_setattr,
92}; 95};
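
Two API shifts meet in this file: the ->fsync() prototype drops its dentry argument, and data writeout is now the caller's job, so the handler only pushes inode metadata. A stripped-down sketch of the shape exofs_file_fsync() converges on (simplified; meta_only_fsync is a hypothetical name and the superblock write is omitted):

        #include <linux/fs.h>
        #include <linux/writeback.h>

        static int meta_only_fsync(struct file *filp, int datasync)
        {
                struct inode *inode = filp->f_mapping->host;
                struct writeback_control wbc = {
                        .sync_mode   = WB_SYNC_ALL,
                        .nr_to_write = 0, /* metadata only; data already written */
                };

                if (!(inode->i_state & I_DIRTY))
                        return 0;       /* nothing to push */
                if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                        return 0;       /* fdatasync: timestamps don't matter */
                return sync_inode(inode, &wbc);
        }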
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 76d2a79ef93e..3eadd97324b1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -32,9 +32,6 @@
32 */ 32 */
33 33
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/writeback.h>
36#include <linux/buffer_head.h>
37#include <scsi/scsi_device.h>
38 35
39#include "exofs.h" 36#include "exofs.h"
40 37
@@ -57,6 +54,9 @@ struct page_collect {
57 unsigned nr_pages; 54 unsigned nr_pages;
58 unsigned long length; 55 unsigned long length;
59 loff_t pg_first; /* keep 64bit also in 32-arches */ 56 loff_t pg_first; /* keep 64bit also in 32-arches */
57 bool read_4_write; /* This means two things: that the read is sync
58 * and that the pages should not be unlocked.
59 */
60}; 60};
61 61
62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -74,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
74 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
75 pcol->length = 0; 75 pcol->length = 0;
76 pcol->pg_first = -1; 76 pcol->pg_first = -1;
77 pcol->read_4_write = false;
77} 78}
78 79
79static void _pcol_reset(struct page_collect *pcol) 80static void _pcol_reset(struct page_collect *pcol)
@@ -350,7 +351,8 @@ static int readpage_strip(void *data, struct page *page)
350 if (PageError(page)) 351 if (PageError(page))
351 ClearPageError(page); 352 ClearPageError(page);
352 353
353 unlock_page(page); 354 if (!pcol->read_4_write)
355 unlock_page(page);
354 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 356 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
355 " splitting\n", inode->i_ino, page->index); 357 " splitting\n", inode->i_ino, page->index);
356 358
@@ -431,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync)
431 /* readpage_strip might call read_exec(,is_sync==false) at several 433 /* readpage_strip might call read_exec(,is_sync==false) at several
432 * places but not if we have a single page. 434 * places but not if we have a single page.
433 */ 435 */
436 pcol.read_4_write = is_sync;
434 ret = readpage_strip(&pcol, page); 437 ret = readpage_strip(&pcol, page);
435 if (ret) { 438 if (ret) {
436 EXOFS_ERR("_readpage => %d\n", ret); 439 EXOFS_ERR("_readpage => %d\n", ret);
@@ -697,6 +700,13 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
697 return write_exec(&pcol); 700 return write_exec(&pcol);
698} 701}
699 702
703/* i_mutex is held, so we can use inode->i_size directly */
704static void _write_failed(struct inode *inode, loff_t to)
705{
706 if (to > inode->i_size)
707 truncate_pagecache(inode, to, inode->i_size);
708}
709
700int exofs_write_begin(struct file *file, struct address_space *mapping, 710int exofs_write_begin(struct file *file, struct address_space *mapping,
701 loff_t pos, unsigned len, unsigned flags, 711 loff_t pos, unsigned len, unsigned flags,
702 struct page **pagep, void **fsdata) 712 struct page **pagep, void **fsdata)
@@ -710,7 +720,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
710 fsdata); 720 fsdata);
711 if (ret) { 721 if (ret) {
712 EXOFS_DBGMSG("simple_write_begin faild\n"); 722 EXOFS_DBGMSG("simple_write_begin faild\n");
713 return ret; 723 goto out;
714 } 724 }
715 725
716 page = *pagep; 726 page = *pagep;
@@ -725,6 +735,9 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
725 EXOFS_DBGMSG("__readpage_filler faild\n"); 735 EXOFS_DBGMSG("__readpage_filler faild\n");
726 } 736 }
727 } 737 }
738out:
739 if (unlikely(ret))
740 _write_failed(mapping->host, pos + len);
728 741
729 return ret; 742 return ret;
730} 743}
@@ -750,11 +763,28 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
750 int ret; 763 int ret;
751 764
752 ret = simple_write_end(file, mapping, pos, len, copied, page, fsdata); 765
766 if (unlikely(ret))
767 _write_failed(inode, pos + len);
768
769 /* TODO: once simple_write_end marks inode dirty remove */
753 if (i_size != inode->i_size) 770 if (i_size != inode->i_size)
754 mark_inode_dirty(inode); 771 mark_inode_dirty(inode);
755 return ret; 772 return ret;
756} 773}
757 774
775static int exofs_releasepage(struct page *page, gfp_t gfp)
776{
777 EXOFS_DBGMSG("page 0x%lx\n", page->index);
778 WARN_ON(1);
779 return 0;
780}
781
782static void exofs_invalidatepage(struct page *page, unsigned long offset)
783{
784 EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
785 WARN_ON(1);
786}
787
758const struct address_space_operations exofs_aops = { 788const struct address_space_operations exofs_aops = {
759 .readpage = exofs_readpage, 789 .readpage = exofs_readpage,
760 .readpages = exofs_readpages, 790 .readpages = exofs_readpages,
@@ -762,6 +792,21 @@ const struct address_space_operations exofs_aops = {
762 .writepages = exofs_writepages, 792 .writepages = exofs_writepages,
763 .write_begin = exofs_write_begin_export, 793 .write_begin = exofs_write_begin_export,
764 .write_end = exofs_write_end, 794 .write_end = exofs_write_end,
795 .releasepage = exofs_releasepage,
796 .set_page_dirty = __set_page_dirty_nobuffers,
797 .invalidatepage = exofs_invalidatepage,
798
799 /* Not implemented yet */
800 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
801 .direct_IO = NULL, /* TODO: Should be trivial to do */
802
803 /* With these, NULL has special meaning or the default is not exported */
804 .sync_page = NULL,
805 .get_xip_mem = NULL,
806 .migratepage = NULL,
807 .launder_page = NULL,
808 .is_partially_uptodate = NULL,
809 .error_remove_page = NULL,
765}; 810};
766 811
767/****************************************************************************** 812/******************************************************************************
@@ -778,87 +823,55 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
778 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); 823 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
779} 824}
780 825
781/*
782 * get_block_t - Fill in a buffer_head
783 * An OSD takes care of block allocation so we just fake an allocation by
784 * putting in the inode's sector_t in the buffer_head.
785 * TODO: What about the case of create==0 and @iblock does not exist in the
786 * object?
787 */
788static int exofs_get_block(struct inode *inode, sector_t iblock,
789 struct buffer_head *bh_result, int create)
790{
791 map_bh(bh_result, inode->i_sb, iblock);
792 return 0;
793}
794
795const struct osd_attr g_attr_logical_length = ATTR_DEF( 826const struct osd_attr g_attr_logical_length = ATTR_DEF(
796 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 827 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
797 828
798static int _do_truncate(struct inode *inode) 829static int _do_truncate(struct inode *inode, loff_t newsize)
799{ 830{
800 struct exofs_i_info *oi = exofs_i(inode); 831 struct exofs_i_info *oi = exofs_i(inode);
801 loff_t isize = i_size_read(inode);
802 int ret; 832 int ret;
803 833
804 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 834 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
805 835
806 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); 836 ret = exofs_oi_truncate(oi, (u64)newsize);
837 if (likely(!ret))
838 truncate_setsize(inode, newsize);
807 839
808 ret = exofs_oi_truncate(oi, (u64)isize); 840 EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n",
809 EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize); 841 inode->i_ino, newsize, ret);
810 return ret; 842 return ret;
811} 843}
812 844
813/* 845/*
814 * Truncate a file to the specified size - all we have to do is set the size 846 * Set inode attributes - update size attribute on OSD if needed,
815 * attribute. We make sure the object exists first. 847 * otherwise just call generic functions.
816 */
817void exofs_truncate(struct inode *inode)
818{
819 struct exofs_i_info *oi = exofs_i(inode);
820 int ret;
821
822 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
823 || S_ISLNK(inode->i_mode)))
824 return;
825 if (exofs_inode_is_fast_symlink(inode))
826 return;
827 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
828 return;
829
830 /* if we are about to truncate an object, and it hasn't been
831 * created yet, wait
832 */
833 if (unlikely(wait_obj_created(oi)))
834 goto fail;
835
836 ret = _do_truncate(inode);
837 if (ret)
838 goto fail;
839
840out:
841 mark_inode_dirty(inode);
842 return;
843fail:
844 make_bad_inode(inode);
845 goto out;
846}
847
848/*
849 * Set inode attributes - just call generic functions.
850 */ 848 */
851int exofs_setattr(struct dentry *dentry, struct iattr *iattr) 849int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
852{ 850{
853 struct inode *inode = dentry->d_inode; 851 struct inode *inode = dentry->d_inode;
854 int error; 852 int error;
855 853
854 /* if we are about to modify an object, and it hasn't been
855 * created yet, wait
856 */
857 error = wait_obj_created(exofs_i(inode));
858 if (unlikely(error))
859 return error;
860
856 error = inode_change_ok(inode, iattr); 861 error = inode_change_ok(inode, iattr);
857 if (error) 862 if (unlikely(error))
858 return error; 863 return error;
859 864
860 error = inode_setattr(inode, iattr); 865 if ((iattr->ia_valid & ATTR_SIZE) &&
861 return error; 866 iattr->ia_size != i_size_read(inode)) {
867 error = _do_truncate(inode, iattr->ia_size);
868 if (unlikely(error))
869 return error;
870 }
871
872 setattr_copy(inode, iattr);
873 mark_inode_dirty(inode);
874 return 0;
862} 875}
863 876
864static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( 877static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
@@ -1123,16 +1136,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1123 sbi = sb->s_fs_info; 1136 sbi = sb->s_fs_info;
1124 1137
1125 sb->s_dirt = 1; 1138 sb->s_dirt = 1;
1126 inode->i_uid = current->cred->fsuid; 1139 inode_init_owner(inode, dir, mode);
1127 if (dir->i_mode & S_ISGID) {
1128 inode->i_gid = dir->i_gid;
1129 if (S_ISDIR(mode))
1130 mode |= S_ISGID;
1131 } else {
1132 inode->i_gid = current->cred->fsgid;
1133 }
1134 inode->i_mode = mode;
1135
1136 inode->i_ino = sbi->s_nextid++; 1140 inode->i_ino = sbi->s_nextid++;
1137 inode->i_blkbits = EXOFS_BLKSHIFT; 1141 inode->i_blkbits = EXOFS_BLKSHIFT;
1138 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1142 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -1304,7 +1308,7 @@ static void delete_done(struct exofs_io_state *ios, void *p)
1304 * from the OSD here. We make sure the object was created before we try and 1308 * from the OSD here. We make sure the object was created before we try and
1305 * delete it. 1309 * delete it.
1306 */ 1310 */
1307void exofs_delete_inode(struct inode *inode) 1311void exofs_evict_inode(struct inode *inode)
1308{ 1312{
1309 struct exofs_i_info *oi = exofs_i(inode); 1313 struct exofs_i_info *oi = exofs_i(inode);
1310 struct super_block *sb = inode->i_sb; 1314 struct super_block *sb = inode->i_sb;
@@ -1314,30 +1318,27 @@ void exofs_delete_inode(struct inode *inode)
1314 1318
1315 truncate_inode_pages(&inode->i_data, 0); 1319 truncate_inode_pages(&inode->i_data, 0);
1316 1320
1317 if (is_bad_inode(inode)) 1321 /* TODO: should do better here */
1322 if (inode->i_nlink || is_bad_inode(inode))
1318 goto no_delete; 1323 goto no_delete;
1319 1324
1320 mark_inode_dirty(inode);
1321 exofs_update_inode(inode, inode_needs_sync(inode));
1322
1323 inode->i_size = 0; 1325 inode->i_size = 0;
1324 if (inode->i_blocks) 1326 end_writeback(inode);
1325 exofs_truncate(inode);
1326 1327
1327 clear_inode(inode); 1328 /* if we are deleting an obj that hasn't been created yet, wait */
1329 if (!obj_created(oi)) {
1330 BUG_ON(!obj_2bcreated(oi));
1331 wait_event(oi->i_wq, obj_created(oi));
1332 /* ignore the error attempt a remove anyway */
1333 }
1328 1334
1335 /* Now Remove the OSD objects */
1329 ret = exofs_get_io_state(&sbi->layout, &ios); 1336 ret = exofs_get_io_state(&sbi->layout, &ios);
1330 if (unlikely(ret)) { 1337 if (unlikely(ret)) {
1331 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1338 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
1332 return; 1339 return;
1333 } 1340 }
1334 1341
1335 /* if we are deleting an obj that hasn't been created yet, wait */
1336 if (!obj_created(oi)) {
1337 BUG_ON(!obj_2bcreated(oi));
1338 wait_event(oi->i_wq, obj_created(oi));
1339 }
1340
1341 ios->obj.id = exofs_oi_objno(oi); 1342 ios->obj.id = exofs_oi_objno(oi);
1342 ios->done = delete_done; 1343 ios->done = delete_done;
1343 ios->private = sbi; 1344 ios->private = sbi;
@@ -1353,5 +1354,5 @@ void exofs_delete_inode(struct inode *inode)
1353 return; 1354 return;
1354 1355
1355no_delete: 1356no_delete:
1356 clear_inode(inode); 1357 end_writeback(inode);
1357} 1358}
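
The inode.c conversion follows the post-2.6.36 ->setattr() recipe: validate, perform the filesystem's own on-disk truncate, shrink the in-core size and page cache with truncate_setsize(), then copy the remaining attributes. The skeleton under those assumptions (skeleton_setattr is a hypothetical name; the OSD-specific truncate is reduced to a placeholder comment):

        #include <linux/fs.h>
        #include <linux/mm.h>

        static int skeleton_setattr(struct dentry *dentry, struct iattr *iattr)
        {
                struct inode *inode = dentry->d_inode;
                int error = inode_change_ok(inode, iattr);

                if (error)
                        return error;

                if ((iattr->ia_valid & ATTR_SIZE) &&
                    iattr->ia_size != i_size_read(inode)) {
                        /* filesystem-specific on-disk truncate goes here */
                        truncate_setsize(inode, iattr->ia_size);
                }

                setattr_copy(inode, iattr);     /* mode, uid, times, ... */
                mark_inode_dirty(inode);
                return 0;
        }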
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 4337cad7777b..6550bf70e41d 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -305,8 +305,6 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
305struct _striping_info { 305struct _striping_info {
306 u64 obj_offset; 306 u64 obj_offset;
307 u64 group_length; 307 u64 group_length;
308 u64 total_group_length;
309 u64 Major;
310 unsigned dev; 308 unsigned dev;
311 unsigned unit_off; 309 unsigned unit_off;
312}; 310};
@@ -343,8 +341,6 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
343 (M * group_depth * stripe_unit); 341 (M * group_depth * stripe_unit);
344 342
345 si->group_length = T - H; 343 si->group_length = T - H;
346 si->total_group_length = T;
347 si->Major = M;
348} 344}
349 345
350static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, 346static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
@@ -392,20 +388,19 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
392} 388}
393 389
394static int _prepare_one_group(struct exofs_io_state *ios, u64 length, 390static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
395 struct _striping_info *si, unsigned first_comp) 391 struct _striping_info *si)
396{ 392{
397 unsigned stripe_unit = ios->layout->stripe_unit; 393 unsigned stripe_unit = ios->layout->stripe_unit;
398 unsigned mirrors_p1 = ios->layout->mirrors_p1; 394 unsigned mirrors_p1 = ios->layout->mirrors_p1;
399 unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 395 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
400 unsigned dev = si->dev; 396 unsigned dev = si->dev;
401 unsigned first_dev = dev - (dev % devs_in_group); 397 unsigned first_dev = dev - (dev % devs_in_group);
402 unsigned comp = first_comp + (dev - first_dev);
403 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; 398 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
404 unsigned cur_pg = ios->pages_consumed; 399 unsigned cur_pg = ios->pages_consumed;
405 int ret = 0; 400 int ret = 0;
406 401
407 while (length) { 402 while (length) {
408 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; 403 struct exofs_per_dev_state *per_dev = &ios->per_dev[dev];
409 unsigned cur_len, page_off = 0; 404 unsigned cur_len, page_off = 0;
410 405
411 if (!per_dev->length) { 406 if (!per_dev->length) {
@@ -424,11 +419,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
424 cur_len = stripe_unit; 419 cur_len = stripe_unit;
425 } 420 }
426 421
427 if (max_comp < comp) 422 if (max_comp < dev)
428 max_comp = comp; 423 max_comp = dev;
429
430 dev += mirrors_p1;
431 dev = (dev % devs_in_group) + first_dev;
432 } else { 424 } else {
433 cur_len = stripe_unit; 425 cur_len = stripe_unit;
434 } 426 }
@@ -440,8 +432,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
440 if (unlikely(ret)) 432 if (unlikely(ret))
441 goto out; 433 goto out;
442 434
443 comp += mirrors_p1; 435 dev += mirrors_p1;
444 comp = (comp % devs_in_group) + first_comp; 436 dev = (dev % devs_in_group) + first_dev;
445 437
446 length -= cur_len; 438 length -= cur_len;
447 } 439 }
@@ -454,18 +446,15 @@ out:
454static int _prepare_for_striping(struct exofs_io_state *ios) 446static int _prepare_for_striping(struct exofs_io_state *ios)
455{ 447{
456 u64 length = ios->length; 448 u64 length = ios->length;
449 u64 offset = ios->offset;
457 struct _striping_info si; 450 struct _striping_info si;
458 unsigned devs_in_group = ios->layout->group_width *
459 ios->layout->mirrors_p1;
460 unsigned first_comp = 0;
461 int ret = 0; 451 int ret = 0;
462 452
463 _calc_stripe_info(ios, ios->offset, &si);
464
465 if (!ios->pages) { 453 if (!ios->pages) {
466 if (ios->kern_buff) { 454 if (ios->kern_buff) {
467 struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; 455 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
468 456
457 _calc_stripe_info(ios, ios->offset, &si);
469 per_dev->offset = si.obj_offset; 458 per_dev->offset = si.obj_offset;
470 per_dev->dev = si.dev; 459 per_dev->dev = si.dev;
471 460
@@ -479,26 +468,17 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
479 } 468 }
480 469
481 while (length) { 470 while (length) {
471 _calc_stripe_info(ios, offset, &si);
472
482 if (length < si.group_length) 473 if (length < si.group_length)
483 si.group_length = length; 474 si.group_length = length;
484 475
485 ret = _prepare_one_group(ios, si.group_length, &si, first_comp); 476 ret = _prepare_one_group(ios, si.group_length, &si);
486 if (unlikely(ret)) 477 if (unlikely(ret))
487 goto out; 478 goto out;
488 479
480 offset += si.group_length;
489 length -= si.group_length; 481 length -= si.group_length;
490
491 si.group_length = si.total_group_length;
492 si.unit_off = 0;
493 ++si.Major;
494 si.obj_offset = si.Major * ios->layout->stripe_unit *
495 ios->layout->group_depth;
496
497 si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
498 si.dev %= ios->layout->s_numdevs;
499
500 first_comp += devs_in_group;
501 first_comp %= ios->layout->s_numdevs;
502 } 482 }
503 483
504out: 484out:
@@ -599,7 +579,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
599 } else { 579 } else {
600 bio = master_dev->bio; 580 bio = master_dev->bio;
601 /* FIXME: bio_set_dir() */ 581 /* FIXME: bio_set_dir() */
602 bio->bi_rw |= (1 << BIO_RW); 582 bio->bi_rw |= REQ_WRITE;
603 } 583 }
604 584
605 osd_req_write(or, &ios->obj, per_dev->offset, bio, 585 osd_req_write(or, &ios->obj, per_dev->offset, bio,
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 03149b9a5178..047e92fa3af8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -31,7 +31,6 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/smp_lock.h>
35#include <linux/string.h> 34#include <linux/string.h>
36#include <linux/parser.h> 35#include <linux/parser.h>
37#include <linux/vfs.h> 36#include <linux/vfs.h>
@@ -743,7 +742,7 @@ static const struct super_operations exofs_sops = {
743 .alloc_inode = exofs_alloc_inode, 742 .alloc_inode = exofs_alloc_inode,
744 .destroy_inode = exofs_destroy_inode, 743 .destroy_inode = exofs_destroy_inode,
745 .write_inode = exofs_write_inode, 744 .write_inode = exofs_write_inode,
746 .delete_inode = exofs_delete_inode, 745 .evict_inode = exofs_evict_inode,
747 .put_super = exofs_put_super, 746 .put_super = exofs_put_super,
748 .write_super = exofs_write_super, 747 .write_super = exofs_write_super,
749 .sync_fs = exofs_sync_fs, 748 .sync_fs = exofs_sync_fs,
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..2bcc0431bada 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -200,6 +200,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
200 return error; 200 return error;
201 else { 201 else {
202 inode->i_mode = mode; 202 inode->i_mode = mode;
203 inode->i_ctime = CURRENT_TIME_SEC;
203 mark_inode_dirty(inode); 204 mark_inode_dirty(inode);
204 if (error == 0) 205 if (error == 0)
205 acl = NULL; 206 acl = NULL;
@@ -420,7 +421,7 @@ release_and_out:
420 return error; 421 return error;
421} 422}
422 423
423struct xattr_handler ext2_xattr_acl_access_handler = { 424const struct xattr_handler ext2_xattr_acl_access_handler = {
424 .prefix = POSIX_ACL_XATTR_ACCESS, 425 .prefix = POSIX_ACL_XATTR_ACCESS,
425 .flags = ACL_TYPE_ACCESS, 426 .flags = ACL_TYPE_ACCESS,
426 .list = ext2_xattr_list_acl_access, 427 .list = ext2_xattr_list_acl_access,
@@ -428,7 +429,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
428 .set = ext2_xattr_set_acl, 429 .set = ext2_xattr_set_acl,
429}; 430};
430 431
431struct xattr_handler ext2_xattr_acl_default_handler = { 432const struct xattr_handler ext2_xattr_acl_default_handler = {
432 .prefix = POSIX_ACL_XATTR_DEFAULT, 433 .prefix = POSIX_ACL_XATTR_DEFAULT,
433 .flags = ACL_TYPE_DEFAULT, 434 .flags = ACL_TYPE_DEFAULT,
434 .list = ext2_xattr_list_acl_default, 435 .list = ext2_xattr_list_acl_default,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 3cf038c055d7..c6c684b44ea1 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -571,7 +571,7 @@ do_more:
571error_return: 571error_return:
572 brelse(bitmap_bh); 572 brelse(bitmap_bh);
573 release_blocks(sb, freed); 573 release_blocks(sb, freed);
574 dquot_free_block(inode, freed); 574 dquot_free_block_nodirty(inode, freed);
575} 575}
576 576
577/** 577/**
@@ -1332,6 +1332,12 @@ retry_alloc:
1332 1332
1333 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1333 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1334 /* 1334 /*
1335 * skip this group (and avoid loading bitmap) if there
1336 * are no free blocks
1337 */
1338 if (!free_blocks)
1339 continue;
1340 /*
1335 * skip this group if the number of 1341 * skip this group if the number of
1336 * free blocks is less than half of the reservation 1342 * free blocks is less than half of the reservation
1337 * window size. 1343 * window size.
@@ -1412,7 +1418,8 @@ allocated:
1412 1418
1413 *errp = 0; 1419 *errp = 0;
1414 brelse(bitmap_bh); 1420 brelse(bitmap_bh);
1415 dquot_free_block(inode, *count-num); 1421 dquot_free_block_nodirty(inode, *count-num);
1422 mark_inode_dirty(inode);
1416 *count = num; 1423 *count = num;
1417 return ret_block; 1424 return ret_block;
1418 1425
@@ -1422,8 +1429,10 @@ out:
1422 /* 1429 /*
1423 * Undo the block allocation 1430 * Undo the block allocation
1424 */ 1431 */
1425 if (!performed_allocation) 1432 if (!performed_allocation) {
1426 dquot_free_block(inode, *count); 1433 dquot_free_block_nodirty(inode, *count);
1434 mark_inode_dirty(inode);
1435 }
1427 brelse(bitmap_bh); 1436 brelse(bitmap_bh);
1428 return 0; 1437 return 0;
1429} 1438}
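
The balloc.c hunks switch to the _nodirty quota variant so that quota accounting and the caller's own inode update dirty the inode once instead of twice. The pattern in isolation (an illustrative fragment with a hypothetical name, not ext2 code):

        #include <linux/fs.h>
        #include <linux/quotaops.h>

        static void give_back_blocks(struct inode *inode, qsize_t nr)
        {
                dquot_free_block_nodirty(inode, nr); /* adjust quota, defer dirtying */
                mark_inode_dirty(inode);             /* one explicit, batched dirty */
        }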
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7516957273ed..764109886ec0 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -448,6 +448,11 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 	return res;
 }
 
+static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	return __block_write_begin(page, pos, len, ext2_get_block);
+}
+
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 		   struct page *page, struct inode *inode, int update_times)
@@ -458,8 +463,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 	int err;
 
 	lock_page(page);
-	err = __ext2_write_begin(NULL, page->mapping, pos, len,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = ext2_prepare_chunk(page, pos, len);
 	BUG_ON(err);
 	de->inode = cpu_to_le32(inode->i_ino);
 	ext2_set_de_type(de, inode);
@@ -542,8 +546,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 got_it:
 	pos = page_offset(page) +
 		(char*)de - (char*)page_address(page);
-	err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
-							&page, NULL);
+	err = ext2_prepare_chunk(page, pos, rec_len);
 	if (err)
 		goto out_unlock;
 	if (de->inode) {
@@ -576,8 +579,7 @@ out_unlock:
  */
 int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = page->mapping->host;
 	char *kaddr = page_address(page);
 	unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1);
 	unsigned to = ((char *)dir - kaddr) +
@@ -601,8 +603,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
 		from = (char*)pde - (char*)page_address(page);
 		pos = page_offset(page) + from;
 		lock_page(page);
-		err = __ext2_write_begin(NULL, page->mapping, pos, to - from, 0,
-							&page, NULL);
+		err = ext2_prepare_chunk(page, pos, to - from);
 		BUG_ON(err);
 		if (pde)
 			pde->rec_len = ext2_rec_len_to_disk(to - from);
@@ -621,8 +622,7 @@ out:
  */
 int ext2_make_empty(struct inode *inode, struct inode *parent)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
 	unsigned chunk_size = ext2_chunk_size(inode);
 	struct ext2_dir_entry_2 * de;
 	int err;
@@ -631,8 +631,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
 	if (!page)
 		return -ENOMEM;
 
-	err = __ext2_write_begin(NULL, page->mapping, 0, chunk_size, 0,
-							&page, NULL);
+	err = ext2_prepare_chunk(page, 0, chunk_size);
 	if (err) {
 		unlock_page(page);
 		goto fail;
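
ext2_prepare_chunk() replaces the old __ext2_write_begin() wrapper: the directory code already holds a locked page, so it can call __block_write_begin() directly instead of going through the pagecache-lookup write_begin path. All of the callers above follow the same locked-page pattern, roughly (a sketch; ext2_commit_chunk() is the existing ext2 helper that completes the write, not something added by this patch):

	lock_page(page);
	err = ext2_prepare_chunk(page, pos, len);	/* map buffers for [pos, pos+len) */
	BUG_ON(err);		/* page is locked and already in the page cache */
	/* ... edit the directory entry at page_address(page) ... */
	err = ext2_commit_chunk(page, pos, len);	/* dirty buffers, update i_size */
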
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..416daa62242c 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -119,18 +119,14 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 /* inode.c */
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
-extern void ext2_delete_inode (struct inode *);
+extern void ext2_evict_inode(struct inode *);
 extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
 extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len);
-int __ext2_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata);
 
 /* ioctl.c */
 extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
@@ -155,7 +151,7 @@ extern void ext2_write_super (struct super_block *);
 extern const struct file_operations ext2_dir_operations;
 
 /* file.c */
-extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int ext2_fsync(struct file *file, int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ext2_fsync(struct file *file, int datasync)
 {
 	int ret;
-	struct super_block *sb = dentry->d_inode->i_sb;
+	struct super_block *sb = file->f_mapping->host->i_sb;
 	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
 
-	ret = simple_fsync(file, dentry, datasync);
+	ret = generic_file_fsync(file, datasync);
 	if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
 		/* We don't really know where the IO error happened... */
 		ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
 #endif
 
 const struct inode_operations ext2_file_inode_operations = {
-	.truncate	= ext2_truncate,
#ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
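
The ->fsync() prototype dropped its dentry argument in this kernel generation; everything the method needs is reachable from the struct file. The minimal shape of a post-change implementation, using only calls visible in the patch (a sketch; example_fsync is a hypothetical name):

int example_fsync(struct file *file, int datasync)
{
	/* the inode is reached through the file, not through a dentry */
	struct inode *inode = file->f_mapping->host;

	if (inode->i_sb->s_flags & MS_RDONLY)
		return 0;
	/* generic_file_fsync() writes the inode and its dirty buffers */
	return generic_file_fsync(file, datasync);
}
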
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad7d572ee8dc..ad70479aabff 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode)
 	struct super_block * sb = inode->i_sb;
 	int is_directory;
 	unsigned long ino;
-	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bitmap_bh;
 	unsigned long block_group;
 	unsigned long bit;
 	struct ext2_super_block * es;
@@ -118,31 +118,25 @@ void ext2_free_inode (struct inode * inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	if (!is_bad_inode(inode)) {
-		/* Quota is already initialized in iput() */
-		ext2_xattr_delete_inode(inode);
-		dquot_free_inode(inode);
-		dquot_drop(inode);
-	}
+	/* Quota is already initialized in iput() */
+	ext2_xattr_delete_inode(inode);
+	dquot_free_inode(inode);
+	dquot_drop(inode);
 
 	es = EXT2_SB(sb)->s_es;
 	is_directory = S_ISDIR(inode->i_mode);
 
-	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode (inode);
-
 	if (ino < EXT2_FIRST_INO(sb) ||
 	    ino > le32_to_cpu(es->s_inodes_count)) {
 		ext2_error (sb, "ext2_free_inode",
 			    "reserved or nonexistent inode %lu", ino);
-		goto error_return;
+		return;
 	}
 	block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
-	brelse(bitmap_bh);
 	bitmap_bh = read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh)
-		goto error_return;
+		return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
 	if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
@@ -154,7 +148,7 @@ void ext2_free_inode (struct inode * inode)
 	mark_buffer_dirty(bitmap_bh);
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
-error_return:
+
 	brelse(bitmap_bh);
 }
 
@@ -550,16 +544,12 @@ got:
 
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh2);
-	inode->i_uid = current_fsuid();
-	if (test_opt (sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
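
inode_init_owner(), used in the last hunk above, centralizes the uid/gid/mode setup that ext2 and ext3 used to open-code, including setgid-directory inheritance. It behaves roughly like the following sketch (paraphrased from the generic VFS helper of this era, not copied from this patch):

void inode_init_owner(struct inode *inode, const struct inode *dir,
		      mode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;	/* inherit group from parent */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* new subdirectories stay setgid */
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}
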
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fc13cc119aad..940c96168868 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -22,7 +22,6 @@
  *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
  */
 
-#include <linux/smp_lock.h>
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
@@ -55,29 +54,57 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
 		inode->i_blocks - ea_blocks == 0);
 }
 
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
+
+static void ext2_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, to, inode->i_size);
+		ext2_truncate_blocks(inode, inode->i_size);
+	}
+}
+
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext2_delete_inode (struct inode * inode)
+void ext2_evict_inode(struct inode * inode)
 {
-	if (!is_bad_inode(inode))
+	struct ext2_block_alloc_info *rsv;
+	int want_delete = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		want_delete = 1;
 		dquot_initialize(inode);
+	} else {
+		dquot_drop(inode);
+	}
+
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (is_bad_inode(inode))
-		goto no_delete;
+	if (want_delete) {
+		/* set dtime */
 		EXT2_I(inode)->i_dtime	= get_seconds();
 		mark_inode_dirty(inode);
 		__ext2_write_inode(inode, inode_needs_sync(inode));
+		/* truncate to 0 */
+		inode->i_size = 0;
+		if (inode->i_blocks)
+			ext2_truncate_blocks(inode, 0);
+	}
+
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 
-	inode->i_size = 0;
-	if (inode->i_blocks)
-		ext2_truncate (inode);
-	ext2_free_inode (inode);
+	ext2_discard_reservation(inode);
+	rsv = EXT2_I(inode)->i_block_alloc_info;
+	EXT2_I(inode)->i_block_alloc_info = NULL;
+	if (unlikely(rsv))
+		kfree(rsv);
 
-	return;
-no_delete:
-	clear_inode(inode);	/* We must guarantee clearing of inode... */
+	if (want_delete)
+		ext2_free_inode(inode);
 }
 
 typedef struct {
@@ -412,6 +439,8 @@ static int ext2_alloc_blocks(struct inode *inode,
 failed_out:
 	for (i = 0; i <index; i++)
 		ext2_free_blocks(inode, new_blocks[i], 1);
+	if (index)
+		mark_inode_dirty(inode);
 	return ret;
 }
 
@@ -754,21 +783,30 @@ ext2_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
 }
 
-int __ext2_write_begin(struct file *file, struct address_space *mapping,
+static int
+ext2_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext2_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				ext2_get_block);
+	if (ret < 0)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
 }
 
-static int
-ext2_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata)
+static int ext2_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
 {
-	*pagep = NULL;
-	return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata);
+	int ret;
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (ret < len)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
 }
 
 static int
@@ -776,13 +814,13 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
-	/*
-	 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
-	 * directory handling code to pass around offsets rather than struct
-	 * pages in order to make this work easily.
-	 */
-	return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext2_get_block);
+	int ret;
+
+	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
+			       ext2_get_block);
+	if (ret < 0)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
 }
 
 static int ext2_nobh_writepage(struct page *page,
@@ -801,10 +839,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 		loff_t offset, unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, ext2_get_block, NULL);
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				iov, offset, nr_segs, ext2_get_block, NULL);
+	if (ret < 0 && (rw & WRITE))
+		ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
+	return ret;
 }
 
 static int
@@ -819,7 +862,7 @@ const struct address_space_operations ext2_aops = {
 	.writepage		= ext2_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext2_write_begin,
-	.write_end		= generic_write_end,
+	.write_end		= ext2_write_end,
 	.bmap			= ext2_bmap,
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
@@ -968,8 +1011,8 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
 		else if (block_to_free == nr - count)
 			count++;
 		else {
-			mark_inode_dirty(inode);
 			ext2_free_blocks (inode, block_to_free, count);
+			mark_inode_dirty(inode);
 		free_this:
 			block_to_free = nr;
 			count = 1;
@@ -977,8 +1020,8 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
 		}
 	}
 	if (count > 0) {
-		mark_inode_dirty(inode);
 		ext2_free_blocks (inode, block_to_free, count);
+		mark_inode_dirty(inode);
 	}
 }
 
@@ -1028,7 +1071,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
 		ext2_free_data(inode, p, q);
 }
 
-void ext2_truncate(struct inode *inode)
+static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 {
 	__le32 *i_data = EXT2_I(inode)->i_data;
 	struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1040,27 +1083,8 @@ void ext2_truncate(struct inode *inode)
 	int n;
 	long iblock;
 	unsigned blocksize;
-
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-	    S_ISLNK(inode->i_mode)))
-		return;
-	if (ext2_inode_is_fast_symlink(inode))
-		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return;
-
 	blocksize = inode->i_sb->s_blocksize;
-	iblock = (inode->i_size + blocksize-1)
-					>> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
-
-	if (mapping_is_xip(inode->i_mapping))
-		xip_truncate_page(inode->i_mapping, inode->i_size);
-	else if (test_opt(inode->i_sb, NOBH))
-		nobh_truncate_page(inode->i_mapping,
-				inode->i_size, ext2_get_block);
-	else
-		block_truncate_page(inode->i_mapping,
-				inode->i_size, ext2_get_block);
+	iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
 
 	n = ext2_block_to_path(inode, iblock, offsets, NULL);
 	if (n == 0)
@@ -1128,6 +1152,54 @@ do_indirects:
 	ext2_discard_reservation(inode);
 
 	mutex_unlock(&ei->truncate_mutex);
+}
+
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
+{
+	/*
+	 * XXX: it seems like a bug here that we don't allow
+	 * IS_APPEND inode to have blocks-past-i_size trimmed off.
+	 * review and fix this.
+	 *
+	 * Also would be nice to be able to handle IO errors and such,
+	 * but that's probably too much to ask.
+	 */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return;
+	if (ext2_inode_is_fast_symlink(inode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+	__ext2_truncate_blocks(inode, offset);
+}
+
+static int ext2_setsize(struct inode *inode, loff_t newsize)
+{
+	int error;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return -EINVAL;
+	if (ext2_inode_is_fast_symlink(inode))
+		return -EINVAL;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	if (mapping_is_xip(inode->i_mapping))
+		error = xip_truncate_page(inode->i_mapping, newsize);
+	else if (test_opt(inode->i_sb, NOBH))
+		error = nobh_truncate_page(inode->i_mapping,
+					   newsize, ext2_get_block);
+	else
+		error = block_truncate_page(inode->i_mapping,
+					    newsize, ext2_get_block);
+	if (error)
+		return error;
+
+	truncate_setsize(inode, newsize);
+	__ext2_truncate_blocks(inode, newsize);
+
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
 		sync_mapping_buffers(inode->i_mapping);
@@ -1135,6 +1207,8 @@ do_indirects:
 	} else {
 		mark_inode_dirty(inode);
 	}
+
+	return 0;
 }
 
 static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1406,11 +1480,11 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 			/* If this is the first large file
 			 * created, add a flag to the superblock.
 			 */
-			lock_kernel();
+			spin_lock(&EXT2_SB(sb)->s_lock);
 			ext2_update_dynamic_rev(sb);
 			EXT2_SET_RO_COMPAT_FEATURE(sb,
 				EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
-			unlock_kernel();
+			spin_unlock(&EXT2_SB(sb)->s_lock);
 			ext2_write_super(sb);
 		}
 	}
@@ -1467,7 +1541,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		return error;
 
-	if (iattr->ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, iattr))
 		dquot_initialize(inode);
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
@@ -1475,8 +1549,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (error)
 			return error;
 	}
-	error = inode_setattr(inode, iattr);
-	if (!error && (iattr->ia_valid & ATTR_MODE))
+	if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
+		error = ext2_setsize(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+	setattr_copy(inode, iattr);
+	if (iattr->ia_valid & ATTR_MODE)
 		error = ext2_acl_chmod(inode);
+	mark_inode_dirty(inode);
+
 	return error;
 }
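
The inode.c conversion above illustrates the 2.6.36 VFS change that folded .delete_inode and .clear_inode into a single .evict_inode method: it runs for every inode leaving the icache, decides internally whether on-disk deletion is wanted, and must call end_writeback() exactly once after dropping the page cache. A generic skeleton of that contract (a hedged sketch, not ext2's actual code; example_evict_inode is a hypothetical name):

static void example_evict_inode(struct inode *inode)
{
	int want_delete = !inode->i_nlink && !is_bad_inode(inode);

	truncate_inode_pages(&inode->i_data, 0);	/* always drop page cache */
	if (want_delete) {
		/* on-disk deletion work: set dtime, truncate blocks, etc. */
	}
	invalidate_inode_buffers(inode);
	end_writeback(inode);		/* required: detaches the inode from writeback */
	/* old ->clear_inode teardown (reservations, private state) goes here */
	if (want_delete) {
		/* finally release the on-disk inode */
	}
}
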
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 42e4a303b675..1ec602673ea8 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -26,7 +26,6 @@
 #include <linux/random.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
@@ -39,7 +38,7 @@
 #include "xip.h"
 
 static void ext2_sync_super(struct super_block *sb,
-			    struct ext2_super_block *es);
+			    struct ext2_super_block *es, int wait);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function,
 	struct ext2_super_block *es = sbi->s_es;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
+		spin_lock(&sbi->s_lock);
 		sbi->s_mount_state |= EXT2_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 
 	va_start(args, fmt);
@@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix,
 	va_end(args);
 }
 
+/*
+ * This must be called with sbi->s_lock held.
+ */
 void ext2_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
@@ -115,7 +119,7 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-	lock_kernel();
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	if (sb->s_dirt)
 		ext2_write_super(sb);
@@ -124,8 +128,10 @@ static void ext2_put_super (struct super_block * sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
 
+		spin_lock(&sbi->s_lock);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 	db_count = sbi->s_gdb_count;
 	for (i = 0; i < db_count; i++)
@@ -140,8 +146,6 @@ static void ext2_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache * ext2_inode_cachep;
@@ -191,17 +195,6 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ext2_inode_cachep);
 }
 
-static void ext2_clear_inode(struct inode *inode)
-{
-	struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
-
-	dquot_drop(inode);
-	ext2_discard_reservation(inode);
-	EXT2_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
-}
-
 static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct super_block *sb = vfs->mnt_sb;
@@ -209,6 +202,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct ext2_super_block *es = sbi->s_es;
 	unsigned long def_mount_opts;
 
+	spin_lock(&sbi->s_lock);
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
 
 	if (sbi->s_sb_block != 1)
@@ -281,6 +275,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
 
+	spin_unlock(&sbi->s_lock);
 	return 0;
 }
 
@@ -293,13 +288,12 @@ static const struct super_operations ext2_sops = {
 	.alloc_inode	= ext2_alloc_inode,
 	.destroy_inode	= ext2_destroy_inode,
 	.write_inode	= ext2_write_inode,
-	.delete_inode	= ext2_delete_inode,
+	.evict_inode	= ext2_evict_inode,
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
 	.sync_fs	= ext2_sync_fs,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
-	.clear_inode	= ext2_clear_inode,
 	.show_options	= ext2_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext2_quota_read,
@@ -606,7 +600,6 @@ static int ext2_setup_super (struct super_block * sb,
 	if (!le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
-	ext2_write_super(sb);
 	if (test_opt (sb, DEBUG))
 		ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
 			"bpg=%lu, ipg=%lu, mo=%04lx]",
@@ -767,6 +760,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
 
+	spin_lock_init(&sbi->s_lock);
+
 	/*
 	 * See what the current blocksize for the device is, and
 	 * use that as the blocksize.  Otherwise (or if the blocksize
@@ -1058,6 +1053,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &ext2_sops;
 	sb->s_export_op = &ext2_export_ops;
 	sb->s_xattr = ext2_xattr_handlers;
+
+#ifdef CONFIG_QUOTA
+	sb->dq_op = &dquot_operations;
+	sb->s_qcop = &dquot_quotactl_ops;
+#endif
+
 	root = ext2_iget(sb, EXT2_ROOT_INO);
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
@@ -1079,7 +1080,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_msg(sb, KERN_WARNING,
 			"warning: mounting ext3 filesystem as ext2");
-	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+	if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
+		sb->s_flags |= MS_RDONLY;
+	ext2_write_super(sb);
 	return 0;
 
 cantfind_ext2:
@@ -1120,30 +1123,26 @@ static void ext2_clear_super_error(struct super_block *sb)
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
-				"superblock detected", sb->s_id);
+		ext2_msg(sb, KERN_ERR,
+			"previous I/O error to superblock detected\n");
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
 }
 
-static void ext2_commit_super (struct super_block * sb,
-			       struct ext2_super_block * es)
-{
-	ext2_clear_super_error(sb);
-	es->s_wtime = cpu_to_le32(get_seconds());
-	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sb->s_dirt = 0;
-}
-
-static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
+static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+			    int wait)
 {
 	ext2_clear_super_error(sb);
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
+	/* unlock before we do IO */
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
+	if (wait)
+		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
 	sb->s_dirt = 0;
 }
 
@@ -1157,43 +1156,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
  * may have been checked while mounted and e2fsck may have
  * set s_state to EXT2_VALID_FS after some corrections.
  */
-
 static int ext2_sync_fs(struct super_block *sb, int wait)
 {
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
-	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
-
-	lock_kernel();
-	if (buffer_write_io_error(sbh)) {
-		/*
-		 * Oh, dear.  A previous attempt to write the
-		 * superblock failed.  This could happen because the
-		 * USB device was yanked out.  Or it could happen to
-		 * be a transient write error and maybe the block will
-		 * be remapped.  Nothing we can do but to retry the
-		 * write and hope for the best.
-		 */
-		ext2_msg(sb, KERN_ERR,
-			"previous I/O error to superblock detected\n");
-		clear_buffer_write_io_error(sbh);
-		set_buffer_uptodate(sbh);
-	}
 
+	spin_lock(&sbi->s_lock);
 	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
 		ext2_debug("setting valid to 0\n");
 		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-		es->s_free_blocks_count =
-			cpu_to_le32(ext2_count_free_blocks(sb));
-		es->s_free_inodes_count =
-			cpu_to_le32(ext2_count_free_inodes(sb));
-		es->s_mtime = cpu_to_le32(get_seconds());
-		ext2_sync_super(sb, es);
-	} else {
-		ext2_commit_super(sb, es);
 	}
-	sb->s_dirt = 0;
-	unlock_kernel();
-
+	spin_unlock(&sbi->s_lock);
+	ext2_sync_super(sb, es, wait);
 	return 0;
 }
 
@@ -1215,7 +1189,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
-	lock_kernel();
+	spin_lock(&sbi->s_lock);
 
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
@@ -1254,21 +1228,31 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
+		spin_unlock(&sbi->s_lock);
 		return 0;
 	}
 	if (*flags & MS_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
 		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
-			unlock_kernel();
+			spin_unlock(&sbi->s_lock);
 			return 0;
 		}
+
 		/*
 		 * OK, we are remounting a valid rw partition rdonly, so set
 		 * the rdonly flag and then mark the partition as valid again.
 		 */
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
 		es->s_mtime = cpu_to_le32(get_seconds());
+		spin_unlock(&sbi->s_lock);
+
+		err = dquot_suspend(sb, -1);
+		if (err < 0) {
+			spin_lock(&sbi->s_lock);
+			goto restore_opts;
+		}
+
+		ext2_sync_super(sb, es, 1);
 	} else {
 		__le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
 					       ~EXT2_FEATURE_RO_COMPAT_SUPP);
@@ -1288,16 +1272,20 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_state = le16_to_cpu(es->s_state);
 		if (!ext2_setup_super (sb, es, 0))
 			sb->s_flags &= ~MS_RDONLY;
+		spin_unlock(&sbi->s_lock);
+
+		ext2_write_super(sb);
+
+		dquot_resume(sb, -1);
 	}
-	ext2_sync_super(sb, es);
-	unlock_kernel();
+
 	return 0;
restore_opts:
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sb->s_flags = old_sb_flags;
-	unlock_kernel();
+	spin_unlock(&sbi->s_lock);
 	return err;
 }
 
@@ -1308,6 +1296,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct ext2_super_block *es = sbi->s_es;
 	u64 fsid;
 
+	spin_lock(&sbi->s_lock);
+
 	if (test_opt (sb, MINIX_DF))
 		sbi->s_overhead_last = 0;
 	else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
@@ -1362,6 +1352,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 		le64_to_cpup((void *)es->s_uuid + sizeof(u64));
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	spin_unlock(&sbi->s_lock);
 	return 0;
 }
 
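
Throughout super.c the big kernel lock is replaced by a per-superblock spinlock, sbi->s_lock, which serializes updates to the in-core ext2_super_block fields. Because a spinlock cannot be held across buffer I/O, the recurring pattern is update, drop the lock, then write, as in this condensed sketch (illustrative, assembled from the calls in the patch):

	spin_lock(&sbi->s_lock);		/* protects s_es / sbi fields */
	es->s_state = cpu_to_le16(sbi->s_mount_state);
	spin_unlock(&sbi->s_lock);		/* must drop before sleeping I/O */
	mark_buffer_dirty(sbi->s_sbh);
	sync_dirty_buffer(sbi->s_sbh);		/* only when a synchronous write is wanted */
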
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e44dc92609be..8c29ae15129e 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
 
 static struct mb_cache *ext2_xattr_cache;
 
-static struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler *ext2_xattr_handler_map[] = {
 	[EXT2_XATTR_INDEX_USER]		     = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
 	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
#endif
 };
 
-struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler *ext2_xattr_handlers[] = {
 	&ext2_xattr_user_handler,
 	&ext2_xattr_trusted_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext2_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
 		handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
 	/* list the attribute names */
 	for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
 	     entry = EXT2_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext2_xattr_handler(entry->e_name_index);
 
 		if (handler) {
@@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
 		return;
 
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 }
@@ -672,6 +674,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		new_bh = sb_getblk(sb, block);
 		if (!new_bh) {
 			ext2_free_blocks(inode, block, 1);
+			mark_inode_dirty(inode);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -701,8 +704,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		 * written (only some dirty data were not) so we just proceed
 		 * as if nothing happened and cleanup the unused block */
 		if (error && error != -ENOSPC) {
-			if (new_bh && new_bh != old_bh)
-				dquot_free_block(inode, 1);
+			if (new_bh && new_bh != old_bh) {
+				dquot_free_block_nodirty(inode, 1);
+				mark_inode_dirty(inode);
+			}
 			goto cleanup;
 		}
 	} else
@@ -725,6 +730,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			mb_cache_entry_free(ce);
 		ea_bdebug(old_bh, "freeing");
 		ext2_free_blocks(inode, old_bh->b_blocknr, 1);
+		mark_inode_dirty(inode);
 		/* We let our caller release old_bh, so we
 		 * need to duplicate the buffer before. */
 		get_bh(old_bh);
@@ -734,7 +740,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
 		if (ce)
 			mb_cache_entry_release(ce);
-		dquot_free_block(inode, 1);
+		dquot_free_block_nodirty(inode, 1);
+		mark_inode_dirty(inode);
 		mark_buffer_dirty(old_bh);
 		ea_bdebug(old_bh, "refcount now=%d",
 			le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -797,7 +804,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
-		dquot_free_block(inode, 1);
+		dquot_free_block_nodirty(inode, 1);
 	}
 	EXT2_I(inode)->i_file_acl = 0;
 
@@ -836,7 +843,7 @@ ext2_xattr_cache_insert(struct buffer_head *bh)
 	ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
 	if (!ce)
 		return -ENOMEM;
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
 	if (error) {
 		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
@@ -910,8 +917,8 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
-	ce = mb_cache_entry_find_first(ext2_xattr_cache, 0,
-				       inode->i_sb->s_bdev, hash);
+	ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
 	while (ce) {
 		struct buffer_head *bh;
 
@@ -943,7 +950,7 @@ again:
 			unlock_buffer(bh);
 			brelse(bh);
 		}
-		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
 }
@@ -1019,9 +1026,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
int __init
init_ext2_xattr(void)
{
-	ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
-		sizeof(struct mb_cache_entry) +
-		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
 	if (!ext2_xattr_cache)
 		return -ENOMEM;
 	return 0;
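
The mb_cache changes track an API simplification in 2.6.36: a cache now carries exactly one index, so the index count, per-index entry sizing and the index arguments all disappear, and hashes are passed by value rather than by pointer. The before/after call shapes, condensed (a sketch using only calls visible in the patch):

	/* old: mb_cache_create(name, bdev, entry_size, index_count, bucket_bits) */
	cache = mb_cache_create("ext2_xattr", 6);		/* just name + bucket bits */

	error = mb_cache_entry_insert(ce, bdev, block, hash);	/* hash by value, no &hash */
	ce = mb_cache_entry_find_first(cache, bdev, hash);	/* no index argument */
	ce = mb_cache_entry_find_next(ce, bdev, hash);
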
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
 
# ifdef CONFIG_EXT2_FS_XATTR
 
-extern struct xattr_handler ext2_xattr_user_handler;
-extern struct xattr_handler ext2_xattr_trusted_handler;
-extern struct xattr_handler ext2_xattr_acl_access_handler;
-extern struct xattr_handler ext2_xattr_acl_default_handler;
-extern struct xattr_handler ext2_xattr_security_handler;
+extern const struct xattr_handler ext2_xattr_user_handler;
+extern const struct xattr_handler ext2_xattr_trusted_handler;
+extern const struct xattr_handler ext2_xattr_acl_access_handler;
+extern const struct xattr_handler ext2_xattr_acl_default_handler;
+extern const struct xattr_handler ext2_xattr_security_handler;
 
 extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
 
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
 extern int init_ext2_xattr(void);
 extern void exit_ext2_xattr(void);
 
-extern struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler *ext2_xattr_handlers[];
 
# else  /* CONFIG_EXT2_FS_XATTR */
 
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index b118c6383c6d..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext2_xattr_security_handler = {
+const struct xattr_handler ext2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext2_xattr_security_list,
 	.get	= ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_trusted_handler = {
+const struct xattr_handler ext2_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ext2_xattr_trusted_list,
 	.get	= ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_user_handler = {
+const struct xattr_handler ext2_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ext2_xattr_user_list,
 	.get	= ext2_xattr_user_get,
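
All five ext2 handler definitions, the handler map and the ext2_xattr_handlers[] table gain const in the hunks above because the VFS made the xattr handler pointers hung off sb->s_xattr const. The resulting end state, condensed (a sketch; example_handler and example_handlers are hypothetical names):

static const struct xattr_handler example_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= ext2_xattr_user_list,
	.get	= ext2_xattr_user_get,
	.set	= ext2_xattr_user_set,
};

/* NULL-terminated table wired into sb->s_xattr at mount time */
static const struct xattr_handler *example_handlers[] = {
	&example_handler,
	NULL,
};
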
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 522b15498f45..e8c6ba0e4a3e 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -31,6 +31,7 @@ config EXT3_FS
config EXT3_DEFAULTS_TO_ORDERED
 	bool "Default to 'data=ordered' in ext3"
 	depends on EXT3_FS
+	default y
 	help
 	  The journal mode options for ext3 have different tradeoffs
 	  between when data is guaranteed to be on disk and
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..8a11fe212183 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -205,6 +205,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 			return error;
 		else {
 			inode->i_mode = mode;
+			inode->i_ctime = CURRENT_TIME_SEC;
 			ext3_mark_inode_dirty(handle, inode);
 			if (error == 0)
 				acl = NULL;
@@ -456,7 +457,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext3_xattr_acl_access_handler = {
+const struct xattr_handler ext3_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext3_xattr_list_acl_access,
@@ -464,7 +465,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
 	.set	= ext3_xattr_set_acl,
 };
 
-struct xattr_handler ext3_xattr_acl_default_handler = {
+const struct xattr_handler ext3_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext3_xattr_list_acl_default,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a177122a1b25..4a32511f4ded 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1584,6 +1584,12 @@ retry_alloc:
 			goto io_error;
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
 		 * skip this group if the number of
 		 * free blocks is less than half of the reservation
 		 * window size.
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 			kfree (old);
 		}
 		if (!parent)
-			root->rb_node = NULL;
+			*root = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,12 +43,12 @@
43 * inode to disk. 43 * inode to disk.
44 */ 44 */
45 45
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file *file, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = file->f_mapping->host;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret = 0; 51 int ret, needs_barrier = 0;
52 tid_t commit_tid; 52 tid_t commit_tid;
53 53
54 if (inode->i_sb->s_flags & MS_RDONLY) 54 if (inode->i_sb->s_flags & MS_RDONLY)
@@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
70 * (they were dirtied by commit). But that's OK - the blocks are 70 * (they were dirtied by commit). But that's OK - the blocks are
71 * safe in-journal, which is all fsync() needs to ensure. 71 * safe in-journal, which is all fsync() needs to ensure.
72 */ 72 */
73 if (ext3_should_journal_data(inode)) { 73 if (ext3_should_journal_data(inode))
74 ret = ext3_force_commit(inode->i_sb); 74 return ext3_force_commit(inode->i_sb);
75 goto out;
76 }
77 75
78 if (datasync) 76 if (datasync)
79 commit_tid = atomic_read(&ei->i_datasync_tid); 77 commit_tid = atomic_read(&ei->i_datasync_tid);
80 else 78 else
81 commit_tid = atomic_read(&ei->i_sync_tid); 79 commit_tid = atomic_read(&ei->i_sync_tid);
82 80
83 if (log_start_commit(journal, commit_tid)) { 81 if (test_opt(inode->i_sb, BARRIER) &&
84 log_wait_commit(journal, commit_tid); 82 !journal_trans_will_send_data_barrier(journal, commit_tid))
85 goto out; 83 needs_barrier = 1;
86 } 84 log_start_commit(journal, commit_tid);
85 ret = log_wait_commit(journal, commit_tid);
87 86
88 /* 87 /*
89 * In case we didn't commit a transaction, we have to flush 88 * In case we didn't commit a transaction, we have to flush
90 * disk caches manually so that data really is on persistent 89 * disk caches manually so that data really is on persistent
91 * storage 90 * storage
92 */ 91 */
93 if (test_opt(inode->i_sb, BARRIER)) 92 if (needs_barrier)
94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
95out: 94 BLKDEV_IFL_WAIT);
96 return ret; 95 return ret;
97} 96}
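Condensing the rewritten ext3_sync_file() above: the explicit cache flush is now issued only when a barrier-enabled mount cannot rely on the journal commit to flush the drive cache for it. Schematically, using the same calls as the hunk with comments added:

        tid_t tid = datasync ? atomic_read(&ei->i_datasync_tid)
                             : atomic_read(&ei->i_sync_tid);
        int needs_barrier = 0;

        /* If the commit we are about to wait on will itself issue a
         * barrier, a second explicit cache flush would be redundant. */
        if (test_opt(inode->i_sb, BARRIER) &&
            !journal_trans_will_send_data_barrier(journal, tid))
                needs_barrier = 1;

        log_start_commit(journal, tid);
        ret = log_wait_commit(journal, tid);
        if (needs_barrier)
                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
                                   BLKDEV_IFL_WAIT);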
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 0d0e97ed3ff6..4ab72db3559e 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -119,20 +119,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
119 ino = inode->i_ino; 119 ino = inode->i_ino;
120 ext3_debug ("freeing inode %lu\n", ino); 120 ext3_debug ("freeing inode %lu\n", ino);
121 121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 dquot_initialize(inode);
127 ext3_xattr_delete_inode(handle, inode);
128 dquot_free_inode(inode);
129 dquot_drop(inode);
130
131 is_directory = S_ISDIR(inode->i_mode); 122 is_directory = S_ISDIR(inode->i_mode);
132 123
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT3_SB(sb)->s_es; 124 es = EXT3_SB(sb)->s_es;
137 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 125 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext3_error (sb, "ext3_free_inode", 126 ext3_error (sb, "ext3_free_inode",
@@ -538,16 +526,13 @@ got:
538 if (S_ISDIR(mode)) 526 if (S_ISDIR(mode))
539 percpu_counter_inc(&sbi->s_dirs_counter); 527 percpu_counter_inc(&sbi->s_dirs_counter);
540 528
541 inode->i_uid = current_fsuid(); 529
542 if (test_opt (sb, GRPID)) 530 if (test_opt(sb, GRPID)) {
543 inode->i_gid = dir->i_gid; 531 inode->i_mode = mode;
544 else if (dir->i_mode & S_ISGID) { 532 inode->i_uid = current_fsuid();
545 inode->i_gid = dir->i_gid; 533 inode->i_gid = dir->i_gid;
546 if (S_ISDIR(mode))
547 mode |= S_ISGID;
548 } else 534 } else
549 inode->i_gid = current_fsgid(); 535 inode_init_owner(inode, dir, mode);
550 inode->i_mode = mode;
551 536
552 inode->i_ino = ino; 537 inode->i_ino = ino;
553 /* This is the optimal IO size (for stat), not the fs block size */ 538 /* This is the optimal IO size (for stat), not the fs block size */
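The ialloc hunk above replaces the open-coded uid/gid/setgid logic with the VFS helper, keeping only the GRPID special case in the filesystem. For reference, inode_init_owner() as introduced around 2.6.35 looks roughly like this (a sketch from memory, not the authoritative source):

        void inode_init_owner(struct inode *inode, const struct inode *dir,
                              mode_t mode)
        {
                inode->i_uid = current_fsuid();
                if (dir && dir->i_mode & S_ISGID) {
                        inode->i_gid = dir->i_gid;
                        /* directories always inherit S_ISGID */
                        if (S_ISDIR(mode))
                                mode |= S_ISGID;
                } else
                        inode->i_gid = current_fsgid();
                inode->i_mode = mode;
        }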
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ea33bdf0a300..5e0faf4cda79 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -190,18 +190,28 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
190} 190}
191 191
192/* 192/*
193 * Called at the last iput() if i_nlink is zero. 193 * Called at inode eviction from icache
194 */ 194 */
195void ext3_delete_inode (struct inode * inode) 195void ext3_evict_inode (struct inode *inode)
196{ 196{
197 struct ext3_block_alloc_info *rsv;
197 handle_t *handle; 198 handle_t *handle;
199 int want_delete = 0;
198 200
199 if (!is_bad_inode(inode)) 201 if (!inode->i_nlink && !is_bad_inode(inode)) {
200 dquot_initialize(inode); 202 dquot_initialize(inode);
203 want_delete = 1;
204 }
201 205
202 truncate_inode_pages(&inode->i_data, 0); 206 truncate_inode_pages(&inode->i_data, 0);
203 207
204 if (is_bad_inode(inode)) 208 ext3_discard_reservation(inode);
209 rsv = EXT3_I(inode)->i_block_alloc_info;
210 EXT3_I(inode)->i_block_alloc_info = NULL;
211 if (unlikely(rsv))
212 kfree(rsv);
213
214 if (!want_delete)
205 goto no_delete; 215 goto no_delete;
206 216
207 handle = start_transaction(inode); 217 handle = start_transaction(inode);
@@ -238,15 +248,22 @@ void ext3_delete_inode (struct inode * inode)
238 * having errors), but we can't free the inode if the mark_dirty 248 * having errors), but we can't free the inode if the mark_dirty
239 * fails. 249 * fails.
240 */ 250 */
241 if (ext3_mark_inode_dirty(handle, inode)) 251 if (ext3_mark_inode_dirty(handle, inode)) {
242 /* If that failed, just do the required in-core inode clear. */ 252 /* If that failed, just dquot_drop() and be done with that */
243 clear_inode(inode); 253 dquot_drop(inode);
244 else 254 end_writeback(inode);
255 } else {
256 ext3_xattr_delete_inode(handle, inode);
257 dquot_free_inode(inode);
258 dquot_drop(inode);
259 end_writeback(inode);
245 ext3_free_inode(handle, inode); 260 ext3_free_inode(handle, inode);
261 }
246 ext3_journal_stop(handle); 262 ext3_journal_stop(handle);
247 return; 263 return;
248no_delete: 264no_delete:
249 clear_inode(inode); /* We must guarantee clearing of inode... */ 265 end_writeback(inode);
266 dquot_drop(inode);
250} 267}
251 268
252typedef struct { 269typedef struct {
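The conversion above follows the 2.6.36 ->evict_inode() contract: decide up front whether the inode is actually being deleted, tear down per-inode state unconditionally (the old ->clear_inode() work), and call end_writeback() exactly once on every path. A hedged skeleton of that contract, with illustrative names rather than the full ext3 logic:

        static void example_evict_inode(struct inode *inode)
        {
                int want_delete = !inode->i_nlink && !is_bad_inode(inode);

                truncate_inode_pages(&inode->i_data, 0);
                /* release fs-private per-inode state here, delete or not */

                if (want_delete) {
                        /* free the on-disk inode under a transaction,
                         * calling end_writeback() before the final free */
                        end_writeback(inode);
                } else {
                        end_writeback(inode);   /* replaces clear_inode() */
                        dquot_drop(inode);
                }
        }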
@@ -1149,9 +1166,25 @@ static int walk_page_buffers( handle_t *handle,
1149static int do_journal_get_write_access(handle_t *handle, 1166static int do_journal_get_write_access(handle_t *handle,
1150 struct buffer_head *bh) 1167 struct buffer_head *bh)
1151{ 1168{
1169 int dirty = buffer_dirty(bh);
1170 int ret;
1171
1152 if (!buffer_mapped(bh) || buffer_freed(bh)) 1172 if (!buffer_mapped(bh) || buffer_freed(bh))
1153 return 0; 1173 return 0;
1154 return ext3_journal_get_write_access(handle, bh); 1174 /*
1175 * __block_prepare_write() could have dirtied some buffers. Clean
 1176 * the dirty bit as journal_get_write_access() could complain
1177 * otherwise about fs integrity issues. Setting of the dirty bit
1178 * by __block_prepare_write() isn't a real problem here as we clear
1179 * the bit before releasing a page lock and thus writeback cannot
1180 * ever write the buffer.
1181 */
1182 if (dirty)
1183 clear_buffer_dirty(bh);
1184 ret = ext3_journal_get_write_access(handle, bh);
1185 if (!ret && dirty)
1186 ret = ext3_journal_dirty_metadata(handle, bh);
1187 return ret;
1155} 1188}
1156 1189
1157/* 1190/*
@@ -1196,8 +1229,7 @@ retry:
1196 ret = PTR_ERR(handle); 1229 ret = PTR_ERR(handle);
1197 goto out; 1230 goto out;
1198 } 1231 }
1199 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1232 ret = __block_write_begin(page, pos, len, ext3_get_block);
1200 ext3_get_block);
1201 if (ret) 1233 if (ret)
1202 goto write_begin_failed; 1234 goto write_begin_failed;
1203 1235
@@ -1625,10 +1657,7 @@ static int ext3_writeback_writepage(struct page *page,
1625 goto out_fail; 1657 goto out_fail;
1626 } 1658 }
1627 1659
1628 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) 1660 ret = block_write_full_page(page, ext3_get_block, wbc);
1629 ret = nobh_writepage(page, ext3_get_block, wbc);
1630 else
1631 ret = block_write_full_page(page, ext3_get_block, wbc);
1632 1661
1633 err = ext3_journal_stop(handle); 1662 err = ext3_journal_stop(handle);
1634 if (!ret) 1663 if (!ret)
@@ -1785,6 +1814,17 @@ retry:
1785 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1814 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1786 offset, nr_segs, 1815 offset, nr_segs,
1787 ext3_get_block, NULL); 1816 ext3_get_block, NULL);
1817 /*
 1818 * In case of error, an extending write may have instantiated a few
1819 * blocks outside i_size. Trim these off again.
1820 */
1821 if (unlikely((rw & WRITE) && ret < 0)) {
1822 loff_t isize = i_size_read(inode);
1823 loff_t end = offset + iov_length(iov, nr_segs);
1824
1825 if (end > isize)
1826 vmtruncate(inode, isize);
1827 }
1788 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1828 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1789 goto retry; 1829 goto retry;
1790 1830
@@ -1922,17 +1962,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1922 length = blocksize - (offset & (blocksize - 1)); 1962 length = blocksize - (offset & (blocksize - 1));
1923 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1963 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1924 1964
1925 /*
1926 * For "nobh" option, we can only work if we don't need to
1927 * read-in the page - otherwise we create buffers to do the IO.
1928 */
1929 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1930 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1931 zero_user(page, offset, length);
1932 set_page_dirty(page);
1933 goto unlock;
1934 }
1935
1936 if (!page_has_buffers(page)) 1965 if (!page_has_buffers(page))
1937 create_empty_buffers(page, blocksize, 0); 1966 create_empty_buffers(page, blocksize, 0);
1938 1967
@@ -2284,27 +2313,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2284 depth); 2313 depth);
2285 2314
2286 /* 2315 /*
2287 * We've probably journalled the indirect block several
2288 * times during the truncate. But it's no longer
2289 * needed and we now drop it from the transaction via
2290 * journal_revoke().
2291 *
2292 * That's easy if it's exclusively part of this
2293 * transaction. But if it's part of the committing
2294 * transaction then journal_forget() will simply
2295 * brelse() it. That means that if the underlying
2296 * block is reallocated in ext3_get_block(),
2297 * unmap_underlying_metadata() will find this block
2298 * and will try to get rid of it. damn, damn.
2299 *
2300 * If this block has already been committed to the
2301 * journal, a revoke record will be written. And
2302 * revoke records must be emitted *before* clearing
2303 * this block's bit in the bitmaps.
2304 */
2305 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2306
2307 /*
 2308 * Everything below this pointer has been 2316 * Everything below this pointer has been
2309 * released. Now let this top-of-subtree go. 2317 * released. Now let this top-of-subtree go.
2310 * 2318 *
@@ -2327,6 +2335,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2327 truncate_restart_transaction(handle, inode); 2335 truncate_restart_transaction(handle, inode);
2328 } 2336 }
2329 2337
2338 /*
2339 * We've probably journalled the indirect block several
2340 * times during the truncate. But it's no longer
2341 * needed and we now drop it from the transaction via
2342 * journal_revoke().
2343 *
2344 * That's easy if it's exclusively part of this
2345 * transaction. But if it's part of the committing
2346 * transaction then journal_forget() will simply
2347 * brelse() it. That means that if the underlying
2348 * block is reallocated in ext3_get_block(),
2349 * unmap_underlying_metadata() will find this block
2350 * and will try to get rid of it. damn, damn. Thus
2351 * we don't allow a block to be reallocated until
2352 * a transaction freeing it has fully committed.
2353 *
2354 * We also have to make sure journal replay after a
2355 * crash does not overwrite non-journaled data blocks
2356 * with old metadata when the block got reallocated for
2357 * data. Thus we have to store a revoke record for a
2358 * block in the same transaction in which we free the
2359 * block.
2360 */
2361 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2362
2330 ext3_free_blocks(handle, inode, nr, 1); 2363 ext3_free_blocks(handle, inode, nr, 1);
2331 2364
2332 if (parent_bh) { 2365 if (parent_bh) {
@@ -2554,7 +2587,7 @@ out_stop:
2554 * If this was a simple ftruncate(), and the file will remain alive 2587 * If this was a simple ftruncate(), and the file will remain alive
2555 * then we need to clear up the orphan record which we created above. 2588 * then we need to clear up the orphan record which we created above.
2556 * However, if this was a real unlink then we were called by 2589 * However, if this was a real unlink then we were called by
2557 * ext3_delete_inode(), and we allow that function to clean up the 2590 * ext3_evict_inode(), and we allow that function to clean up the
2558 * orphan info for us. 2591 * orphan info for us.
2559 */ 2592 */
2560 if (inode->i_nlink) 2593 if (inode->i_nlink)
@@ -3151,7 +3184,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3151 if (error) 3184 if (error)
3152 return error; 3185 return error;
3153 3186
3154 if (ia_valid & ATTR_SIZE) 3187 if (is_quota_modification(inode, attr))
3155 dquot_initialize(inode); 3188 dquot_initialize(inode);
3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3189 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3190 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
@@ -3198,9 +3231,17 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3198 ext3_journal_stop(handle); 3231 ext3_journal_stop(handle);
3199 } 3232 }
3200 3233
3201 rc = inode_setattr(inode, attr); 3234 if ((attr->ia_valid & ATTR_SIZE) &&
3235 attr->ia_size != i_size_read(inode)) {
3236 rc = vmtruncate(inode, attr->ia_size);
3237 if (rc)
3238 goto err_out;
3239 }
3240
3241 setattr_copy(inode, attr);
3242 mark_inode_dirty(inode);
3202 3243
3203 if (!rc && (ia_valid & ATTR_MODE)) 3244 if (ia_valid & ATTR_MODE)
3204 rc = ext3_acl_chmod(inode); 3245 rc = ext3_acl_chmod(inode);
3205 3246
3206err_out: 3247err_out:
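The setattr hunk above open-codes what the removed inode_setattr() used to do: truncate first if the size changes, then copy the remaining attributes and mark the inode dirty. For reference, setattr_copy() circa 2.6.36 is roughly the following (abridged sketch, not the authoritative source):

        void setattr_copy(struct inode *inode, const struct iattr *attr)
        {
                unsigned int ia_valid = attr->ia_valid;

                if (ia_valid & ATTR_UID)
                        inode->i_uid = attr->ia_uid;
                if (ia_valid & ATTR_GID)
                        inode->i_gid = attr->ia_gid;
                if (ia_valid & ATTR_ATIME)
                        inode->i_atime = timespec_trunc(attr->ia_atime,
                                                inode->i_sb->s_time_gran);
                /* ATTR_MTIME and ATTR_CTIME are handled the same way */
                if (ia_valid & ATTR_MODE) {
                        umode_t mode = attr->ia_mode;

                        /* drop setgid unless the caller may keep it */
                        if (!in_group_p(inode->i_gid) &&
                            !capable(CAP_FSETID))
                                mode &= ~S_ISGID;
                        inode->i_mode = mode;
                }
        }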
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index ee184084ca42..2b35ddb70d65 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
1449 struct inode *dir = dentry->d_parent->d_inode; 1449 struct inode *dir = dentry->d_parent->d_inode;
1450 unsigned long offset;
1451 struct buffer_head * bh; 1450 struct buffer_head * bh;
1452 struct ext3_dir_entry_2 *de; 1451 struct ext3_dir_entry_2 *de;
1453 struct super_block * sb; 1452 struct super_block * sb;
@@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1469 ext3_mark_inode_dirty(handle, dir); 1468 ext3_mark_inode_dirty(handle, dir);
1470 } 1469 }
1471 blocks = dir->i_size >> sb->s_blocksize_bits; 1470 blocks = dir->i_size >> sb->s_blocksize_bits;
1472 for (block = 0, offset = 0; block < blocks; block++) { 1471 for (block = 0; block < blocks; block++) {
1473 bh = ext3_bread(handle, dir, block, 0, &retval); 1472 bh = ext3_bread(handle, dir, block, 0, &retval);
1474 if(!bh) 1473 if(!bh)
1475 return retval; 1474 return retval;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 54351ac7cef9..0ccd7b12b73c 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
964 ext3_fsblk_t n_blocks_count) 964 ext3_fsblk_t n_blocks_count)
965{ 965{
966 ext3_fsblk_t o_blocks_count; 966 ext3_fsblk_t o_blocks_count;
967 unsigned long o_groups_count;
968 ext3_grpblk_t last; 967 ext3_grpblk_t last;
969 ext3_grpblk_t add; 968 ext3_grpblk_t add;
970 struct buffer_head * bh; 969 struct buffer_head * bh;
@@ -976,7 +975,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
976 * yet: we're going to revalidate es->s_blocks_count after 975 * yet: we're going to revalidate es->s_blocks_count after
977 * taking the s_resize_lock below. */ 976 * taking the s_resize_lock below. */
978 o_blocks_count = le32_to_cpu(es->s_blocks_count); 977 o_blocks_count = le32_to_cpu(es->s_blocks_count);
979 o_groups_count = EXT3_SB(sb)->s_groups_count;
980 978
981 if (test_opt(sb, DEBUG)) 979 if (test_opt(sb, DEBUG))
 982 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" up to "E3FSBLK" blocks\n", 980 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" up to "E3FSBLK" blocks\n",
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1bee604cc6cd..5dbf4dba03c4 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
410 struct ext3_super_block *es = sbi->s_es; 410 struct ext3_super_block *es = sbi->s_es;
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
413 lock_kernel(); 415 lock_kernel();
414 416
415 ext3_xattr_put_super(sb); 417 ext3_xattr_put_super(sb);
@@ -525,17 +527,6 @@ static void destroy_inodecache(void)
525 kmem_cache_destroy(ext3_inode_cachep); 527 kmem_cache_destroy(ext3_inode_cachep);
526} 528}
527 529
528static void ext3_clear_inode(struct inode *inode)
529{
530 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
531
532 dquot_drop(inode);
533 ext3_discard_reservation(inode);
534 EXT3_I(inode)->i_block_alloc_info = NULL;
535 if (unlikely(rsv))
536 kfree(rsv);
537}
538
539static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) 530static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
540{ 531{
541#if defined(CONFIG_QUOTA) 532#if defined(CONFIG_QUOTA)
@@ -653,11 +644,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
653 seq_printf(seq, ",commit=%u", 644 seq_printf(seq, ",commit=%u",
654 (unsigned) (sbi->s_commit_interval / HZ)); 645 (unsigned) (sbi->s_commit_interval / HZ));
655 } 646 }
656 if (test_opt(sb, BARRIER))
657 seq_puts(seq, ",barrier=1");
658 if (test_opt(sb, NOBH))
659 seq_puts(seq, ",nobh");
660 647
648 /*
649 * Always display barrier state so it's clear what the status is.
650 */
651 seq_puts(seq, ",barrier=");
652 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
661 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); 653 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
662 if (test_opt(sb, DATA_ERR_ABORT)) 654 if (test_opt(sb, DATA_ERR_ABORT))
663 seq_puts(seq, ",data_err=abort"); 655 seq_puts(seq, ",data_err=abort");
@@ -744,7 +736,7 @@ static int ext3_release_dquot(struct dquot *dquot);
744static int ext3_mark_dquot_dirty(struct dquot *dquot); 736static int ext3_mark_dquot_dirty(struct dquot *dquot);
745static int ext3_write_info(struct super_block *sb, int type); 737static int ext3_write_info(struct super_block *sb, int type);
746static int ext3_quota_on(struct super_block *sb, int type, int format_id, 738static int ext3_quota_on(struct super_block *sb, int type, int format_id,
747 char *path, int remount); 739 char *path);
748static int ext3_quota_on_mount(struct super_block *sb, int type); 740static int ext3_quota_on_mount(struct super_block *sb, int type);
749static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 741static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
750 size_t len, loff_t off); 742 size_t len, loff_t off);
@@ -763,12 +755,12 @@ static const struct dquot_operations ext3_quota_operations = {
763 755
764static const struct quotactl_ops ext3_qctl_operations = { 756static const struct quotactl_ops ext3_qctl_operations = {
765 .quota_on = ext3_quota_on, 757 .quota_on = ext3_quota_on,
766 .quota_off = vfs_quota_off, 758 .quota_off = dquot_quota_off,
767 .quota_sync = vfs_quota_sync, 759 .quota_sync = dquot_quota_sync,
768 .get_info = vfs_get_dqinfo, 760 .get_info = dquot_get_dqinfo,
769 .set_info = vfs_set_dqinfo, 761 .set_info = dquot_set_dqinfo,
770 .get_dqblk = vfs_get_dqblk, 762 .get_dqblk = dquot_get_dqblk,
771 .set_dqblk = vfs_set_dqblk 763 .set_dqblk = dquot_set_dqblk
772}; 764};
773#endif 765#endif
774 766
@@ -777,14 +769,13 @@ static const struct super_operations ext3_sops = {
777 .destroy_inode = ext3_destroy_inode, 769 .destroy_inode = ext3_destroy_inode,
778 .write_inode = ext3_write_inode, 770 .write_inode = ext3_write_inode,
779 .dirty_inode = ext3_dirty_inode, 771 .dirty_inode = ext3_dirty_inode,
780 .delete_inode = ext3_delete_inode, 772 .evict_inode = ext3_evict_inode,
781 .put_super = ext3_put_super, 773 .put_super = ext3_put_super,
782 .sync_fs = ext3_sync_fs, 774 .sync_fs = ext3_sync_fs,
783 .freeze_fs = ext3_freeze, 775 .freeze_fs = ext3_freeze,
784 .unfreeze_fs = ext3_unfreeze, 776 .unfreeze_fs = ext3_unfreeze,
785 .statfs = ext3_statfs, 777 .statfs = ext3_statfs,
786 .remount_fs = ext3_remount, 778 .remount_fs = ext3_remount,
787 .clear_inode = ext3_clear_inode,
788 .show_options = ext3_show_options, 779 .show_options = ext3_show_options,
789#ifdef CONFIG_QUOTA 780#ifdef CONFIG_QUOTA
790 .quota_read = ext3_quota_read, 781 .quota_read = ext3_quota_read,
@@ -810,8 +801,8 @@ enum {
810 Opt_data_err_abort, Opt_data_err_ignore, 801 Opt_data_err_abort, Opt_data_err_ignore,
811 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 802 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
812 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 803 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
813 Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, 804 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
814 Opt_usrquota, Opt_grpquota 805 Opt_resize, Opt_usrquota, Opt_grpquota
815}; 806};
816 807
817static const match_table_t tokens = { 808static const match_table_t tokens = {
@@ -865,6 +856,8 @@ static const match_table_t tokens = {
865 {Opt_quota, "quota"}, 856 {Opt_quota, "quota"},
866 {Opt_usrquota, "usrquota"}, 857 {Opt_usrquota, "usrquota"},
867 {Opt_barrier, "barrier=%u"}, 858 {Opt_barrier, "barrier=%u"},
859 {Opt_barrier, "barrier"},
860 {Opt_nobarrier, "nobarrier"},
868 {Opt_resize, "resize"}, 861 {Opt_resize, "resize"},
869 {Opt_err, NULL}, 862 {Opt_err, NULL},
870}; 863};
@@ -967,7 +960,11 @@ static int parse_options (char *options, struct super_block *sb,
967 int token; 960 int token;
968 if (!*p) 961 if (!*p)
969 continue; 962 continue;
970 963 /*
964 * Initialize args struct so we know whether arg was
965 * found; some options take optional arguments.
966 */
967 args[0].to = args[0].from = 0;
971 token = match_token(p, tokens, args); 968 token = match_token(p, tokens, args);
972 switch (token) { 969 switch (token) {
973 case Opt_bsd_df: 970 case Opt_bsd_df:
@@ -1215,9 +1212,15 @@ set_qf_format:
1215 case Opt_abort: 1212 case Opt_abort:
1216 set_opt(sbi->s_mount_opt, ABORT); 1213 set_opt(sbi->s_mount_opt, ABORT);
1217 break; 1214 break;
1215 case Opt_nobarrier:
1216 clear_opt(sbi->s_mount_opt, BARRIER);
1217 break;
1218 case Opt_barrier: 1218 case Opt_barrier:
1219 if (match_int(&args[0], &option)) 1219 if (args[0].from) {
1220 return 0; 1220 if (match_int(&args[0], &option))
1221 return 0;
1222 } else
1223 option = 1; /* No argument, default to 1 */
1221 if (option) 1224 if (option)
1222 set_opt(sbi->s_mount_opt, BARRIER); 1225 set_opt(sbi->s_mount_opt, BARRIER);
1223 else 1226 else
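Two details make the optional barrier argument in the parse_options() hunks above work: args[0] is reset before match_token() so a bare "barrier" can be told apart from "barrier=%u", and the bare form defaults to on. The pattern in isolation (a sketch using the <linux/parser.h> types; p and sbi are assumed from the surrounding parser loop):

        substring_t args[MAX_OPT_ARGS];
        int token, option;

        args[0].to = args[0].from = NULL;       /* mark "no argument seen" */
        token = match_token(p, tokens, args);
        if (token == Opt_barrier) {
                if (args[0].from) {             /* matched "barrier=%u" */
                        if (match_int(&args[0], &option))
                                return 0;       /* malformed number */
                } else {                        /* matched bare "barrier" */
                        option = 1;
                }
                if (option)
                        set_opt(sbi->s_mount_opt, BARRIER);
                else
                        clear_opt(sbi->s_mount_opt, BARRIER);
        }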
@@ -1237,10 +1240,12 @@ set_qf_format:
1237 *n_blocks_count = option; 1240 *n_blocks_count = option;
1238 break; 1241 break;
1239 case Opt_nobh: 1242 case Opt_nobh:
1240 set_opt(sbi->s_mount_opt, NOBH); 1243 ext3_msg(sb, KERN_WARNING,
1244 "warning: ignoring deprecated nobh option");
1241 break; 1245 break;
1242 case Opt_bh: 1246 case Opt_bh:
1243 clear_opt(sbi->s_mount_opt, NOBH); 1247 ext3_msg(sb, KERN_WARNING,
1248 "warning: ignoring deprecated bh option");
1244 break; 1249 break;
1245 default: 1250 default:
1246 ext3_msg(sb, KERN_ERR, 1251 ext3_msg(sb, KERN_ERR,
@@ -1511,7 +1516,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1511 /* Turn quotas off */ 1516 /* Turn quotas off */
1512 for (i = 0; i < MAXQUOTAS; i++) { 1517 for (i = 0; i < MAXQUOTAS; i++) {
1513 if (sb_dqopt(sb)->files[i]) 1518 if (sb_dqopt(sb)->files[i])
1514 vfs_quota_off(sb, i, 0); 1519 dquot_quota_off(sb, i);
1515 } 1520 }
1516#endif 1521#endif
1517 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1522 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -1890,21 +1895,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1890 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 1895 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1891 spin_lock_init(&sbi->s_next_gen_lock); 1896 spin_lock_init(&sbi->s_next_gen_lock);
1892 1897
1893 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1894 ext3_count_free_blocks(sb));
1895 if (!err) {
1896 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1897 ext3_count_free_inodes(sb));
1898 }
1899 if (!err) {
1900 err = percpu_counter_init(&sbi->s_dirs_counter,
1901 ext3_count_dirs(sb));
1902 }
1903 if (err) {
1904 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1905 goto failed_mount3;
1906 }
1907
 1908 /* per filesystem reservation list head & lock */ 1898 /* per filesystem reservation list head & lock */
1909 spin_lock_init(&sbi->s_rsv_window_lock); 1899 spin_lock_init(&sbi->s_rsv_window_lock);
1910 sbi->s_rsv_window_root = RB_ROOT; 1900 sbi->s_rsv_window_root = RB_ROOT;
@@ -1945,15 +1935,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1945 if (!test_opt(sb, NOLOAD) && 1935 if (!test_opt(sb, NOLOAD) &&
1946 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { 1936 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
1947 if (ext3_load_journal(sb, es, journal_devnum)) 1937 if (ext3_load_journal(sb, es, journal_devnum))
1948 goto failed_mount3; 1938 goto failed_mount2;
1949 } else if (journal_inum) { 1939 } else if (journal_inum) {
1950 if (ext3_create_journal(sb, es, journal_inum)) 1940 if (ext3_create_journal(sb, es, journal_inum))
1951 goto failed_mount3; 1941 goto failed_mount2;
1952 } else { 1942 } else {
1953 if (!silent) 1943 if (!silent)
1954 ext3_msg(sb, KERN_ERR, 1944 ext3_msg(sb, KERN_ERR,
1955 "error: no journal found. " 1945 "error: no journal found. "
1956 "mounting ext3 over ext2?"); 1946 "mounting ext3 over ext2?");
1947 goto failed_mount2;
1948 }
1949 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1950 ext3_count_free_blocks(sb));
1951 if (!err) {
1952 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1953 ext3_count_free_inodes(sb));
1954 }
1955 if (!err) {
1956 err = percpu_counter_init(&sbi->s_dirs_counter,
1957 ext3_count_dirs(sb));
1958 }
1959 if (err) {
1960 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1957 goto failed_mount3; 1961 goto failed_mount3;
1958 } 1962 }
1959 1963
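The reordering above exists so that each failed_mount label undoes exactly what has been set up: the journal is now loaded before the percpu counters are initialized, so a journal failure jumps to failed_mount2 with the counters untouched, while a counter failure jumps to failed_mount3, which also destroys the live journal. The unwind shape, as a schematic fragment (condensed from the hunks; error paths abbreviated):

        if (ext3_load_journal(sb, es, journal_devnum))
                goto failed_mount2;     /* no counters to tear down yet */

        err = percpu_counter_init(&sbi->s_freeblocks_counter,
                                  ext3_count_free_blocks(sb));
        if (err)
                goto failed_mount3;     /* journal is live: destroy it too */

failed_mount3:
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        journal_destroy(sbi->s_journal);
failed_mount2:
        /* release group descriptors, brelse() the superblock, ... */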
@@ -1978,20 +1982,12 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1978 ext3_msg(sb, KERN_ERR, 1982 ext3_msg(sb, KERN_ERR,
1979 "error: journal does not support " 1983 "error: journal does not support "
1980 "requested data journaling mode"); 1984 "requested data journaling mode");
1981 goto failed_mount4; 1985 goto failed_mount3;
1982 } 1986 }
1983 default: 1987 default:
1984 break; 1988 break;
1985 } 1989 }
1986 1990
1987 if (test_opt(sb, NOBH)) {
1988 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
1989 ext3_msg(sb, KERN_WARNING,
1990 "warning: ignoring nobh option - "
1991 "it is supported only with writeback mode");
1992 clear_opt(sbi->s_mount_opt, NOBH);
1993 }
1994 }
1995 /* 1991 /*
1996 * The journal_load will have done any necessary log recovery, 1992 * The journal_load will have done any necessary log recovery,
1997 * so we can safely mount the rest of the filesystem now. 1993 * so we can safely mount the rest of the filesystem now.
@@ -2001,19 +1997,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2001 if (IS_ERR(root)) { 1997 if (IS_ERR(root)) {
2002 ext3_msg(sb, KERN_ERR, "error: get root inode failed"); 1998 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
2003 ret = PTR_ERR(root); 1999 ret = PTR_ERR(root);
2004 goto failed_mount4; 2000 goto failed_mount3;
2005 } 2001 }
2006 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2002 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2007 iput(root); 2003 iput(root);
2008 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); 2004 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
2009 goto failed_mount4; 2005 goto failed_mount3;
2010 } 2006 }
2011 sb->s_root = d_alloc_root(root); 2007 sb->s_root = d_alloc_root(root);
2012 if (!sb->s_root) { 2008 if (!sb->s_root) {
2013 ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); 2009 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
2014 iput(root); 2010 iput(root);
2015 ret = -ENOMEM; 2011 ret = -ENOMEM;
2016 goto failed_mount4; 2012 goto failed_mount3;
2017 } 2013 }
2018 2014
2019 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); 2015 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -2039,12 +2035,11 @@ cantfind_ext3:
2039 sb->s_id); 2035 sb->s_id);
2040 goto failed_mount; 2036 goto failed_mount;
2041 2037
2042failed_mount4:
2043 journal_destroy(sbi->s_journal);
2044failed_mount3: 2038failed_mount3:
2045 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2039 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2046 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2040 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2047 percpu_counter_destroy(&sbi->s_dirs_counter); 2041 percpu_counter_destroy(&sbi->s_dirs_counter);
2042 journal_destroy(sbi->s_journal);
2048failed_mount2: 2043failed_mount2:
2049 for (i = 0; i < db_count; i++) 2044 for (i = 0; i < db_count; i++)
2050 brelse(sbi->s_group_desc[i]); 2045 brelse(sbi->s_group_desc[i]);
@@ -2278,6 +2273,9 @@ static int ext3_load_journal(struct super_block *sb,
2278 return -EINVAL; 2273 return -EINVAL;
2279 } 2274 }
2280 2275
2276 if (!(journal->j_flags & JFS_BARRIER))
2277 printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
2278
2281 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2279 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2282 err = journal_update_format(journal); 2280 err = journal_update_format(journal);
2283 if (err) { 2281 if (err) {
@@ -2534,6 +2532,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2534 ext3_fsblk_t n_blocks_count = 0; 2532 ext3_fsblk_t n_blocks_count = 0;
2535 unsigned long old_sb_flags; 2533 unsigned long old_sb_flags;
2536 struct ext3_mount_options old_opts; 2534 struct ext3_mount_options old_opts;
2535 int enable_quota = 0;
2537 int err; 2536 int err;
2538#ifdef CONFIG_QUOTA 2537#ifdef CONFIG_QUOTA
2539 int i; 2538 int i;
@@ -2580,6 +2579,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2580 } 2579 }
2581 2580
2582 if (*flags & MS_RDONLY) { 2581 if (*flags & MS_RDONLY) {
2582 err = dquot_suspend(sb, -1);
2583 if (err < 0)
2584 goto restore_opts;
2585
2583 /* 2586 /*
2584 * First of all, the unconditional stuff we have to do 2587 * First of all, the unconditional stuff we have to do
2585 * to disable replay of the journal when we next remount 2588 * to disable replay of the journal when we next remount
@@ -2634,6 +2637,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2634 goto restore_opts; 2637 goto restore_opts;
2635 if (!ext3_setup_super (sb, es, 0)) 2638 if (!ext3_setup_super (sb, es, 0))
2636 sb->s_flags &= ~MS_RDONLY; 2639 sb->s_flags &= ~MS_RDONLY;
2640 enable_quota = 1;
2637 } 2641 }
2638 } 2642 }
2639#ifdef CONFIG_QUOTA 2643#ifdef CONFIG_QUOTA
@@ -2645,6 +2649,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2645#endif 2649#endif
2646 unlock_super(sb); 2650 unlock_super(sb);
2647 unlock_kernel(); 2651 unlock_kernel();
2652
2653 if (enable_quota)
2654 dquot_resume(sb, -1);
2648 return 0; 2655 return 0;
2649restore_opts: 2656restore_opts:
2650 sb->s_flags = old_sb_flags; 2657 sb->s_flags = old_sb_flags;
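The remount path above gains quota suspend/resume: going read-only suspends every quota type (the -1 argument selects all types) before the journal is quiesced, and going read-write resumes them, but only after unlock_super()/unlock_kernel() are dropped. The shape of the pattern (schematic; was_readonly stands in for the full remount condition):

        if (*flags & MS_RDONLY) {
                err = dquot_suspend(sb, -1);    /* -1: all quota types */
                if (err < 0)
                        goto restore_opts;
        } else if (was_readonly) {
                enable_quota = 1;               /* resume after unlock */
        }
        /* ... perform the remount under lock_super()/lock_kernel() ... */
        if (enable_quota)
                dquot_resume(sb, -1);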
@@ -2834,24 +2841,21 @@ static int ext3_write_info(struct super_block *sb, int type)
2834 */ 2841 */
2835static int ext3_quota_on_mount(struct super_block *sb, int type) 2842static int ext3_quota_on_mount(struct super_block *sb, int type)
2836{ 2843{
2837 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], 2844 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2838 EXT3_SB(sb)->s_jquota_fmt, type); 2845 EXT3_SB(sb)->s_jquota_fmt, type);
2839} 2846}
2840 2847
2841/* 2848/*
2842 * Standard function to be called on quota_on 2849 * Standard function to be called on quota_on
2843 */ 2850 */
2844static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2851static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2845 char *name, int remount) 2852 char *name)
2846{ 2853{
2847 int err; 2854 int err;
2848 struct path path; 2855 struct path path;
2849 2856
2850 if (!test_opt(sb, QUOTA)) 2857 if (!test_opt(sb, QUOTA))
2851 return -EINVAL; 2858 return -EINVAL;
2852 /* When remounting, no checks are needed and in fact, name is NULL */
2853 if (remount)
2854 return vfs_quota_on(sb, type, format_id, name, remount);
2855 2859
2856 err = kern_path(name, LOOKUP_FOLLOW, &path); 2860 err = kern_path(name, LOOKUP_FOLLOW, &path);
2857 if (err) 2861 if (err)
@@ -2889,7 +2893,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2889 } 2893 }
2890 } 2894 }
2891 2895
2892 err = vfs_quota_on_path(sb, type, format_id, &path); 2896 err = dquot_quota_on_path(sb, type, format_id, &path);
2893 path_put(&path); 2897 path_put(&path);
2894 return err; 2898 return err;
2895} 2899}
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a933..e69dc6dfaa89 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
104 104
105static struct mb_cache *ext3_xattr_cache; 105static struct mb_cache *ext3_xattr_cache;
106 106
107static struct xattr_handler *ext3_xattr_handler_map[] = { 107static const struct xattr_handler *ext3_xattr_handler_map[] = {
108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, 108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
109#ifdef CONFIG_EXT3_FS_POSIX_ACL 109#ifdef CONFIG_EXT3_FS_POSIX_ACL
110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, 110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
116#endif 116#endif
117}; 117};
118 118
119struct xattr_handler *ext3_xattr_handlers[] = { 119const struct xattr_handler *ext3_xattr_handlers[] = {
120 &ext3_xattr_user_handler, 120 &ext3_xattr_user_handler,
121 &ext3_xattr_trusted_handler, 121 &ext3_xattr_trusted_handler,
122#ifdef CONFIG_EXT3_FS_POSIX_ACL 122#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
129 NULL 129 NULL
130}; 130};
131 131
132static inline struct xattr_handler * 132static inline const struct xattr_handler *
133ext3_xattr_handler(int name_index) 133ext3_xattr_handler(int name_index)
134{ 134{
135 struct xattr_handler *handler = NULL; 135 const struct xattr_handler *handler = NULL;
136 136
137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) 137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
138 handler = ext3_xattr_handler_map[name_index]; 138 handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
338 size_t rest = buffer_size; 338 size_t rest = buffer_size;
339 339
340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { 340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
341 struct xattr_handler *handler = 341 const struct xattr_handler *handler =
342 ext3_xattr_handler(entry->e_name_index); 342 ext3_xattr_handler(entry->e_name_index);
343 343
344 if (handler) { 344 if (handler) {
@@ -1139,7 +1139,7 @@ ext3_xattr_cache_insert(struct buffer_head *bh)
1139 ea_bdebug(bh, "out of memory"); 1139 ea_bdebug(bh, "out of memory");
1140 return; 1140 return;
1141 } 1141 }
1142 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); 1142 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1143 if (error) { 1143 if (error) {
1144 mb_cache_entry_free(ce); 1144 mb_cache_entry_free(ce);
1145 if (error == -EBUSY) { 1145 if (error == -EBUSY) {
@@ -1211,8 +1211,8 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
1211 return NULL; /* never share */ 1211 return NULL; /* never share */
1212 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1212 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1213again: 1213again:
1214 ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, 1214 ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
1215 inode->i_sb->s_bdev, hash); 1215 hash);
1216 while (ce) { 1216 while (ce) {
1217 struct buffer_head *bh; 1217 struct buffer_head *bh;
1218 1218
@@ -1237,7 +1237,7 @@ again:
1237 return bh; 1237 return bh;
1238 } 1238 }
1239 brelse(bh); 1239 brelse(bh);
1240 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); 1240 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
1241 } 1241 }
1242 return NULL; 1242 return NULL;
1243} 1243}
@@ -1313,9 +1313,7 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header,
1313int __init 1313int __init
1314init_ext3_xattr(void) 1314init_ext3_xattr(void)
1315{ 1315{
1316 ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, 1316 ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
1317 sizeof(struct mb_cache_entry) +
1318 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1319 if (!ext3_xattr_cache) 1317 if (!ext3_xattr_cache)
1320 return -ENOMEM; 1318 return -ENOMEM;
1321 return 0; 1319 return 0;
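The xattr.c hunks above track the 2.6.36 mb_cache simplification: a cache now has exactly one index, keyed by (block device, hash), so the index-number and entry-size arguments are gone and mb_cache_create() takes just a name and a hash-bucket order. Usage matching the calls in this diff (bdev and hash assumed from the caller):

        struct mb_cache *cache;
        struct mb_cache_entry *ce;

        cache = mb_cache_create("ext3_xattr", 6);       /* 2^6 hash buckets */
        if (!cache)
                return -ENOMEM;

        /* lookups now pass the bdev and hash directly, no index number */
        ce = mb_cache_entry_find_first(cache, bdev, hash);
        while (ce) {
                /* ... inspect the candidate block, release or reuse ... */
                ce = mb_cache_entry_find_next(ce, bdev, hash);
        }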
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
58 58
59# ifdef CONFIG_EXT3_FS_XATTR 59# ifdef CONFIG_EXT3_FS_XATTR
60 60
61extern struct xattr_handler ext3_xattr_user_handler; 61extern const struct xattr_handler ext3_xattr_user_handler;
62extern struct xattr_handler ext3_xattr_trusted_handler; 62extern const struct xattr_handler ext3_xattr_trusted_handler;
63extern struct xattr_handler ext3_xattr_acl_access_handler; 63extern const struct xattr_handler ext3_xattr_acl_access_handler;
64extern struct xattr_handler ext3_xattr_acl_default_handler; 64extern const struct xattr_handler ext3_xattr_acl_default_handler;
65extern struct xattr_handler ext3_xattr_security_handler; 65extern const struct xattr_handler ext3_xattr_security_handler;
66 66
67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); 67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
68 68
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
76extern int init_ext3_xattr(void); 76extern int init_ext3_xattr(void);
77extern void exit_ext3_xattr(void); 77extern void exit_ext3_xattr(void);
78 78
79extern struct xattr_handler *ext3_xattr_handlers[]; 79extern const struct xattr_handler *ext3_xattr_handlers[];
80 80
81# else /* CONFIG_EXT3_FS_XATTR */ 81# else /* CONFIG_EXT3_FS_XATTR */
82 82
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3af91f476dff..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
69 return err; 69 return err;
70} 70}
71 71
72struct xattr_handler ext3_xattr_security_handler = { 72const struct xattr_handler ext3_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext3_xattr_security_list, 74 .list = ext3_xattr_security_list,
75 .get = ext3_xattr_security_get, 75 .get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
51 value, size, flags); 51 value, size, flags);
52} 52}
53 53
54struct xattr_handler ext3_xattr_trusted_handler = { 54const struct xattr_handler ext3_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext3_xattr_trusted_list, 56 .list = ext3_xattr_trusted_list,
57 .get = ext3_xattr_trusted_get, 57 .get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext3_xattr_user_handler = { 57const struct xattr_handler ext3_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext3_xattr_user_list, 59 .list = ext3_xattr_user_list,
60 .get = ext3_xattr_user_get, 60 .get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..5e2ed4504ead 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
204 return error; 204 return error;
205 else { 205 else {
206 inode->i_mode = mode; 206 inode->i_mode = mode;
207 inode->i_ctime = ext4_current_time(inode);
207 ext4_mark_inode_dirty(handle, inode); 208 ext4_mark_inode_dirty(handle, inode);
208 if (error == 0) 209 if (error == 0)
209 acl = NULL; 210 acl = NULL;
@@ -454,7 +455,7 @@ release_and_out:
454 return error; 455 return error;
455} 456}
456 457
457struct xattr_handler ext4_xattr_acl_access_handler = { 458const struct xattr_handler ext4_xattr_acl_access_handler = {
458 .prefix = POSIX_ACL_XATTR_ACCESS, 459 .prefix = POSIX_ACL_XATTR_ACCESS,
459 .flags = ACL_TYPE_ACCESS, 460 .flags = ACL_TYPE_ACCESS,
460 .list = ext4_xattr_list_acl_access, 461 .list = ext4_xattr_list_acl_access,
@@ -462,7 +463,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
462 .set = ext4_xattr_set_acl, 463 .set = ext4_xattr_set_acl,
463}; 464};
464 465
465struct xattr_handler ext4_xattr_acl_default_handler = { 466const struct xattr_handler ext4_xattr_acl_default_handler = {
466 .prefix = POSIX_ACL_XATTR_DEFAULT, 467 .prefix = POSIX_ACL_XATTR_DEFAULT,
467 .flags = ACL_TYPE_DEFAULT, 468 .flags = ACL_TYPE_DEFAULT,
468 .list = ext4_xattr_list_acl_default, 469 .list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..bd30799a43ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -377,14 +377,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
377 ext4_grpblk_t bit; 377 ext4_grpblk_t bit;
378 unsigned int i; 378 unsigned int i;
379 struct ext4_group_desc *desc; 379 struct ext4_group_desc *desc;
380 struct ext4_super_block *es; 380 struct ext4_sb_info *sbi = EXT4_SB(sb);
381 struct ext4_sb_info *sbi;
382 int err = 0, ret, blk_free_count; 381 int err = 0, ret, blk_free_count;
383 ext4_grpblk_t blocks_freed; 382 ext4_grpblk_t blocks_freed;
384 struct ext4_group_info *grp; 383 struct ext4_group_info *grp;
385 384
386 sbi = EXT4_SB(sb);
387 es = sbi->s_es;
388 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 385 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
389 386
390 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 387 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -477,7 +474,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
477 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 474 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
478 if (!err) 475 if (!err)
479 err = ret; 476 err = ret;
480 sb->s_dirt = 1;
481 477
482error_return: 478error_return:
483 brelse(bitmap_bh); 479 brelse(bitmap_bh);
@@ -591,14 +587,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 587 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 588 if (count)
593 *count = ar.len; 589 *count = ar.len;
594
595 /* 590 /*
596 * Account for the allocated meta blocks 591 * Account for the allocated meta blocks. We will never
 592 * fail EDQUOT for metadata, but we do account for it.
597 */ 593 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 594 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 596 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
598 dquot_alloc_block_nofail(inode, ar.len);
602 } 599 }
603 return ret; 600 return ret;
604} 601}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..3db5084db9bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
72 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
74 else { 74 else {
75 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
76 entry->count)) 76 entry->count))
77 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
78 entry->start_blk); 78 entry->start_blk);
79 new_node = *n; 79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
@@ -229,16 +229,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
229 229
230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
231 (start_blk + count < start_blk) || 231 (start_blk + count < start_blk) ||
232 (start_blk + count > ext4_blocks_count(sbi->s_es))) 232 (start_blk + count > ext4_blocks_count(sbi->s_es))) {
233 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
233 return 0; 234 return 0;
235 }
234 while (n) { 236 while (n) {
235 entry = rb_entry(n, struct ext4_system_zone, node); 237 entry = rb_entry(n, struct ext4_system_zone, node);
236 if (start_blk + count - 1 < entry->start_blk) 238 if (start_blk + count - 1 < entry->start_blk)
237 n = n->rb_left; 239 n = n->rb_left;
238 else if (start_blk >= (entry->start_blk + entry->count)) 240 else if (start_blk >= (entry->start_blk + entry->count))
239 n = n->rb_right; 241 n = n->rb_right;
240 else 242 else {
243 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
241 return 0; 244 return 0;
245 }
242 } 246 }
243 return 1; 247 return 1;
244} 248}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..374510f72baa 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct ext4_dir_entry_2 *de, 65 struct inode *dir,
66 struct buffer_head *bh, 66 struct ext4_dir_entry_2 *de,
67 unsigned int offset) 67 struct buffer_head *bh,
68 unsigned int offset)
68{ 69{
69 const char *error_msg = NULL; 70 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len, 71 const int rlen = ext4_rec_len_from_disk(de->rec_len,
@@ -83,12 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 84 error_msg = "inode out of bounds";
84 85
85 if (error_msg != NULL) 86 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 87 ext4_error_inode(dir, function, line, bh->b_blocknr,
87 "bad entry in directory #%lu: %s - block=%llu" 88 "bad entry in directory: %s - "
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 90 error_msg, (unsigned) (offset%bh->b_size), offset,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
94 return error_msg == NULL ? 1 : 0; 93 return error_msg == NULL ? 1 : 0;
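The rename to __ext4_check_dir_entry(), with the function name and line number up front, pairs with a wrapper macro so callers keep a four-argument form (as the ext4_readdir() hunk below shows) while the error report pinpoints the call site. The wrapper in ext4.h presumably looks like this (a sketch; the macro itself is outside this diff):

        #define ext4_check_dir_entry(dir, de, bh, offset) \
                __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), \
                                       (bh), (offset))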
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,21 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
125 EXT4_INODE_INDEX);
126 } 126 }
127 stored = 0; 127 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 128 offset = filp->f_pos & (sb->s_blocksize - 1);
129 129
130 while (!error && !stored && filp->f_pos < inode->i_size) { 130 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 131 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 132 struct buffer_head *bh = NULL;
134 133
135 map_bh.b_state = 0; 134 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 135 map.m_len = 1;
136 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 137 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 138 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 139 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 140 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 141 page_cache_sync_readahead(
@@ -143,7 +143,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 143 &filp->f_ra, filp,
144 index, 1); 144 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 146 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 147 }
148 148
149 /* 149 /*
@@ -152,9 +152,8 @@ static int ext4_readdir(struct file *filp,
152 */ 152 */
153 if (!bh) { 153 if (!bh) {
154 if (!dir_has_error) { 154 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 155 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 156 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 157 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 158 dir_has_error = 1;
160 } 159 }
@@ -195,7 +194,7 @@ revalidate:
195 while (!error && filp->f_pos < inode->i_size 194 while (!error && filp->f_pos < inode->i_size
196 && offset < sb->s_blocksize) { 195 && offset < sb->s_blocksize) {
197 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
198 if (!ext4_check_dir_entry("ext4_readdir", inode, de, 197 if (!ext4_check_dir_entry(inode, de,
199 bh, offset)) { 198 bh, offset)) {
200 /* 199 /*
201 * On error, skip the f_pos to the next block 200 * On error, skip the f_pos to the next block
@@ -345,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
345 struct dir_private_info *info; 344 struct dir_private_info *info;
346 int len; 345 int len;
347 346
348 info = (struct dir_private_info *) dir_file->private_data; 347 info = dir_file->private_data;
349 p = &info->root.rb_node; 348 p = &info->root.rb_node;
350 349
351 /* Create and allocate the fname structure */ 350 /* Create and allocate the fname structure */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..889ec9d5e6ad 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,13 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
61
62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
58 64
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 65#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 66 ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
61 67
62/* data type for block offset of block group */ 68/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 69typedef int ext4_grpblk_t;
@@ -72,7 +78,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 78typedef unsigned int ext4_group_t;
73 79
74/* 80/*
75 * Flags used in mballoc's allocation_context flags field. 81 * Flags used in mballoc's allocation_context flags field.
76 * 82 *
77 * Also used to show what's going on for debugging purposes when the 83 * Also used to show what's going on for debugging purposes when the
78 * flag field is exported via the traceport interface 84 * flag field is exported via the traceport interface
@@ -126,6 +132,29 @@ struct ext4_allocation_request {
126}; 132};
127 133
128/* 134/*
135 * Logical to physical block mapping, used by ext4_map_blocks()
136 *
137 * This structure is used to pass requests into ext4_map_blocks() as
138 * well as to store the information returned by ext4_map_blocks(). It
139 * takes less room on the stack than a struct buffer_head.
140 */
141#define EXT4_MAP_NEW (1 << BH_New)
142#define EXT4_MAP_MAPPED (1 << BH_Mapped)
143#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
144#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
145#define EXT4_MAP_UNINIT (1 << BH_Uninit)
146#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
147 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
148 EXT4_MAP_UNINIT)
149
150struct ext4_map_blocks {
151 ext4_fsblk_t m_pblk;
152 ext4_lblk_t m_lblk;
153 unsigned int m_len;
154 unsigned int m_flags;
155};
156
157/*
129 * For delayed allocation tracking 158 * For delayed allocation tracking
130 */ 159 */
131struct mpage_da_data { 160struct mpage_da_data {
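struct ext4_map_blocks, introduced above, replaces the dummy buffer_head that callers used to pass to ext4_get_blocks(); the ext4_readdir() hunk earlier in this diff shows the new convention. In isolation, a read-only lookup with no transaction handle looks like this (lblk assumed from the caller):

        struct ext4_map_blocks map;
        int err;

        map.m_lblk = lblk;      /* logical block to resolve */
        map.m_len = 1;
        err = ext4_map_blocks(NULL, inode, &map, 0);
        if (err > 0) {
                /* err is the number of blocks mapped; map.m_pblk holds
                 * the physical block, map.m_flags the EXT4_MAP_* bits */
        }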
@@ -141,13 +170,15 @@ struct mpage_da_data {
141}; 170};
142#define EXT4_IO_UNWRITTEN 0x1 171#define EXT4_IO_UNWRITTEN 0x1
143typedef struct ext4_io_end { 172typedef struct ext4_io_end {
144 struct list_head list; /* per-file finished AIO list */ 173 struct list_head list; /* per-file finished IO list */
145 struct inode *inode; /* file being written to */ 174 struct inode *inode; /* file being written to */
146 unsigned int flag; /* unwritten or not */ 175 unsigned int flag; /* unwritten or not */
147 struct page *page; /* page struct for buffer write */ 176 struct page *page; /* page struct for buffer write */
148 loff_t offset; /* offset in the file */ 177 loff_t offset; /* offset in the file */
149 ssize_t size; /* size of the extent */ 178 ssize_t size; /* size of the extent */
150 struct work_struct work; /* data work queue */ 179 struct work_struct work; /* data work queue */
180 struct kiocb *iocb; /* iocb struct for AIO */
181 int result; /* error value for AIO */
151} ext4_io_end_t; 182} ext4_io_end_t;
152 183
153/* 184/*
@@ -321,6 +352,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 352 return flags & EXT4_OTHER_FLMASK;
322} 353}
323 354
355/*
356 * Inode flags used for atomic set/get
357 */
358enum {
359 EXT4_INODE_SECRM = 0, /* Secure deletion */
360 EXT4_INODE_UNRM = 1, /* Undelete */
361 EXT4_INODE_COMPR = 2, /* Compress file */
362 EXT4_INODE_SYNC = 3, /* Synchronous updates */
363 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
364 EXT4_INODE_APPEND = 5, /* writes to file may only append */
365 EXT4_INODE_NODUMP = 6, /* do not dump file */
366 EXT4_INODE_NOATIME = 7, /* do not update atime */
367/* Reserved for compression usage... */
368 EXT4_INODE_DIRTY = 8,
369 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
370 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
371 EXT4_INODE_ECOMPR = 11, /* Compression error */
372/* End compression flags --- maybe not all used */
373 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
374 EXT4_INODE_IMAGIC = 13, /* AFS directory */
375 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
376 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
377 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
378 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies */
379 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
380 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
381 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
382 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
383 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
384};
385
386#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
387#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
388 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
389 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
390
391/*
392 * Since it's pretty easy to mix up bit numbers and hex values, and we
393 * can't do a compile-time test for ENUM values, we use a run-time
394 * test to make sure that EXT4_XXX_FL is consistent with respect to
395 * EXT4_INODE_XXX. If all is well, the printk and BUG_ON will both drop
396 * out so it won't cost any extra space in the compiled kernel image.
397 * But it's important that these values are the same, since we are
398 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XXX_FL
399 * must be consistent with the values of FS_XXX_FL defined in
400 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
401 * ext4 filesystems, and of course the values defined in e2fsprogs.
402 *
403 * It's not paranoia if Murphy's Law really *is* out to get you. :-)
404 */
405static inline void ext4_check_flag_values(void)
406{
407 CHECK_FLAG_VALUE(SECRM);
408 CHECK_FLAG_VALUE(UNRM);
409 CHECK_FLAG_VALUE(COMPR);
410 CHECK_FLAG_VALUE(SYNC);
411 CHECK_FLAG_VALUE(IMMUTABLE);
412 CHECK_FLAG_VALUE(APPEND);
413 CHECK_FLAG_VALUE(NODUMP);
414 CHECK_FLAG_VALUE(NOATIME);
415 CHECK_FLAG_VALUE(DIRTY);
416 CHECK_FLAG_VALUE(COMPRBLK);
417 CHECK_FLAG_VALUE(NOCOMPR);
418 CHECK_FLAG_VALUE(ECOMPR);
419 CHECK_FLAG_VALUE(INDEX);
420 CHECK_FLAG_VALUE(IMAGIC);
421 CHECK_FLAG_VALUE(JOURNAL_DATA);
422 CHECK_FLAG_VALUE(NOTAIL);
423 CHECK_FLAG_VALUE(DIRSYNC);
424 CHECK_FLAG_VALUE(TOPDIR);
425 CHECK_FLAG_VALUE(HUGE_FILE);
426 CHECK_FLAG_VALUE(EXTENTS);
427 CHECK_FLAG_VALUE(EA_INODE);
428 CHECK_FLAG_VALUE(EOFBLOCKS);
429 CHECK_FLAG_VALUE(RESERVED);
430}
431
324/* Used to pass group descriptor data when online resize is done */ 432/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 433struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 434 __u32 group; /* Group number for this data */
@@ -332,6 +440,18 @@ struct ext4_new_group_input {
332 __u16 unused; 440 __u16 unused;
333}; 441};
334 442
443#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
444struct compat_ext4_new_group_input {
445 u32 group;
446 compat_u64 block_bitmap;
447 compat_u64 inode_bitmap;
448 compat_u64 inode_table;
449 u32 blocks_count;
450 u16 reserved_blocks;
451 u16 unused;
452};
453#endif
454
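The compat variant mirrors the native struct with explicitly 32-bit-safe field types so a 64-bit kernel can accept this ioctl from 32-bit userspace. A hedged sketch of the field-by-field widening the compat ioctl path would have to perform (copy_from_user() is the standard helper; the actual call site lives in ioctl.c, outside this hunk, and uinput is an illustrative user-pointer name):

	struct compat_ext4_new_group_input cinput;
	struct ext4_new_group_input input;

	if (copy_from_user(&cinput, uinput, sizeof(cinput)))
		return -EFAULT;

	input.group           = cinput.group;
	input.block_bitmap    = cinput.block_bitmap;	/* compat_u64 -> __u64 */
	input.inode_bitmap    = cinput.inode_bitmap;
	input.inode_table     = cinput.inode_table;
	input.blocks_count    = cinput.blocks_count;
	input.reserved_blocks = cinput.reserved_blocks;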
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 455/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 456struct ext4_new_group_data {
337 __u32 group; 457 __u32 group;
@@ -345,7 +465,7 @@ struct ext4_new_group_data {
345}; 465};
346 466
347/* 467/*
348 * Flags used by ext4_get_blocks() 468 * Flags used by ext4_map_blocks()
349 */ 469 */
350 /* Allocate any needed blocks and/or convert an uninitialized 470 /* Allocate any needed blocks and/or convert an uninitialized
351 extent to an initialized extent */ 471 extent to an initialized extent */
@@ -355,7 +475,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 475#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 476 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 477 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 478 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore */ 479 inode allocation semaphore */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 480#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, requesting creation of an 481 /* caller is from the direct IO path, requesting creation of an
@@ -398,6 +518,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 518#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 519#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 520
521#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 522/*
402 * ioctl commands in 32 bit emulation 523 * ioctl commands in 32 bit emulation
403 */ 524 */
@@ -408,11 +529,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 529#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 530#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 531#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
532#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 533#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 534#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 535#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 536#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 537#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
538#endif
416 539
417 540
418/* 541/*
@@ -616,9 +739,8 @@ struct ext4_ext_cache {
616 */ 739 */
617struct ext4_inode_info { 740struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 741 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 742 __u32 i_dtime;
743 ext4_fsblk_t i_file_acl;
622 744
623 /* 745 /*
624 * i_block_group is the number of the block group which contains 746 * i_block_group is the number of the block group which contains
@@ -629,6 +751,7 @@ struct ext4_inode_info {
629 */ 751 */
630 ext4_group_t i_block_group; 752 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 753 unsigned long i_state_flags; /* Dynamic state flags */
754 unsigned long i_flags;
632 755
633 ext4_lblk_t i_dir_start_lookup; 756 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 757#ifdef CONFIG_EXT4_FS_XATTR
@@ -755,7 +878,6 @@ struct ext4_inode_info {
755#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ 878#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
756#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ 879#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
757#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ 880#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
758#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
759#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 881#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
760#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 882#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
761#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 883#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
@@ -864,7 +986,7 @@ struct ext4_super_block {
864 __le32 s_last_orphan; /* start of list of inodes to delete */ 986 __le32 s_last_orphan; /* start of list of inodes to delete */
865 __le32 s_hash_seed[4]; /* HTREE hash seed */ 987 __le32 s_hash_seed[4]; /* HTREE hash seed */
866 __u8 s_def_hash_version; /* Default hash version to use */ 988 __u8 s_def_hash_version; /* Default hash version to use */
867 __u8 s_reserved_char_pad; 989 __u8 s_jnl_backup_type;
868 __le16 s_desc_size; /* size of group descriptor */ 990 __le16 s_desc_size; /* size of group descriptor */
869/*100*/ __le32 s_default_mount_opts; 991/*100*/ __le32 s_default_mount_opts;
870 __le32 s_first_meta_bg; /* First metablock block group */ 992 __le32 s_first_meta_bg; /* First metablock block group */
@@ -882,12 +1004,34 @@ struct ext4_super_block {
882 __le64 s_mmp_block; /* Block for multi-mount protection */ 1004 __le64 s_mmp_block; /* Block for multi-mount protection */
883 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1005 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
884 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 1006 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
885 __u8 s_reserved_char_pad2; 1007 __u8 s_reserved_char_pad;
886 __le16 s_reserved_pad; 1008 __le16 s_reserved_pad;
887 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ 1009 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
888 __u32 s_reserved[160]; /* Padding to the end of the block */ 1010 __le32 s_snapshot_inum; /* Inode number of active snapshot */
1011 __le32 s_snapshot_id; /* sequential ID of active snapshot */
1012 __le64 s_snapshot_r_blocks_count; /* reserved blocks for active
1013 snapshot's future use */
1014 __le32 s_snapshot_list; /* inode number of the head of the
1015 on-disk snapshot list */
1016#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
1017 __le32 s_error_count; /* number of fs errors */
1018 __le32 s_first_error_time; /* first time an error happened */
1019 __le32 s_first_error_ino; /* inode involved in first error */
1020 __le64 s_first_error_block; /* block involved of first error */
1021 __u8 s_first_error_func[32]; /* function where the error happened */
1022 __le32 s_first_error_line; /* line number where error happened */
1023 __le32 s_last_error_time; /* most recent time of an error */
1024 __le32 s_last_error_ino; /* inode involved in last error */
1025 __le32 s_last_error_line; /* line number where error happened */
1026 __le64 s_last_error_block; /* block involved of last error */
1027 __u8 s_last_error_func[32]; /* function where the error happened */
1028#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
1029 __u8 s_mount_opts[64];
1030 __le32 s_reserved[112]; /* Padding to the end of the block */
889}; 1031};
890 1032
1033#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
1034
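EXT4_S_ERR_START and EXT4_S_ERR_END bracket the new error-tracking fields as one contiguous byte range, so callers can handle the whole error record as a unit. An illustrative use, assuming two mapped superblock images old_es and new_es; how super.c actually consumes the range is outside this hunk:

	/* Sketch: preserve the error history across a superblock rewrite. */
	memcpy((char *)new_es + EXT4_S_ERR_START,
	       (char *)old_es + EXT4_S_ERR_START,
	       EXT4_S_ERR_LEN);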
891#ifdef __KERNEL__ 1035#ifdef __KERNEL__
892 1036
893/* 1037/*
@@ -1025,6 +1169,9 @@ struct ext4_sb_info {
1025 1169
1026 /* workqueue for dio unwritten */ 1170 /* workqueue for dio unwritten */
1027 struct workqueue_struct *dio_unwritten_wq; 1171 struct workqueue_struct *dio_unwritten_wq;
1172
1173 /* timer for periodic error stats printing */
1174 struct timer_list s_err_report;
1028}; 1175};
1029 1176
1030static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1177static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1062,22 +1209,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1209 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1210 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1211 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1212 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1213};
1066 1214
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1215#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1216static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1217{ \
1218 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1219} \
1220static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1221{ \
1222 set_bit(bit, &EXT4_I(inode)->i_##field); \
1223} \
1224static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1225{ \
1226 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1070} 1227}
1071 1228
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1229EXT4_INODE_BIT_FNS(flag, flags)
1073{ 1230EXT4_INODE_BIT_FNS(state, state_flags)
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags);
1075}
1076
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit)
1078{
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
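For reference, the EXT4_INODE_BIT_FNS(flag, flags) invocation above generates exactly the trio that the deleted i_state_flags helpers used to spell out by hand, only over i_flags:

	/* Expansion sketch of EXT4_INODE_BIT_FNS(flag, flags): */
	static inline int ext4_test_inode_flag(struct inode *inode, int bit)
	{
		return test_bit(bit, &EXT4_I(inode)->i_flags);
	}
	/* ...plus ext4_set_inode_flag() and ext4_clear_inode_flag(); the
	 * second invocation emits the same trio over i_state_flags.
	 * Typical call: ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS). */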
1081#else 1231#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1232/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1233 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1192,6 +1342,10 @@ static inline void ext4_clear_inode_state(struct inode *inode, int bit)
1192#define EXT4_DEFM_JMODE_DATA 0x0020 1342#define EXT4_DEFM_JMODE_DATA 0x0020
1193#define EXT4_DEFM_JMODE_ORDERED 0x0040 1343#define EXT4_DEFM_JMODE_ORDERED 0x0040
1194#define EXT4_DEFM_JMODE_WBACK 0x0060 1344#define EXT4_DEFM_JMODE_WBACK 0x0060
1345#define EXT4_DEFM_NOBARRIER 0x0100
1346#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
1347#define EXT4_DEFM_DISCARD 0x0400
1348#define EXT4_DEFM_NODELALLOC 0x0800
1195 1349
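The new EXT4_DEFM_* bits extend the s_default_mount_opts word in the superblock; the mount path is expected to fold them into the in-core options when the administrator supplies no overriding mount option. A hedged sketch of such a test, assuming the set_opt()/clear_opt() helpers used by super.c:

	__u32 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);

	if (def_mount_opts & EXT4_DEFM_DISCARD)
		set_opt(sbi->s_mount_opt, DISCARD);	/* discard by default */
	if (def_mount_opts & EXT4_DEFM_NODELALLOC)
		clear_opt(sbi->s_mount_opt, DELALLOC);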
1196/* 1350/*
1197 * Default journal batch times 1351 * Default journal batch times
@@ -1258,13 +1412,50 @@ struct ext4_dir_entry_2 {
1258#define EXT4_MAX_REC_LEN ((1<<16)-1) 1412#define EXT4_MAX_REC_LEN ((1<<16)-1)
1259 1413
1260/* 1414/*
1415 * If we ever get support for fs block sizes > page_size, we'll need
1416 * to remove the #if statements in the next two functions...
1417 */
1418static inline unsigned int
1419ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
1420{
1421 unsigned len = le16_to_cpu(dlen);
1422
1423#if (PAGE_CACHE_SIZE >= 65536)
1424 if (len == EXT4_MAX_REC_LEN || len == 0)
1425 return blocksize;
1426 return (len & 65532) | ((len & 3) << 16);
1427#else
1428 return len;
1429#endif
1430}
1431
1432static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
1433{
1434 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
1435 BUG();
1436#if (PAGE_CACHE_SIZE >= 65536)
1437 if (len < 65536)
1438 return cpu_to_le16(len);
1439 if (len == blocksize) {
1440 if (blocksize == 65536)
1441 return cpu_to_le16(EXT4_MAX_REC_LEN);
1442 else
1443 return cpu_to_le16(0);
1444 }
1445 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
1446#else
1447 return cpu_to_le16(len);
1448#endif
1449}
1450
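A worked example of the rec_len encoding under the PAGE_CACHE_SIZE >= 65536 branch: a 16-bit field cannot hold 65536, so a record spanning a full 64KB block is stored as EXT4_MAX_REC_LEN, while larger (hypothetical) block sizes, which the guard permits up to 1 << 18, borrow the low two bits — always zero, since lengths are 4-byte multiples:

	/* len 65536, blocksize 65536:
	 *   to_disk   -> EXT4_MAX_REC_LEN (65535)
	 *   from_disk -> 65535 reads back as blocksize = 65536
	 *
	 * len 70000 (multiple of 4), blocksize 131072:
	 *   to_disk   -> (70000 & 65532) | ((70000 >> 16) & 3) = 4464 | 1 = 4465
	 *   from_disk -> (4465 & 65532) | ((4465 & 3) << 16) = 4464 + 65536 = 70000
	 */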
1451/*
1261 * Hash Tree Directory indexing 1452 * Hash Tree Directory indexing
1262 * (c) Daniel Phillips, 2001 1453 * (c) Daniel Phillips, 2001
1263 */ 1454 */
1264 1455
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1456#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1457 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1458 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1459#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1460#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1461
@@ -1389,16 +1580,18 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1389 ext4_init_block_bitmap(sb, NULL, group, desc) 1580 ext4_init_block_bitmap(sb, NULL, group, desc)
1390 1581
1391/* dir.c */ 1582/* dir.c */
1392extern int ext4_check_dir_entry(const char *, struct inode *, 1583extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
1393 struct ext4_dir_entry_2 *, 1584 struct ext4_dir_entry_2 *,
1394 struct buffer_head *, unsigned int); 1585 struct buffer_head *, unsigned int);
1586#define ext4_check_dir_entry(dir, de, bh, offset) \
1587 __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
1395extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1588extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1396 __u32 minor_hash, 1589 __u32 minor_hash,
1397 struct ext4_dir_entry_2 *dirent); 1590 struct ext4_dir_entry_2 *dirent);
1398extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1591extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1399 1592
1400/* fsync.c */ 1593/* fsync.c */
1401extern int ext4_sync_file(struct file *, struct dentry *, int); 1594extern int ext4_sync_file(struct file *, int);
1402 1595
1403/* hash.c */ 1596/* hash.c */
1404extern int ext4fs_dirhash(const char *name, int len, struct 1597extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1450,7 +1643,8 @@ extern int ext4_write_inode(struct inode *, struct writeback_control *);
1450extern int ext4_setattr(struct dentry *, struct iattr *); 1643extern int ext4_setattr(struct dentry *, struct iattr *);
1451extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1644extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1452 struct kstat *stat); 1645 struct kstat *stat);
1453extern void ext4_delete_inode(struct inode *); 1646extern void ext4_evict_inode(struct inode *);
1647extern void ext4_clear_inode(struct inode *);
1454extern int ext4_sync_inode(handle_t *, struct inode *); 1648extern int ext4_sync_inode(handle_t *, struct inode *);
1455extern void ext4_dirty_inode(struct inode *); 1649extern void ext4_dirty_inode(struct inode *);
1456extern int ext4_change_inode_journal_flag(struct inode *, int); 1650extern int ext4_change_inode_journal_flag(struct inode *, int);
@@ -1480,8 +1674,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1480extern int ext4_ext_migrate(struct inode *); 1674extern int ext4_ext_migrate(struct inode *);
1481 1675
1482/* namei.c */ 1676/* namei.c */
1483extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
1484extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
1485extern int ext4_orphan_add(handle_t *, struct inode *); 1677extern int ext4_orphan_add(handle_t *, struct inode *);
1486extern int ext4_orphan_del(handle_t *, struct inode *); 1678extern int ext4_orphan_del(handle_t *, struct inode *);
1487extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 1679extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1495,25 +1687,38 @@ extern int ext4_group_extend(struct super_block *sb,
1495 ext4_fsblk_t n_blocks_count); 1687 ext4_fsblk_t n_blocks_count);
1496 1688
1497/* super.c */ 1689/* super.c */
1498extern void __ext4_error(struct super_block *, const char *, const char *, ...) 1690extern void __ext4_error(struct super_block *, const char *, unsigned int,
1499 __attribute__ ((format (printf, 3, 4))); 1691 const char *, ...)
1500#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) 1692 __attribute__ ((format (printf, 4, 5)));
1501extern void ext4_error_inode(const char *, struct inode *, const char *, ...) 1693#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
1502 __attribute__ ((format (printf, 3, 4))); 1694 __LINE__, ## message)
1503extern void ext4_error_file(const char *, struct file *, const char *, ...) 1695extern void ext4_error_inode(struct inode *, const char *, unsigned int,
1504 __attribute__ ((format (printf, 3, 4))); 1696 ext4_fsblk_t, const char *, ...)
1505extern void __ext4_std_error(struct super_block *, const char *, int); 1697 __attribute__ ((format (printf, 5, 6)));
1506extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1698extern void ext4_error_file(struct file *, const char *, unsigned int,
1507 __attribute__ ((format (printf, 3, 4))); 1699 const char *, ...)
1508extern void __ext4_warning(struct super_block *, const char *, 1700 __attribute__ ((format (printf, 4, 5)));
1701extern void __ext4_std_error(struct super_block *, const char *,
1702 unsigned int, int);
1703extern void __ext4_abort(struct super_block *, const char *, unsigned int,
1704 const char *, ...)
1705 __attribute__ ((format (printf, 4, 5)));
1706#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
1707 __LINE__, ## message)
1708extern void __ext4_warning(struct super_block *, const char *, unsigned int,
1509 const char *, ...) 1709 const char *, ...)
1510 __attribute__ ((format (printf, 3, 4))); 1710 __attribute__ ((format (printf, 4, 5)));
1511#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) 1711#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
1712 __LINE__, ## message)
1512extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1713extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1513 __attribute__ ((format (printf, 3, 4))); 1714 __attribute__ ((format (printf, 3, 4)));
1514extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1715extern void __ext4_grp_locked_error(const char *, unsigned int, \
1515 const char *, const char *, ...) 1716 struct super_block *, ext4_group_t, \
1516 __attribute__ ((format (printf, 4, 5))); 1717 unsigned long, ext4_fsblk_t, \
1718 const char *, ...)
1719 __attribute__ ((format (printf, 7, 8)));
1720#define ext4_grp_locked_error(sb, grp, message...) \
1721 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
1517extern void ext4_update_dynamic_rev(struct super_block *sb); 1722extern void ext4_update_dynamic_rev(struct super_block *sb);
1518extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1723extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1519 __u32 compat); 1724 __u32 compat);
@@ -1647,7 +1852,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1647#define ext4_std_error(sb, errno) \ 1852#define ext4_std_error(sb, errno) \
1648do { \ 1853do { \
1649 if ((errno)) \ 1854 if ((errno)) \
1650 __ext4_std_error((sb), __func__, (errno)); \ 1855 __ext4_std_error((sb), __func__, __LINE__, (errno)); \
1651} while (0) 1856} while (0)
1652 1857
1653#ifdef CONFIG_SMP 1858#ifdef CONFIG_SMP
@@ -1678,6 +1883,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1883 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1884 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1885 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1886 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1887 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1888#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1889 void *bb_bitmap;
@@ -1738,6 +1944,12 @@ static inline void ext4_unlock_group(struct super_block *sb,
1738 spin_unlock(ext4_group_lock_ptr(sb, group)); 1944 spin_unlock(ext4_group_lock_ptr(sb, group));
1739} 1945}
1740 1946
1947static inline void ext4_mark_super_dirty(struct super_block *sb)
1948{
1949 if (EXT4_SB(sb)->s_journal == NULL)
1950 sb->s_dirt = 1;
1951}
1952
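ext4_mark_super_dirty() captures the no-journal fallback in one place: with a journal, superblock updates travel inside a transaction, and only without one does s_dirt ask periodic writeback to flush the block. The intended call pattern after touching an in-core superblock field, with an illustrative update:

	/* Sketch: after modifying the in-core superblock... */
	es->s_free_inodes_count = cpu_to_le32(freed);	/* illustrative field */
	ext4_mark_super_dirty(sb);	/* no-op when journaled; else sets s_dirt */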
1741/* 1953/*
1742 * Inodes and files operations 1954 * Inodes and files operations
1743 */ 1955 */
@@ -1772,9 +1984,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1984extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1985extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1986 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1987extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1988 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1989extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1990extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1991extern void ext4_ext_release(struct super_block *);
@@ -1782,9 +1993,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1993 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1994extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1995 ssize_t len);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1996extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1997 struct ext4_map_blocks *map, int flags);
1787 struct buffer_head *bh, int flags);
1788extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1998extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1789 __u64 start, __u64 len); 1999 __u64 start, __u64 len);
1790/* move_extent.c */ 2000/* move_extent.c */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 53d2764d71ca..6e272ef6ba96 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,29 +6,29 @@
6 6
7#include <trace/events/ext4.h> 7#include <trace/events/ext4.h>
8 8
9int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 9int __ext4_journal_get_undo_access(const char *where, unsigned int line,
10 struct buffer_head *bh) 10 handle_t *handle, struct buffer_head *bh)
11{ 11{
12 int err = 0; 12 int err = 0;
13 13
14 if (ext4_handle_valid(handle)) { 14 if (ext4_handle_valid(handle)) {
15 err = jbd2_journal_get_undo_access(handle, bh); 15 err = jbd2_journal_get_undo_access(handle, bh);
16 if (err) 16 if (err)
17 ext4_journal_abort_handle(where, __func__, bh, 17 ext4_journal_abort_handle(where, line, __func__, bh,
18 handle, err); 18 handle, err);
19 } 19 }
20 return err; 20 return err;
21} 21}
22 22
23int __ext4_journal_get_write_access(const char *where, handle_t *handle, 23int __ext4_journal_get_write_access(const char *where, unsigned int line,
24 struct buffer_head *bh) 24 handle_t *handle, struct buffer_head *bh)
25{ 25{
26 int err = 0; 26 int err = 0;
27 27
28 if (ext4_handle_valid(handle)) { 28 if (ext4_handle_valid(handle)) {
29 err = jbd2_journal_get_write_access(handle, bh); 29 err = jbd2_journal_get_write_access(handle, bh);
30 if (err) 30 if (err)
31 ext4_journal_abort_handle(where, __func__, bh, 31 ext4_journal_abort_handle(where, line, __func__, bh,
32 handle, err); 32 handle, err);
33 } 33 }
34 return err; 34 return err;
@@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
46 * If the handle isn't valid we're not journaling, but we still need to 46 * If the handle isn't valid we're not journaling, but we still need to
47 * call into ext4_journal_revoke() to put the buffer head. 47 * call into ext4_journal_revoke() to put the buffer head.
48 */ 48 */
49int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 49int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
50 struct inode *inode, struct buffer_head *bh, 50 int is_metadata, struct inode *inode,
51 ext4_fsblk_t blocknr) 51 struct buffer_head *bh, ext4_fsblk_t blocknr)
52{ 52{
53 int err; 53 int err;
54 54
@@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
79 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 79 BUFFER_TRACE(bh, "call jbd2_journal_forget");
80 err = jbd2_journal_forget(handle, bh); 80 err = jbd2_journal_forget(handle, bh);
81 if (err) 81 if (err)
82 ext4_journal_abort_handle(where, __func__, bh, 82 ext4_journal_abort_handle(where, line, __func__,
83 handle, err); 83 bh, handle, err);
84 return err; 84 return err;
85 } 85 }
86 return 0; 86 return 0;
@@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
92 BUFFER_TRACE(bh, "call jbd2_journal_revoke"); 92 BUFFER_TRACE(bh, "call jbd2_journal_revoke");
93 err = jbd2_journal_revoke(handle, blocknr, bh); 93 err = jbd2_journal_revoke(handle, blocknr, bh);
94 if (err) { 94 if (err) {
95 ext4_journal_abort_handle(where, __func__, bh, handle, err); 95 ext4_journal_abort_handle(where, line, __func__,
96 ext4_abort(inode->i_sb, __func__, 96 bh, handle, err);
97 __ext4_abort(inode->i_sb, where, line,
97 "error %d when attempting revoke", err); 98 "error %d when attempting revoke", err);
98 } 99 }
99 BUFFER_TRACE(bh, "exit"); 100 BUFFER_TRACE(bh, "exit");
100 return err; 101 return err;
101} 102}
102 103
103int __ext4_journal_get_create_access(const char *where, 104int __ext4_journal_get_create_access(const char *where, unsigned int line,
104 handle_t *handle, struct buffer_head *bh) 105 handle_t *handle, struct buffer_head *bh)
105{ 106{
106 int err = 0; 107 int err = 0;
@@ -108,22 +109,23 @@ int __ext4_journal_get_create_access(const char *where,
108 if (ext4_handle_valid(handle)) { 109 if (ext4_handle_valid(handle)) {
109 err = jbd2_journal_get_create_access(handle, bh); 110 err = jbd2_journal_get_create_access(handle, bh);
110 if (err) 111 if (err)
111 ext4_journal_abort_handle(where, __func__, bh, 112 ext4_journal_abort_handle(where, line, __func__,
112 handle, err); 113 bh, handle, err);
113 } 114 }
114 return err; 115 return err;
115} 116}
116 117
117int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, 118int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
118 struct inode *inode, struct buffer_head *bh) 119 handle_t *handle, struct inode *inode,
120 struct buffer_head *bh)
119{ 121{
120 int err = 0; 122 int err = 0;
121 123
122 if (ext4_handle_valid(handle)) { 124 if (ext4_handle_valid(handle)) {
123 err = jbd2_journal_dirty_metadata(handle, bh); 125 err = jbd2_journal_dirty_metadata(handle, bh);
124 if (err) 126 if (err)
125 ext4_journal_abort_handle(where, __func__, bh, 127 ext4_journal_abort_handle(where, line, __func__,
126 handle, err); 128 bh, handle, err);
127 } else { 129 } else {
128 if (inode) 130 if (inode)
129 mark_buffer_dirty_inode(bh, inode); 131 mark_buffer_dirty_inode(bh, inode);
@@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
132 if (inode && inode_needs_sync(inode)) { 134 if (inode && inode_needs_sync(inode)) {
133 sync_dirty_buffer(bh); 135 sync_dirty_buffer(bh);
134 if (buffer_req(bh) && !buffer_uptodate(bh)) { 136 if (buffer_req(bh) && !buffer_uptodate(bh)) {
135 ext4_error(inode->i_sb, 137 struct ext4_super_block *es;
136 "IO error syncing inode, " 138
137 "inode=%lu, block=%llu", 139 es = EXT4_SB(inode->i_sb)->s_es;
138 inode->i_ino, 140 es->s_last_error_block =
139 (unsigned long long) bh->b_blocknr); 141 cpu_to_le64(bh->b_blocknr);
142 ext4_error_inode(inode, where, line,
143 bh->b_blocknr,
144 "IO error syncing itable block");
140 err = -EIO; 145 err = -EIO;
141 } 146 }
142 } 147 }
143 } 148 }
144 return err; 149 return err;
145} 150}
151
152int __ext4_handle_dirty_super(const char *where, unsigned int line,
153 handle_t *handle, struct super_block *sb)
154{
155 struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
156 int err = 0;
157
158 if (ext4_handle_valid(handle)) {
159 err = jbd2_journal_dirty_metadata(handle, bh);
160 if (err)
161 ext4_journal_abort_handle(where, line, __func__,
162 bh, handle, err);
163 } else
164 sb->s_dirt = 1;
165 return err;
166}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..b0bd792c58c5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,39 +122,47 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122/* 122/*
123 * Wrapper functions with which ext4 calls into JBD. 123 * Wrapper functions with which ext4 calls into JBD.
124 */ 124 */
125void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, unsigned int line,
126 const char *err_fn,
126 struct buffer_head *bh, handle_t *handle, int err); 127 struct buffer_head *bh, handle_t *handle, int err);
127 128
128int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 129int __ext4_journal_get_undo_access(const char *where, unsigned int line,
129 struct buffer_head *bh); 130 handle_t *handle, struct buffer_head *bh);
130 131
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 132int __ext4_journal_get_write_access(const char *where, unsigned int line,
132 struct buffer_head *bh); 133 handle_t *handle, struct buffer_head *bh);
133 134
134int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 135int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
135 struct inode *inode, struct buffer_head *bh, 136 int is_metadata, struct inode *inode,
136 ext4_fsblk_t blocknr); 137 struct buffer_head *bh, ext4_fsblk_t blocknr);
137 138
138int __ext4_journal_get_create_access(const char *where, 139int __ext4_journal_get_create_access(const char *where, unsigned int line,
139 handle_t *handle, struct buffer_head *bh); 140 handle_t *handle, struct buffer_head *bh);
140 141
141int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, 142int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
142 struct inode *inode, struct buffer_head *bh); 143 handle_t *handle, struct inode *inode,
144 struct buffer_head *bh);
145
146int __ext4_handle_dirty_super(const char *where, unsigned int line,
147 handle_t *handle, struct super_block *sb);
143 148
144#define ext4_journal_get_undo_access(handle, bh) \ 149#define ext4_journal_get_undo_access(handle, bh) \
145 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 150 __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
146#define ext4_journal_get_write_access(handle, bh) \ 151#define ext4_journal_get_write_access(handle, bh) \
147 __ext4_journal_get_write_access(__func__, (handle), (bh)) 152 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 153#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
149 __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ 154 __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
150 (block_nr)) 155 (bh), (block_nr))
151#define ext4_journal_get_create_access(handle, bh) \ 156#define ext4_journal_get_create_access(handle, bh) \
152 __ext4_journal_get_create_access(__func__, (handle), (bh)) 157 __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
153#define ext4_handle_dirty_metadata(handle, inode, bh) \ 158#define ext4_handle_dirty_metadata(handle, inode, bh) \
154 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 159 __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
160 (bh))
161#define ext4_handle_dirty_super(handle, sb) \
162 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
155 163
156handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 164handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
157int __ext4_journal_stop(const char *where, handle_t *handle); 165int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
158 166
159#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) 167#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
160 168
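All of these wrappers now thread __func__ and __LINE__ from the call site into the error path. What a single call expands to after preprocessing, with an illustrative caller and line number:

	/* In ext4_mkdir() at (say) line 1234:
	 *     err = ext4_journal_get_write_access(handle, bh);
	 * becomes:
	 *     err = __ext4_journal_get_write_access("ext4_mkdir", 1234,
	 *                                           (handle), (bh));
	 * so ext4_journal_abort_handle() can report the exact call site. */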
@@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
207} 215}
208 216
209#define ext4_journal_stop(handle) \ 217#define ext4_journal_stop(handle) \
210 __ext4_journal_stop(__func__, (handle)) 218 __ext4_journal_stop(__func__, __LINE__, (handle))
211 219
212static inline handle_t *ext4_journal_current_handle(void) 220static inline handle_t *ext4_journal_current_handle(void)
213{ 221{
@@ -273,7 +281,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 281 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 282 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 283 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 284 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 285 return 1;
278 return 0; 286 return 0;
279} 287}
@@ -284,7 +292,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 292 return 0;
285 if (!S_ISREG(inode->i_mode)) 293 if (!S_ISREG(inode->i_mode))
286 return 0; 294 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 295 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 296 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 297 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 298 return 1;
@@ -297,7 +305,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 305 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 306 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 307 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 308 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 309 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 310 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 311 return 1;
@@ -308,20 +316,18 @@ static inline int ext4_should_writeback_data(struct inode *inode)
308 * This function controls whether or not we should try to go down the 316 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking 317 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based 318 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is 319 * files, and it doesn't work if data journaling is enabled, since the
312 * enabled, since the dioread_nolock code uses b_private to pass 320 * dioread_nolock code uses b_private to pass information back to the
313 * information back to the I/O completion handler, and this conflicts 321 * I/O completion handler, and this conflicts with the jbd's use of
314 * with the jbd's use of b_private. 322 * b_private.
315 */ 323 */
316static inline int ext4_should_dioread_nolock(struct inode *inode) 324static inline int ext4_should_dioread_nolock(struct inode *inode)
317{ 325{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) 326 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0; 327 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode)) 328 if (!S_ISREG(inode->i_mode))
323 return 0; 329 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 330 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 331 return 0;
326 if (ext4_should_journal_data(inode)) 332 if (ext4_should_journal_data(inode))
327 return 0; 333 return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..06328d3e5717 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -404,9 +401,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
404 return 1; 401 return 1;
405} 402}
406 403
407static int __ext4_ext_check(const char *function, struct inode *inode, 404static int __ext4_ext_check(const char *function, unsigned int line,
408 struct ext4_extent_header *eh, 405 struct inode *inode, struct ext4_extent_header *eh,
409 int depth) 406 int depth)
410{ 407{
411 const char *error_msg; 408 const char *error_msg;
412 int max = 0; 409 int max = 0;
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(inode, function, line, 0,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -450,7 +447,7 @@ corrupted:
450} 447}
451 448
452#define ext4_ext_check(inode, eh, depth) \ 449#define ext4_ext_check(inode, eh, depth) \
453 __ext4_ext_check(__func__, inode, eh, depth) 450 __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
454 451
455int ext4_ext_check_inode(struct inode *inode) 452int ext4_ext_check_inode(struct inode *inode)
456{ 453{
@@ -1086,7 +1083,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1086{ 1083{
1087 struct ext4_ext_path *curp = path; 1084 struct ext4_ext_path *curp = path;
1088 struct ext4_extent_header *neh; 1085 struct ext4_extent_header *neh;
1089 struct ext4_extent_idx *fidx;
1090 struct buffer_head *bh; 1086 struct buffer_head *bh;
1091 ext4_fsblk_t newblock; 1087 ext4_fsblk_t newblock;
1092 int err = 0; 1088 int err = 0;
@@ -1147,10 +1143,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1147 ext4_idx_store_pblock(curp->p_idx, newblock); 1143 ext4_idx_store_pblock(curp->p_idx, newblock);
1148 1144
1149 neh = ext_inode_hdr(inode); 1145 neh = ext_inode_hdr(inode);
1150 fidx = EXT_FIRST_INDEX(neh);
1151 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1146 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1152 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1147 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1153 le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); 1148 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1149 idx_pblock(EXT_FIRST_INDEX(neh)));
1154 1150
1155 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1151 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1156 err = ext4_ext_dirty(handle, inode, curp); 1152 err = ext4_ext_dirty(handle, inode, curp);
@@ -1622,9 +1618,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1618 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1619 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1620 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1621 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1622 }
1629 1623
1630 return merge_done; 1624 return merge_done;
@@ -2039,7 +2033,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2033 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2034 int ret = EXT4_EXT_CACHE_NO;
2041 2035
2042 /* 2036 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2037 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2038 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2039 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2355,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2355 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2356 struct ext4_ext_path *path;
2363 handle_t *handle; 2357 handle_t *handle;
2364 int i = 0, err = 0; 2358 int i, err;
2365 2359
2366 ext_debug("truncate since %u\n", start); 2360 ext_debug("truncate since %u\n", start);
2367 2361
@@ -2370,23 +2364,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2364 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2365 return PTR_ERR(handle);
2372 2366
2367again:
2373 ext4_ext_invalidate_cache(inode); 2368 ext4_ext_invalidate_cache(inode);
2374 2369
2375 /* 2370 /*
2376 * We start scanning from right side, freeing all the blocks 2371 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2372 * after i_size and walking into the tree depth-wise.
2378 */ 2373 */
2374 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2375 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2376 if (path == NULL) {
2381 ext4_journal_stop(handle); 2377 ext4_journal_stop(handle);
2382 return -ENOMEM; 2378 return -ENOMEM;
2383 } 2379 }
2380 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2381 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2382 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2383 err = -EIO;
2387 goto out; 2384 goto out;
2388 } 2385 }
2389 path[0].p_depth = depth; 2386 i = err = 0;
2390 2387
2391 while (i >= 0 && err == 0) { 2388 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2389 if (i == depth) {
@@ -2480,6 +2477,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2477out:
2481 ext4_ext_drop_refs(path); 2478 ext4_ext_drop_refs(path);
2482 kfree(path); 2479 kfree(path);
2480 if (err == -EAGAIN)
2481 goto again;
2483 ext4_journal_stop(handle); 2482 ext4_journal_stop(handle);
2484 2483
2485 return err; 2484 return err;
@@ -2544,7 +2543,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2543/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2544static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2545{
2547 int ret = -EIO; 2546 int ret;
2548 struct bio *bio; 2547 struct bio *bio;
2549 int blkbits, blocksize; 2548 int blkbits, blocksize;
2550 sector_t ee_pblock; 2549 sector_t ee_pblock;
@@ -2568,6 +2567,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2567 len = ee_len;
2569 2568
2570 bio = bio_alloc(GFP_NOIO, len); 2569 bio = bio_alloc(GFP_NOIO, len);
2570 if (!bio)
2571 return -ENOMEM;
2572
2571 bio->bi_sector = ee_pblock; 2573 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2574 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2575
@@ -2595,22 +2597,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2597 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2598 wait_for_completion(&event);
2597 2599
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2600 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2601 bio_put(bio);
2600 else { 2602 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2603 }
2604 bio_put(bio); 2604 bio_put(bio);
2605 ee_len -= done; 2605 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2606 ee_pblock += done << (blkbits - 9);
2607 } 2607 }
2608 return ret; 2608 return 0;
2609} 2609}
2610 2610
2611#define EXT4_EXT_ZERO_LEN 7 2611#define EXT4_EXT_ZERO_LEN 7
2612/* 2612/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2613 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2614 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (up to three - one initialized and two 2615 * extent into multiple extents (up to three - one initialized and two
2616 * uninitialized). 2616 * uninitialized).
@@ -2620,39 +2620,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2620 * c> Splits in three extents: Someone is writing in the middle of the extent 2620 * c> Splits in three extents: Someone is writing in the middle of the extent
2621 */ 2621 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2622static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2623 struct inode *inode,
2624 struct ext4_ext_path *path, 2624 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2625 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2626{
2628 struct ext4_extent *ex, newex, orig_ex; 2627 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2628 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2629 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2630 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2631 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2632 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2633 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2634 ext4_fsblk_t newblock;
2636 int err = 0; 2635 int err = 0;
2637 int ret = 0; 2636 int ret = 0;
2637 int may_zeroout;
2638
2639 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
2640 "block %llu, max_blocks %u\n", inode->i_ino,
2641 (unsigned long long)map->m_lblk, map->m_len);
2642
2643 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2644 inode->i_sb->s_blocksize_bits;
2645 if (eof_block < map->m_lblk + map->m_len)
2646 eof_block = map->m_lblk + map->m_len;
2638 2647
2639 depth = ext_depth(inode); 2648 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2649 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2650 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2651 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2652 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2653 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2654 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2655
2646 ex2 = ex; 2656 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2657 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2658 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2659 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2660
2661 /*
2662 * It is safe to convert an extent to initialized via explicit
2663 * zeroout only if the extent is fully inside i_size or new_size.
2664 */
2665 may_zeroout = ee_block + ee_len <= eof_block;
2666
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2667 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2668 if (err)
2653 goto out; 2669 goto out;
2654 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */ 2670 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2671 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2672 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2673 if (err)
2658 goto fix_extent_len; 2674 goto fix_extent_len;
@@ -2665,10 +2681,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2681 return allocated;
2666 } 2682 }
2667 2683
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2684 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2685 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2686 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2687 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2688 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2689 ex2 = &newex;
2674 } 2690 }
@@ -2677,15 +2693,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2693 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2694 * overlap of blocks.
2679 */ 2695 */
2680 if (!ex1 && allocated > max_blocks) 2696 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2697 ex2->ee_len = cpu_to_le16(map->m_len);
2682 /* ex3: to ee_block + ee_len : uninitialized */ 2698 /* ex3: to ee_block + ee_len : uninitialized */
2683 if (allocated > max_blocks) { 2699 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2700 unsigned int newdepth;
2685 /* If extent has less than EXT4_EXT_ZERO_LEN zero out directly */ 2701 /* If extent has less than EXT4_EXT_ZERO_LEN zero out directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2702 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2703 /*
2688 * iblock == ee_block is handled by the zeroout 2704 * map->m_lblk == ee_block is handled by the zeroout
2689 * at the beginning. 2705 * at the beginning.
2690 * Mark first half uninitialized. 2706 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2707 * Mark second half initialized and zero out the
@@ -2698,7 +2714,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2714 ext4_ext_dirty(handle, inode, path + depth);
2699 2715
2700 ex3 = &newex; 2716 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2717 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2718 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2719 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2720 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2727,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2727 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2728 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2729 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2730 /* blocks available from map->m_lblk */
2715 return allocated; 2731 return allocated;
2716 2732
2717 } else if (err) 2733 } else if (err)
@@ -2733,8 +2749,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2749 */
2734 depth = ext_depth(inode); 2750 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2751 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2752 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2753 path);
2738 if (IS_ERR(path)) { 2754 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2755 err = PTR_ERR(path);
2740 return err; 2756 return err;
@@ -2754,12 +2770,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2770 return allocated;
2755 } 2771 }
2756 ex3 = &newex; 2772 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2773 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2774 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2775 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2776 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2777 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2778 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2779 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2780 if (err)
2765 goto fix_extent_len; 2781 goto fix_extent_len;
@@ -2769,7 +2785,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2786 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2787 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2788 /* blocks available from map->m_lblk */
2773 return allocated; 2789 return allocated;
2774 2790
2775 } else if (err) 2791 } else if (err)
@@ -2783,11 +2799,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2799 * update the extent length after successful insert of the
2784 * split extent 2800 * split extent
2785 */ 2801 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2802 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2803 orig_ex.ee_len = cpu_to_le16(ee_len);
2804 may_zeroout = ee_block + ee_len <= eof_block;
2805
2788 depth = newdepth; 2806 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2807 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2808 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2809 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2810 err = PTR_ERR(path);
2793 goto out; 2811 goto out;
@@ -2801,14 +2819,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2819 if (err)
2802 goto out; 2820 goto out;
2803 2821
2804 allocated = max_blocks; 2822 allocated = map->m_len;
2805 2823
2806 /* If extent has fewer than EXT4_EXT_ZERO_LEN blocks and we are trying 2824 /* If extent has fewer than EXT4_EXT_ZERO_LEN blocks and we are trying
2807 * to insert an extent in the middle, zero out directly, 2825 * to insert an extent in the middle, zero out directly,
2808 * otherwise give the extent a chance to merge to the left 2826 * otherwise give the extent a chance to merge to the left
2809 */ 2827 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2828 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2829 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2830 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2831 if (err)
2814 goto fix_extent_len; 2832 goto fix_extent_len;
@@ -2818,7 +2836,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2836 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2837 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2838 /* zero out the first half */
2821 /* blocks available from iblock */ 2839 /* blocks available from map->m_lblk */
2822 return allocated; 2840 return allocated;
2823 } 2841 }
2824 } 2842 }
@@ -2829,12 +2847,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2847 */
2830 if (ex1 && ex1 != ex) { 2848 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2849 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2850 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2851 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2852 ex2 = &newex;
2835 } 2853 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2854 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2855 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2856 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2857 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2858 if (ex2 != ex)
@@ -2877,7 +2895,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2895 goto out;
2878insert: 2896insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2897 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2898 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2899 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2900 if (err)
2883 goto fix_extent_len; 2901 goto fix_extent_len;
@@ -2904,7 +2922,7 @@ fix_extent_len:
2904} 2922}
2905 2923
2906/* 2924/*
2907 * This function is called by ext4_ext_get_blocks() from 2925 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when direct I/O is used to write 2926 * ext4_get_blocks_dio_write() when direct I/O is used to write
2909 * to an uninitialized extent. 2927 * to an uninitialized extent.
2910 * 2928 *
@@ -2918,7 +2936,7 @@ fix_extent_len:
2918 * One or more index blocks may be needed if the extent tree grows after 2936 * One or more index blocks may be needed if the extent tree grows after
2919 * the uninitialized extent split. To prevent ENOSPC from occurring at IO 2937 * the uninitialized extent split. To prevent ENOSPC from occurring at IO
2920 * completion, we need to split the uninitialized extent before the DIO 2938 * completion, we need to split the uninitialized extent before the DIO
2921 * is submitted. The uninitilized extent handled here will be split 2939 * is submitted. The uninitialized extent handled here will be split
2922 * into (at most) three uninitialized extents. After IO completes, the part 2940 * into (at most) three uninitialized extents. After IO completes, the part
2923 * being filled will be converted to initialized by the end_io callback function 2941 * being filled will be converted to initialized by the end_io callback function
2924 * via ext4_convert_unwritten_extents(). 2942 * via ext4_convert_unwritten_extents().
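Doing the split before the direct I/O is submitted front-loads any index-block growth, so an ENOSPC surfaces at submission rather than in the completion path; completion then only has to flip the middle piece to written. A sketch of the at-most-three pieces, with illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned int ee_block = 100, ee_len = 50; /* original extent */
            unsigned int m_lblk = 110, m_len = 20;    /* DIO target range */

            unsigned int ex1_len = m_lblk - ee_block;          /* head: 10 */
            unsigned int ex2_len = m_len;                      /* I/O:  20 */
            unsigned int ex3_len = ee_len - ex1_len - ex2_len; /* tail: 20 */

            printf("head %u, io %u, tail %u blocks, all uninitialized\n",
                   ex1_len, ex2_len, ex3_len);
            return 0;
    }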
@@ -2927,51 +2945,62 @@ fix_extent_len:
2927 */ 2945 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2946static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2947 struct inode *inode,
2948 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2949 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2950 int flags)
2934{ 2951{
2935 struct ext4_extent *ex, newex, orig_ex; 2952 struct ext4_extent *ex, newex, orig_ex;
2936 struct ext4_extent *ex1 = NULL; 2953 struct ext4_extent *ex1 = NULL;
2937 struct ext4_extent *ex2 = NULL; 2954 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2955 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2956 ext4_lblk_t ee_block, eof_block;
2940 ext4_lblk_t ee_block;
2941 unsigned int allocated, ee_len, depth; 2957 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2958 ext4_fsblk_t newblock;
2943 int err = 0; 2959 int err = 0;
2960 int may_zeroout;
2961
2962 ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
2963 "block %llu, max_blocks %u\n", inode->i_ino,
2964 (unsigned long long)map->m_lblk, map->m_len);
2965
2966 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2967 inode->i_sb->s_blocksize_bits;
2968 if (eof_block < map->m_lblk + map->m_len)
2969 eof_block = map->m_lblk + map->m_len;
2944 2970
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2971 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2972 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2973 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2974 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2975 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2976 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2977
2955 ex2 = ex; 2978 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2979 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2980 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2981 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2982
2960 /* 2983 /*
2984 * It is safe to convert extent to initialized via explicit
2985 * zeroout only if extent is fully inside i_size or new_size.
2986 */
2987 may_zeroout = ee_block + ee_len <= eof_block;
2988
2989 /*
2961 * If the uninitialized extent begins at the same logical 2990 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2991 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2992 * covers the extent, then we don't need to split it.
2964 */ 2993 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2994 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2995 return allocated;
2967 2996
2968 err = ext4_ext_get_access(handle, inode, path + depth); 2997 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 2998 if (err)
2970 goto out; 2999 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3000 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3001 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3002 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3003 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3004 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3005 ex2 = &newex;
2977 } 3006 }
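The eof_block computation introduced above rounds i_size up to a whole block, then widens it to at least cover the write target, so that may_zeroout means "the extent lies fully inside i_size or the new size". A worked userspace sketch with illustrative values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long i_size = 10000;  /* bytes */
            unsigned int blkbits = 12;          /* 4096-byte blocks */
            unsigned int m_lblk = 5, m_len = 4; /* write target */

            /* Round i_size up to a block boundary: (10000 + 4095) >> 12 == 3. */
            unsigned long long eof_block =
                    (i_size + (1ULL << blkbits) - 1) >> blkbits;
            /* The write extends the file, so cover it too: eof_block == 9. */
            if (eof_block < m_lblk + m_len)
                    eof_block = m_lblk + m_len;

            printf("eof_block = %llu\n", eof_block);
            return 0;
    }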
@@ -2980,18 +3009,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3009 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3010 * overlap of blocks.
2982 */ 3011 */
2983 if (!ex1 && allocated > max_blocks) 3012 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3013 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3014 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3015 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3016 unsigned int newdepth;
2988 ex3 = &newex; 3017 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3018 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3019 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3020 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3021 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3022 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3023 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3024 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3025 if (err)
2997 goto fix_extent_len; 3026 goto fix_extent_len;
@@ -3001,7 +3030,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3030 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3031 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3032 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3033 /* blocks available from map->m_lblk */
3005 return allocated; 3034 return allocated;
3006 3035
3007 } else if (err) 3036 } else if (err)
@@ -3015,16 +3044,17 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3044 * update the extent length after successful insert of the
3016 * split extent 3045 * split extent
3017 */ 3046 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3047 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3048 orig_ex.ee_len = cpu_to_le16(ee_len);
3049 may_zeroout = ee_block + ee_len <= eof_block;
3050
3020 depth = newdepth; 3051 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3052 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3053 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3054 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3055 err = PTR_ERR(path);
3025 goto out; 3056 goto out;
3026 } 3057 }
3027 eh = path[depth].p_hdr;
3028 ex = path[depth].p_ext; 3058 ex = path[depth].p_ext;
3029 if (ex2 != &newex) 3059 if (ex2 != &newex)
3030 ex2 = ex; 3060 ex2 = ex;
@@ -3033,7 +3063,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3063 if (err)
3034 goto out; 3064 goto out;
3035 3065
3036 allocated = max_blocks; 3066 allocated = map->m_len;
3037 } 3067 }
3038 /* 3068 /*
3039 * If there was a change of depth as part of the 3069 * If there was a change of depth as part of the
@@ -3042,15 +3072,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3072 */
3043 if (ex1 && ex1 != ex) { 3073 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3074 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3075 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3076 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3077 ex2 = &newex;
3048 } 3078 }
3049 /* 3079 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3080 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3081 * using direct I/O, uninitialised still.
3052 */ 3082 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3083 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3084 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3085 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3086 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3092,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3092 goto out;
3063insert: 3093insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3094 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3095 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3096 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3097 if (err)
3068 goto fix_extent_len; 3098 goto fix_extent_len;
@@ -3152,10 +3182,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3182
3153static int 3183static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3184ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3185 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3186 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3187 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3188{
3160 int ret = 0; 3189 int ret = 0;
3161 int err = 0; 3190 int err = 0;
@@ -3163,15 +3192,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3192
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " 3193 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3194 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3195 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3196 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3197 ext4_ext_show_leaf(inode, path);
3169 3198
3170 /* get_block() called before submitting the IO: split the extent */ 3199 /* get_block() called before submitting the IO: split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3200 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3201 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3202 path, flags);
3174 max_blocks, flags);
3175 /* 3203 /*
3176 * Flag the inode (non-AIO case) or end_io struct (AIO case) 3204 * Flag the inode (non-AIO case) or end_io struct (AIO case)
3177 * to record that this IO needs conversion to written when IO is 3205 * to record that this IO needs conversion to written when IO is
@@ -3182,7 +3210,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3210 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3211 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3212 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3213 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3214 goto out;
3187 } 3215 }
3188 /* IO end_io complete, convert the filled extent to written */ 3216 /* IO end_io complete, convert the filled extent to written */
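Whether the "convert to written on completion" marker lives on a per-request io_end object (AIO) or on the inode itself (synchronous DIO) is decided in this hunk. A simplified model with stand-in types, not the kernel's structures:

    #include <stdbool.h>
    #include <stddef.h>

    struct io_end  { bool unwritten; };      /* stands in for ext4_io_end_t */
    struct inode_s { bool dio_unwritten; };  /* stands in for inode state bit */

    static void flag_unwritten(struct io_end *io, struct inode_s *inode)
    {
            if (io)
                    io->unwritten = true;        /* AIO: flag the request */
            else
                    inode->dio_unwritten = true; /* sync DIO: flag the inode */
    }

    int main(void)
    {
            struct io_end io = { false };
            struct inode_s ino = { false };

            flag_unwritten(&io, &ino);  /* AIO path */
            flag_unwritten(NULL, &ino); /* synchronous DIO path */
            return io.unwritten && ino.dio_unwritten ? 0 : 1;
    }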
@@ -3210,14 +3238,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3238 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3239 * a read from the block returns 0s.
3212 */ 3240 */
3213 set_buffer_unwritten(bh_result); 3241 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3242 goto out1;
3215 } 3243 }
3216 3244
3217 /* buffered write, writepage time, convert */ 3245 /* buffered write, writepage time, convert */
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3246 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3247 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3248 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3249out:
@@ -3226,7 +3252,7 @@ out:
3226 goto out2; 3252 goto out2;
3227 } else 3253 } else
3228 allocated = ret; 3254 allocated = ret;
3229 set_buffer_new(bh_result); 3255 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3256 /*
3231 * if we allocated more blocks than requested 3257 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3258 * we need to make sure we unmap the extra block
@@ -3234,11 +3260,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3260 * unmapped later when we find the buffer_head marked
3235 * new. 3261 * new.
3236 */ 3262 */
3237 if (allocated > max_blocks) { 3263 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3264 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3265 newblock + map->m_len,
3240 allocated - max_blocks); 3266 allocated - map->m_len);
3241 allocated = max_blocks; 3267 allocated = map->m_len;
3242 } 3268 }
3243 3269
3244 /* 3270 /*
@@ -3252,13 +3278,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3278 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3279
3254map_out: 3280map_out:
3255 set_buffer_mapped(bh_result); 3281 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3282out1:
3257 if (allocated > max_blocks) 3283 if (allocated > map->m_len)
3258 allocated = max_blocks; 3284 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3285 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3286 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3287 map->m_len = allocated;
3262out2: 3288out2:
3263 if (path) { 3289 if (path) {
3264 ext4_ext_drop_refs(path); 3290 ext4_ext_drop_refs(path);
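From here on, results flow back through the mapping structure instead of a buffer_head. A minimal model of the struct this patch threads around, using the field names visible in the hunks above; the flag bit values are illustrative, not the kernel's:

    #include <stdint.h>

    #define EXT4_MAP_NEW       (1u << 0) /* blocks freshly allocated */
    #define EXT4_MAP_MAPPED    (1u << 1) /* m_pblk/m_len are valid */
    #define EXT4_MAP_UNWRITTEN (1u << 2) /* extent exists but is unwritten */
    #define EXT4_MAP_UNINIT    (1u << 3) /* dioread_nolock bookkeeping */

    struct ext4_map_blocks_model {
            uint64_t m_pblk;  /* out: first physical block of the mapping */
            uint32_t m_lblk;  /* in: first logical block requested */
            uint32_t m_len;   /* in: blocks wanted; out: blocks mapped */
            uint32_t m_flags; /* out: EXT4_MAP_* result bits */
    };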
@@ -3284,26 +3310,23 @@ out2:
3284 * 3310 *
3285 * return < 0, error case. 3311 * return < 0, error case.
3286 */ 3312 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3313int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3314 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3315{
3292 struct ext4_ext_path *path = NULL; 3316 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3317 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3318 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3319 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3320 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3321 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3322 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3323 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3324
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3325 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3326 map->m_lblk, map->m_len, inode->i_ino);
3304 3327
3305 /* check in cache */ 3328 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3329 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3330 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3331 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3339,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3339 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3340 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3341 /* block is already allocated */
3319 newblock = iblock 3342 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3343 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3344 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3345 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3346 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3347 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3348 goto out;
3326 } else { 3349 } else {
3327 BUG(); 3350 BUG();
@@ -3329,7 +3352,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3352 }
3330 3353
3331 /* find extent for this block */ 3354 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3355 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3356 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3357 err = PTR_ERR(path);
3335 path = NULL; 3358 path = NULL;
@@ -3345,8 +3368,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3368 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3369 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3370 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3371 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3372 (unsigned long) map->m_lblk, depth,
3373 path[depth].p_block);
3350 err = -EIO; 3374 err = -EIO;
3351 goto out2; 3375 goto out2;
3352 } 3376 }
@@ -3364,12 +3388,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3388 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3389 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3390 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3391 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3392 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3393 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3394 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3395 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3396 ee_block, ee_len, newblock);
3373 3397
3374 /* Do not put uninitialized extent in the cache */ 3398 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3399 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3403,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3403 goto out;
3380 } 3404 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3405 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3406 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3407 newblock);
3384 return ret; 3408 return ret;
3385 } 3409 }
3386 } 3410 }
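A hit in the tree (or the one-extent cache) turns into a physical address by offsetting into the extent, and the usable length is whatever remains of that extent. A worked sketch with illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned int ee_block = 200, ee_len = 16; /* logical extent */
            unsigned long long ee_start = 9000;       /* its physical start */
            unsigned int m_lblk = 205;                /* requested block */

            unsigned long long newblock = m_lblk - ee_block + ee_start; /* 9005 */
            unsigned int allocated = ee_len - (m_lblk - ee_block);      /* 11 */

            printf("pblk %llu, %u blocks remain in the extent\n",
                   newblock, allocated);
            return 0;
    }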
@@ -3394,7 +3418,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3418 * put just found gap into cache to speed up
3395 * subsequent requests 3419 * subsequent requests
3396 */ 3420 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3421 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3422 goto out2;
3399 } 3423 }
3400 /* 3424 /*
@@ -3402,11 +3426,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3426 */
3403 3427
3404 /* find neighbour allocated blocks */ 3428 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3429 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3430 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3431 if (err)
3408 goto out2; 3432 goto out2;
3409 ar.lright = iblock; 3433 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3434 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3435 if (err)
3412 goto out2; 3436 goto out2;
@@ -3417,26 +3441,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3441 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3442 * EXT_UNINIT_MAX_LEN.
3419 */ 3443 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3444 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3445 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3446 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3447 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3448 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3449 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3450
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3451 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3452 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3453 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3454 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3455 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3456 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3457 else
3434 allocated = max_blocks; 3458 allocated = map->m_len;
3435 3459
3436 /* allocate new block */ 3460 /* allocate new block */
3437 ar.inode = inode; 3461 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3462 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3463 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3464 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3465 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3466 ar.flags = EXT4_MB_HINT_DATA;
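The two limits in the clamp above come from the on-disk extent format: ee_len is a 16-bit field whose top bit marks an extent as uninitialized, so an initialized extent can span 32768 blocks but an uninitialized one only 32767. A sketch:

    #include <stdio.h>

    #define EXT_INIT_MAX_LEN   (1u << 15)             /* 32768 */
    #define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) /* 32767 */

    static unsigned int clamp_request(unsigned int m_len, int want_uninit)
    {
            unsigned int max = want_uninit ? EXT_UNINIT_MAX_LEN
                                           : EXT_INIT_MAX_LEN;
            return m_len > max ? max : m_len;
    }

    int main(void)
    {
            printf("%u\n", clamp_request(100000, 0)); /* 32768 */
            printf("%u\n", clamp_request(100000, 1)); /* 32767 */
            return 0;
    }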
@@ -3470,21 +3494,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3494 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3495 }
3472 if (ext4_should_dioread_nolock(inode)) 3496 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3497 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3498 }
3475 3499
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3500 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3501 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3502 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3503 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3504 "EOFBLOCKS_FL set");
3481 err = -EIO; 3505 err = -EIO;
3482 goto out2; 3506 goto out2;
3483 } 3507 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3508 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3509 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3510 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3511 * the last index block all the way down the tree, and
3512 * we are extending the inode beyond the last extent
3513 * in the current leaf block, then clear the
3514 * EOFBLOCKS_FL flag.
3515 */
3516 for (i = depth-1; i >= 0; i--) {
3517 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3518 break;
3519 }
3520 if ((i < 0) &&
3521 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3522 ext4_ext_get_actual_len(last_ex)))
3523 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3524 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3525 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3526 if (err) {
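The loop added above only permits clearing EOFBLOCKS when the leaf was reached through the last index at every interior level, i.e. it really is the rightmost leaf. A sketch of the test with stand-in types for the extent-tree path:

    /* Each level of the lookup path, reduced to "which index did we take"
     * and "which index is the last one in that node". */
    struct path_level { int p_idx, last_idx; };

    static int on_rightmost_path(const struct path_level *path, int depth)
    {
            int i;

            for (i = depth - 1; i >= 0; i--)
                    if (path[i].p_idx != path[i].last_idx)
                            return 0; /* an interior node has a right sibling */
            return 1;
    }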
@@ -3500,9 +3536,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3536 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3537 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3538 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3539 if (allocated > map->m_len)
3504 allocated = max_blocks; 3540 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3541 map->m_flags |= EXT4_MAP_NEW;
3506 3542
3507 /* 3543 /*
3508 * Update reserved blocks/metadata blocks after successful 3544 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3552,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3552 * when it is _not_ an uninitialized extent.
3517 */ 3553 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3554 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3555 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3556 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3557 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3558 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3559 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3560out:
3525 if (allocated > max_blocks) 3561 if (allocated > map->m_len)
3526 allocated = max_blocks; 3562 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3563 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3564 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3565 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3566 map->m_len = allocated;
3531out2: 3567out2:
3532 if (path) { 3568 if (path) {
3533 ext4_ext_drop_refs(path); 3569 ext4_ext_drop_refs(path);
@@ -3625,7 +3661,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3661 * can proceed even if the new size is the same as i_size.
3626 */ 3662 */
3627 if (new_size > i_size_read(inode)) 3663 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3664 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3665 }
3630 3666
3631} 3667}
@@ -3640,55 +3676,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3676long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3677{
3642 handle_t *handle; 3678 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3679 loff_t new_size;
3645 unsigned int max_blocks; 3680 unsigned int max_blocks;
3646 int ret = 0; 3681 int ret = 0;
3647 int ret2 = 0; 3682 int ret2 = 0;
3648 int retries = 0; 3683 int retries = 0;
3649 struct buffer_head map_bh; 3684 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3685 unsigned int credits, blkbits = inode->i_blkbits;
3651 3686
3652 /* 3687 /*
3653 * currently supporting (pre)allocate mode for extent-based 3688 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3689 * files _only_
3655 */ 3690 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3691 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3692 return -EOPNOTSUPP;
3658 3693
3659 /* preallocation to directories is currently not supported */ 3694 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3695 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3696 return -ENODEV;
3662 3697
3663 block = offset >> blkbits; 3698 map.m_lblk = offset >> blkbits;
3664 /* 3699 /*
3665 * We can't just convert len to max_blocks because 3700 * We can't just convert len to max_blocks because
3666 * If blocksize = 4096 offset = 3072 and len = 2048 3701 * If blocksize = 4096 offset = 3072 and len = 2048
3667 */ 3702 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3703 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3704 - map.m_lblk;
3670 /* 3705 /*
3671 * credits to insert 1 extent into extent tree 3706 * credits to insert 1 extent into extent tree
3672 */ 3707 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3708 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3709 mutex_lock(&inode->i_mutex);
3710 ret = inode_newsize_ok(inode, (len + offset));
3711 if (ret) {
3712 mutex_unlock(&inode->i_mutex);
3713 return ret;
3714 }
3675retry: 3715retry:
3676 while (ret >= 0 && ret < max_blocks) { 3716 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3717 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3718 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3719 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3720 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3721 ret = PTR_ERR(handle);
3682 break; 3722 break;
3683 } 3723 }
3684 map_bh.b_state = 0; 3724 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3725 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3726 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3727#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3728 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3729 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3730 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3731 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3732 inode->i_ino, block, max_blocks);
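The comment's example is worth working through: with blocksize 4096, offset 3072 and len 2048 the write touches blocks 0 and 1, yet len >> blkbits is 0, so max_blocks has to come from the block-aligned end minus the start block. A sketch (EXT4_BLOCK_ALIGN rounds up; modeled here with a mask):

    #include <stdio.h>

    int main(void)
    {
            unsigned int blkbits = 12; /* blocksize 4096 */
            unsigned long long offset = 3072, len = 2048;
            unsigned long long mask = (1ULL << blkbits) - 1;

            unsigned long long start = offset >> blkbits;              /* 0 */
            unsigned long long end = (offset + len + mask) >> blkbits; /* 2 */

            printf("max_blocks = %llu, len >> blkbits = %llu\n",
                   end - start, len >> blkbits);
            return 0;
    }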
@@ -3697,14 +3735,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3735 ret2 = ext4_journal_stop(handle);
3698 break; 3736 break;
3699 } 3737 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3738 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3739 blkbits) >> blkbits))
3702 new_size = offset + len; 3740 new_size = offset + len;
3703 else 3741 else
3704 new_size = (block + ret) << blkbits; 3742 new_size = (map.m_lblk + ret) << blkbits;
3705 3743
3706 ext4_falloc_update_inode(inode, mode, new_size, 3744 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3745 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3746 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3747 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3748 if (ret2)
@@ -3733,42 +3771,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3771 ssize_t len)
3734{ 3772{
3735 handle_t *handle; 3773 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3774 unsigned int max_blocks;
3738 int ret = 0; 3775 int ret = 0;
3739 int ret2 = 0; 3776 int ret2 = 0;
3740 struct buffer_head map_bh; 3777 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3778 unsigned int credits, blkbits = inode->i_blkbits;
3742 3779
3743 block = offset >> blkbits; 3780 map.m_lblk = offset >> blkbits;
3744 /* 3781 /*
3745 * We can't just convert len to max_blocks because the range 3782 * We can't just convert len to max_blocks because the range
3746 * may straddle a block boundary, e.g. blocksize = 4096, offset = 3072, len = 2048 3783 * may straddle a block boundary, e.g. blocksize = 4096, offset = 3072, len = 2048
3747 */ 3784 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3785 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3786 map.m_lblk);
3750 /* 3787 /*
3751 * credits to insert 1 extent into extent tree 3788 * credits to insert 1 extent into extent tree
3752 */ 3789 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3790 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3791 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3792 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3793 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3794 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3795 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3796 ret = PTR_ERR(handle);
3760 break; 3797 break;
3761 } 3798 }
3762 map_bh.b_state = 0; 3799 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3800 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3801 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3802 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3803 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3804 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3805 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3806 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3807 }
3773 ext4_mark_inode_dirty(handle, inode); 3808 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3809 ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3933,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3898 int error = 0; 3933 int error = 0;
3899 3934
3900 /* fallback to generic here if not in extents fmt */ 3935 /* fallback to generic here if not in extents fmt */
3901 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3936 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3902 return generic_block_fiemap(inode, fieinfo, start, len, 3937 return generic_block_fiemap(inode, fieinfo, start, len,
3903 ext4_get_block); 3938 ext4_get_block);
3904 3939
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..ee92b66d4558 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,11 +66,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
73 if (pos > sbi->s_bitmap_maxbytes) 73 if ((pos > sbi->s_bitmap_maxbytes ||
74 (pos == sbi->s_bitmap_maxbytes && length > 0)))
74 return -EFBIG; 75 return -EFBIG;
75 76
76 if (pos + length > sbi->s_bitmap_maxbytes) { 77 if (pos + length > sbi->s_bitmap_maxbytes) {
@@ -123,7 +124,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
123 if (!IS_ERR(cp)) { 124 if (!IS_ERR(cp)) {
124 memcpy(sbi->s_es->s_last_mounted, cp, 125 memcpy(sbi->s_es->s_last_mounted, cp,
125 sizeof(sbi->s_es->s_last_mounted)); 126 sizeof(sbi->s_es->s_last_mounted));
126 sb->s_dirt = 1; 127 ext4_mark_super_dirty(sb);
127 } 128 }
128 } 129 }
129 return dquot_file_open(inode, filp); 130 return dquot_file_open(inode, filp);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply for
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
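The walk in ext4_sync_parent() stops at the first ancestor not flagged as newly created, so directories that were already durable are not re-synced. A simplified userspace model with stand-in structures (the real code follows dentry->d_parent and calls sync_mapping_buffers() on each parent inode):

    #include <stdbool.h>
    #include <stddef.h>

    struct dir {
            bool new_entry;     /* models EXT4_STATE_NEWENTRY */
            struct dir *parent; /* NULL at the filesystem root */
    };

    static void sync_parents(struct dir *d)
    {
            while (d && d->new_entry) {
                    d->new_entry = false;
                    /* sync_mapping_buffers(d->inode->i_mapping) here */
                    d = d->parent;
            }
    }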
@@ -48,9 +71,9 @@
48 * i_mutex lock is held when entering and exiting this function 71 * i_mutex lock is held when entering and exiting this function
49 */ 72 */
50 73
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 74int ext4_sync_file(struct file *file, int datasync)
52{ 75{
53 struct inode *inode = dentry->d_inode; 76 struct inode *inode = file->f_mapping->host;
54 struct ext4_inode_info *ei = EXT4_I(inode); 77 struct ext4_inode_info *ei = EXT4_I(inode);
55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
56 int ret; 79 int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
58 81
59 J_ASSERT(ext4_journal_current_handle() == NULL); 82 J_ASSERT(ext4_journal_current_handle() == NULL);
60 83
61 trace_ext4_sync_file(file, dentry, datasync); 84 trace_ext4_sync_file(file, datasync);
62 85
63 if (inode->i_sb->s_flags & MS_RDONLY) 86 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 87 return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -100,9 +127,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 127 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 128 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 jbd2_log_wait_commit(journal, commit_tid); 131 NULL, BLKDEV_IFL_WAIT);
132 ret = jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
135 BLKDEV_IFL_WAIT);
107 return ret; 136 return ret;
108} 137}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 57f6eef6ccd6..45853e0d1f21 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -222,7 +222,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
222 is_directory = S_ISDIR(inode->i_mode); 222 is_directory = S_ISDIR(inode->i_mode);
223 223
224 /* Do this BEFORE marking the inode not in use or returning an error */ 224 /* Do this BEFORE marking the inode not in use or returning an error */
225 clear_inode(inode); 225 ext4_clear_inode(inode);
226 226
227 es = EXT4_SB(sb)->s_es; 227 es = EXT4_SB(sb)->s_es;
228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter); 262 }
276 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
277 if (sbi->s_log_groups_per_flex) { 264 ext4_unlock_group(sb, block_group);
278 ext4_group_t f; 265
279 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
280 f = ext4_flex_group(sbi, block_group); 267 if (sbi->s_log_groups_per_flex) {
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 268 ext4_group_t f = ext4_flex_group(sbi, block_group);
282 } 269
283 } 270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); 271 if (is_directory)
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2); 272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
286 if (!fatal) fatal = err;
287 } 273 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
290 if (!fatal) 276out:
291 fatal = err; 277 if (cleared) {
292 sb->s_dirt = 1; 278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 ext4_mark_super_dirty(sb);
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -972,23 +965,19 @@ got:
972 percpu_counter_dec(&sbi->s_freeinodes_counter); 965 percpu_counter_dec(&sbi->s_freeinodes_counter);
973 if (S_ISDIR(mode)) 966 if (S_ISDIR(mode))
974 percpu_counter_inc(&sbi->s_dirs_counter); 967 percpu_counter_inc(&sbi->s_dirs_counter);
975 sb->s_dirt = 1; 968 ext4_mark_super_dirty(sb);
976 969
977 if (sbi->s_log_groups_per_flex) { 970 if (sbi->s_log_groups_per_flex) {
978 flex_group = ext4_flex_group(sbi, group); 971 flex_group = ext4_flex_group(sbi, group);
979 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 972 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
980 } 973 }
981 974
982 inode->i_uid = current_fsuid(); 975 if (test_opt(sb, GRPID)) {
983 if (test_opt(sb, GRPID)) 976 inode->i_mode = mode;
984 inode->i_gid = dir->i_gid; 977 inode->i_uid = current_fsuid();
985 else if (dir->i_mode & S_ISGID) {
986 inode->i_gid = dir->i_gid; 978 inode->i_gid = dir->i_gid;
987 if (S_ISDIR(mode))
988 mode |= S_ISGID;
989 } else 979 } else
990 inode->i_gid = current_fsgid(); 980 inode_init_owner(inode, dir, mode);
991 inode->i_mode = mode;
992 981
993 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 982 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
994 /* This is the optimal IO size (for stat), not the fs block size */ 983 /* This is the optimal IO size (for stat), not the fs block size */
@@ -1045,7 +1034,7 @@ got:
1045 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1046 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1047 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1048 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1049 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1050 } 1039 }
1051 } 1040 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 81d605412844..4b8debeb3965 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
149 int ret; 149 int ret;
150 150
151 /* 151 /*
152 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
153 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
154 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
155 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -167,11 +167,16 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
167/* 167/*
168 * Called at the last iput() if i_nlink is zero. 168 * Called at the last iput() if i_nlink is zero.
169 */ 169 */
170void ext4_delete_inode(struct inode *inode) 170void ext4_evict_inode(struct inode *inode)
171{ 171{
172 handle_t *handle; 172 handle_t *handle;
173 int err; 173 int err;
174 174
175 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete;
178 }
179
175 if (!is_bad_inode(inode)) 180 if (!is_bad_inode(inode))
176 dquot_initialize(inode); 181 dquot_initialize(inode);
177 182
@@ -221,6 +226,7 @@ void ext4_delete_inode(struct inode *inode)
221 "couldn't extend journal (err %d)", err); 226 "couldn't extend journal (err %d)", err);
222 stop_handle: 227 stop_handle:
223 ext4_journal_stop(handle); 228 ext4_journal_stop(handle);
229 ext4_orphan_del(NULL, inode);
224 goto no_delete; 230 goto no_delete;
225 } 231 }
226 } 232 }
@@ -245,13 +251,13 @@ void ext4_delete_inode(struct inode *inode)
245 */ 251 */
246 if (ext4_mark_inode_dirty(handle, inode)) 252 if (ext4_mark_inode_dirty(handle, inode))
247 /* If that failed, just do the required in-core inode clear. */ 253 /* If that failed, just do the required in-core inode clear. */
248 clear_inode(inode); 254 ext4_clear_inode(inode);
249 else 255 else
250 ext4_free_inode(handle, inode); 256 ext4_free_inode(handle, inode);
251 ext4_journal_stop(handle); 257 ext4_journal_stop(handle);
252 return; 258 return;
253no_delete: 259no_delete:
254 clear_inode(inode); /* We must guarantee clearing of inode... */ 260 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
255} 261}
256 262
257typedef struct { 263typedef struct {
@@ -337,9 +343,11 @@ static int ext4_block_to_path(struct inode *inode,
337 return n; 343 return n;
338} 344}
339 345
340static int __ext4_check_blockref(const char *function, struct inode *inode, 346static int __ext4_check_blockref(const char *function, unsigned int line,
347 struct inode *inode,
341 __le32 *p, unsigned int max) 348 __le32 *p, unsigned int max)
342{ 349{
350 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
343 __le32 *bref = p; 351 __le32 *bref = p;
344 unsigned int blk; 352 unsigned int blk;
345 353
@@ -348,9 +356,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 356 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 357 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 358 blk, 1))) {
351 __ext4_error(inode->i_sb, function, 359 es->s_last_error_block = cpu_to_le64(blk);
352 "invalid block reference %u " 360 ext4_error_inode(inode, function, line, blk,
353 "in inode #%lu", blk, inode->i_ino); 361 "invalid block");
354 return -EIO; 362 return -EIO;
355 } 363 }
356 } 364 }
@@ -359,11 +367,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
359 367
360 368
361#define ext4_check_indirect_blockref(inode, bh) \ 369#define ext4_check_indirect_blockref(inode, bh) \
362 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ 370 __ext4_check_blockref(__func__, __LINE__, inode, \
371 (__le32 *)(bh)->b_data, \
363 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 372 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
364 373
365#define ext4_check_inode_blockref(inode) \ 374#define ext4_check_inode_blockref(inode) \
366 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ 375 __ext4_check_blockref(__func__, __LINE__, inode, \
376 EXT4_I(inode)->i_data, \
367 EXT4_NDIR_BLOCKS) 377 EXT4_NDIR_BLOCKS)
368 378
369/** 379/**
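Because __func__ and __LINE__ are evaluated where the macro is expanded, every message in the hunk above names the caller rather than the shared checking helper. A small sketch of the pattern:

    #include <stdio.h>

    static void report(const char *function, unsigned int line,
                       unsigned int blk)
    {
            fprintf(stderr, "%s:%u: invalid block %u\n", function, line, blk);
    }

    /* Expanded at the call site, so __func__/__LINE__ identify the caller. */
    #define check_block(blk) report(__func__, __LINE__, (blk))

    int main(void)
    {
            check_block(0); /* prints "main:<line>: invalid block 0" */
            return 0;
    }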
@@ -785,7 +795,7 @@ failed:
785 /* Allocation failed, free what we already allocated */ 795 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 796 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
787 for (i = 1; i <= n ; i++) { 797 for (i = 1; i <= n ; i++) {
788 /* 798 /*
789 * branch[i].bh is newly allocated, so there is no 799 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't 800 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA. 801 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +885,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
875 885
876err_out: 886err_out:
877 for (i = 1; i <= num; i++) { 887 for (i = 1; i <= num; i++) {
878 /* 888 /*
879 * branch[i].bh is newly allocated, so there is no 889 * branch[i].bh is newly allocated, so there is no
880 * need to revoke the block, which is why we don't 890 * need to revoke the block, which is why we don't
881 * need to set EXT4_FREE_BLOCKS_METADATA. 891 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +900,9 @@ err_out:
890} 900}
891 901
892/* 902/*
893 * The ext4_ind_get_blocks() function handles non-extents inodes 903 * The ext4_ind_map_blocks() function handles non-extents inodes
894 * (i.e., using the traditional indirect/double-indirect i_blocks 904 * (i.e., using the traditional indirect/double-indirect i_blocks
895 * scheme) for ext4_get_blocks(). 905 * scheme) for ext4_map_blocks().
896 * 906 *
897 * Allocation strategy is simple: if we have to allocate something, we will 907 * Allocation strategy is simple: if we have to allocate something, we will
898 * have to go the whole way to leaf. So let's do it before attaching anything 908 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +927,8 @@ err_out:
917 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 927 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
918 * blocks. 928 * blocks.
919 */ 929 */
920static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 930static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
921 ext4_lblk_t iblock, unsigned int maxblocks, 931 struct ext4_map_blocks *map,
922 struct buffer_head *bh_result,
923 int flags) 932 int flags)
924{ 933{
925 int err = -EIO; 934 int err = -EIO;
@@ -933,9 +942,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
933 int count = 0; 942 int count = 0;
934 ext4_fsblk_t first_block = 0; 943 ext4_fsblk_t first_block = 0;
935 944
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 945 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
937 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 946 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 947 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
939 &blocks_to_boundary); 948 &blocks_to_boundary);
940 949
941 if (depth == 0) 950 if (depth == 0)
@@ -946,10 +955,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
946 /* Simplest case - block found, no allocation needed */ 955 /* Simplest case - block found, no allocation needed */
947 if (!partial) { 956 if (!partial) {
948 first_block = le32_to_cpu(chain[depth - 1].key); 957 first_block = le32_to_cpu(chain[depth - 1].key);
949 clear_buffer_new(bh_result);
950 count++; 958 count++;
951 /*map more blocks*/ 959 /*map more blocks*/
952 while (count < maxblocks && count <= blocks_to_boundary) { 960 while (count < map->m_len && count <= blocks_to_boundary) {
953 ext4_fsblk_t blk; 961 ext4_fsblk_t blk;
954 962
955 blk = le32_to_cpu(*(chain[depth-1].p + count)); 963 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +977,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
969 /* 977 /*
970 * Okay, we need to do block allocation. 978 * Okay, we need to do block allocation.
971 */ 979 */
972 goal = ext4_find_goal(inode, iblock, partial); 980 goal = ext4_find_goal(inode, map->m_lblk, partial);
973 981
974 /* the number of blocks need to allocate for [d,t]indirect blocks */ 982 /* the number of blocks need to allocate for [d,t]indirect blocks */
975 indirect_blks = (chain + depth) - partial - 1; 983 indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +987,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
979 * direct blocks to allocate for this branch. 987 * direct blocks to allocate for this branch.
980 */ 988 */
981 count = ext4_blks_to_allocate(partial, indirect_blks, 989 count = ext4_blks_to_allocate(partial, indirect_blks,
982 maxblocks, blocks_to_boundary); 990 map->m_len, blocks_to_boundary);
983 /* 991 /*
984 * Block out ext4_truncate while we alter the tree 992 * Block out ext4_truncate while we alter the tree
985 */ 993 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 994 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
987 &count, goal, 995 &count, goal,
988 offsets + (partial - chain), partial); 996 offsets + (partial - chain), partial);
989 997
@@ -995,18 +1003,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
995 * may need to return -EAGAIN upwards in the worst case. --sct 1003 * may need to return -EAGAIN upwards in the worst case. --sct
996 */ 1004 */
997 if (!err) 1005 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 1006 err = ext4_splice_branch(handle, inode, map->m_lblk,
999 partial, indirect_blks, count); 1007 partial, indirect_blks, count);
1000 if (err) 1008 if (err)
1001 goto cleanup; 1009 goto cleanup;
1002 1010
1003 set_buffer_new(bh_result); 1011 map->m_flags |= EXT4_MAP_NEW;
1004 1012
1005 ext4_update_inode_fsync_trans(handle, inode, 1); 1013 ext4_update_inode_fsync_trans(handle, inode, 1);
1006got_it: 1014got_it:
1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1015 map->m_flags |= EXT4_MAP_MAPPED;
1016 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1017 map->m_len = count;
1008 if (count > blocks_to_boundary) 1018 if (count > blocks_to_boundary)
1009 set_buffer_boundary(bh_result); 1019 map->m_flags |= EXT4_MAP_BOUNDARY;
1010 err = count; 1020 err = count;
1011 /* Clean up and exit */ 1021 /* Clean up and exit */
1012 partial = chain + depth - 1; /* the whole chain */ 1022 partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1026,6 @@ cleanup:
1016 brelse(partial->bh); 1026 brelse(partial->bh);
1017 partial--; 1027 partial--;
1018 } 1028 }
1019 BUFFER_TRACE(bh_result, "returned");
1020out: 1029out:
1021 return err; 1030 return err;
1022} 1031}
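
This hunk finishes converting the indirect path from the (iblock, max_blocks, bh_result) triple to a single struct that carries the request in via m_lblk/m_len and the result back out in m_pblk, m_len and m_flags. A compilable sketch of that calling convention, with made-up MAP_* values standing in for the real EXT4_MAP_* flags:

    #include <stdint.h>
    #include <stdio.h>

    #define MAP_NEW      0x01 /* blocks were freshly allocated */
    #define MAP_MAPPED   0x02 /* m_pblk and m_len are valid */
    #define MAP_BOUNDARY 0x04 /* mapping ends at a metadata boundary */

    struct block_map {
            uint64_t m_lblk;  /* in:  first logical block */
            unsigned m_len;   /* in:  blocks wanted; out: blocks mapped */
            uint64_t m_pblk;  /* out: first physical block */
            unsigned m_flags; /* out: MAP_* result bits */
    };

    /* Toy lookup: pretend logical blocks sit at physical + 1000. */
    static int map_blocks(struct block_map *map)
    {
            map->m_pblk = map->m_lblk + 1000;
            map->m_flags = MAP_MAPPED;
            return (int)map->m_len; /* count of mapped blocks */
    }

    int main(void)
    {
            struct block_map map = { .m_lblk = 42, .m_len = 8 };
            int n = map_blocks(&map);

            if (n > 0 && (map.m_flags & MAP_MAPPED))
                    printf("mapped %d blocks at pblk %llu\n", n,
                           (unsigned long long)map.m_pblk);
            return 0;
    }

Keeping the return value for counts and errors while the struct carries the flag bits is what lets the got_it path above set EXT4_MAP_MAPPED, m_pblk and m_len in one place.
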
@@ -1061,7 +1070,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1061 */ 1070 */
1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1071static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1063{ 1072{
1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1073 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1065 return ext4_ext_calc_metadata_amount(inode, lblock); 1074 return ext4_ext_calc_metadata_amount(inode, lblock);
1066 1075
1067 return ext4_indirect_calc_metadata_amount(inode, lblock); 1076 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1085,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1076{ 1085{
1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1086 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1078 struct ext4_inode_info *ei = EXT4_I(inode); 1087 struct ext4_inode_info *ei = EXT4_I(inode);
1079 int mdb_free = 0, allocated_meta_blocks = 0;
1080 1088
1081 spin_lock(&ei->i_block_reservation_lock); 1089 spin_lock(&ei->i_block_reservation_lock);
1082 trace_ext4_da_update_reserve_space(inode, used); 1090 trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1099,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1091 1099
1092 /* Update per-inode reservations */ 1100 /* Update per-inode reservations */
1093 ei->i_reserved_data_blocks -= used; 1101 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1102 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1103 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1104 used + ei->i_allocated_meta_blocks);
1097 ei->i_allocated_meta_blocks = 0; 1105 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099 1106
1100 if (ei->i_reserved_data_blocks == 0) { 1107 if (ei->i_reserved_data_blocks == 0) {
1101 /* 1108 /*
@@ -1103,30 +1110,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1103 * only when we have written all of the delayed 1110 * only when we have written all of the delayed
1104 * allocation blocks. 1111 * allocation blocks.
1105 */ 1112 */
1106 mdb_free = ei->i_reserved_meta_blocks; 1113 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1114 ei->i_reserved_meta_blocks);
1107 ei->i_reserved_meta_blocks = 0; 1115 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0; 1116 ei->i_da_metadata_calc_len = 0;
1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1110 } 1117 }
1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1118 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1112 1119
1113 /* Update quota subsystem */ 1120 /* Update quota subsystem for data blocks */
1114 if (quota_claim) { 1121 if (quota_claim)
1115 dquot_claim_block(inode, used); 1122 dquot_claim_block(inode, used);
1116 if (mdb_free) 1123 else {
1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /* 1124 /*
1120 * We did fallocate with an offset that is already delayed 1125 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should 1126 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then 1127 * not re-claim the quota for fallocated blocks.
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */ 1128 */
1127 if (allocated_meta_blocks) 1129 dquot_release_reservation_block(inode, used);
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 } 1130 }
1131 1131
1132 /* 1132 /*
@@ -1139,20 +1139,24 @@ void ext4_da_update_reserve_space(struct inode *inode,
1139 ext4_discard_preallocations(inode); 1139 ext4_discard_preallocations(inode);
1140} 1140}
1141 1141
1142static int check_block_validity(struct inode *inode, const char *msg, 1142static int __check_block_validity(struct inode *inode, const char *func,
1143 sector_t logical, sector_t phys, int len) 1143 unsigned int line,
1144 struct ext4_map_blocks *map)
1144{ 1145{
1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1146 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1146 __ext4_error(inode->i_sb, msg, 1147 map->m_len)) {
1147 "inode #%lu logical block %llu mapped to %llu " 1148 ext4_error_inode(inode, func, line, map->m_pblk,
1148 "(size %d)", inode->i_ino, 1149 "lblock %lu mapped to illegal pblock "
1149 (unsigned long long) logical, 1150 "(length %d)", (unsigned long) map->m_lblk,
1150 (unsigned long long) phys, len); 1151 map->m_len);
1151 return -EIO; 1152 return -EIO;
1152 } 1153 }
1153 return 0; 1154 return 0;
1154} 1155}
1155 1156
1157#define check_block_validity(inode, map) \
1158 __check_block_validity((inode), __func__, __LINE__, (map))
1159
1156/* 1160/*
1157 * Return the number of contiguous dirty pages in a given inode 1161 * Return the number of contiguous dirty pages in a given inode
1158 * starting at page frame idx. 1162 * starting at page frame idx.
@@ -1212,15 +1216,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1212} 1216}
1213 1217
1214/* 1218/*
1215 * The ext4_get_blocks() function tries to look up the requested blocks, 1219 * The ext4_map_blocks() function tries to look up the requested blocks,
1216 * and returns if the blocks are already mapped. 1220 * and returns if the blocks are already mapped.
1217 * 1221 *
1218 * Otherwise it takes the write lock of the i_data_sem and allocates blocks 1222 * Otherwise it takes the write lock of the i_data_sem and allocates blocks
1219 * and stores the allocated blocks in the result buffer head and marks it 1223 * and stores the allocated blocks in the result buffer head and marks it
1220 * mapped. 1224 * mapped.
1221 * 1225 *
1222 * If file type is extents based, it will call ext4_ext_get_blocks(), 1226 * If file type is extents based, it will call ext4_ext_map_blocks(),
1223 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1227 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
1224 * based files 1228 * based files
1225 * 1229 *
1226 * On success, it returns the number of blocks being mapped or allocated. 1230 * On success, it returns the number of blocks being mapped or allocated.
@@ -1233,35 +1237,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1233 * 1237 *
1234 * It returns the error in case of allocation failure. 1238 * It returns the error in case of allocation failure.
1235 */ 1239 */
1236int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1240int ext4_map_blocks(handle_t *handle, struct inode *inode,
1237 unsigned int max_blocks, struct buffer_head *bh, 1241 struct ext4_map_blocks *map, int flags)
1238 int flags)
1239{ 1242{
1240 int retval; 1243 int retval;
1241 1244
1242 clear_buffer_mapped(bh); 1245 map->m_flags = 0;
1243 clear_buffer_unwritten(bh); 1246 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1244 1247 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1245 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1248 (unsigned long) map->m_lblk);
1246 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1247 (unsigned long)block);
1248 /* 1249 /*
1249 * Try to see if we can get the block without requesting a new 1250 * Try to see if we can get the block without requesting a new
1250 * file system block. 1251 * file system block.
1251 */ 1252 */
1252 down_read((&EXT4_I(inode)->i_data_sem)); 1253 down_read((&EXT4_I(inode)->i_data_sem));
1253 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1254 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1254 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1255 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1255 bh, 0);
1256 } else { 1256 } else {
1257 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1257 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1258 bh, 0);
1259 } 1258 }
1260 up_read((&EXT4_I(inode)->i_data_sem)); 1259 up_read((&EXT4_I(inode)->i_data_sem));
1261 1260
1262 if (retval > 0 && buffer_mapped(bh)) { 1261 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1263 int ret = check_block_validity(inode, "file system corruption", 1262 int ret = check_block_validity(inode, map);
1264 block, bh->b_blocknr, retval);
1265 if (ret != 0) 1263 if (ret != 0)
1266 return ret; 1264 return ret;
1267 } 1265 }
@@ -1277,7 +1275,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1277 * ext4_ext_get_block() returns, for create = 0, 1275 * ext4_ext_get_block() returns, for create = 0,
1278 * with buffer head unmapped. 1276 * with buffer head unmapped.
1279 */ 1277 */
1280 if (retval > 0 && buffer_mapped(bh)) 1278 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1281 return retval; 1279 return retval;
1282 1280
1283 /* 1281 /*
@@ -1290,7 +1288,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1290 * of BH_Unwritten and BH_Mapped flags being simultaneously 1288 * of BH_Unwritten and BH_Mapped flags being simultaneously
1291 * set on the buffer_head. 1289 * set on the buffer_head.
1292 */ 1290 */
1293 clear_buffer_unwritten(bh); 1291 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1294 1292
1295 /* 1293 /*
1296 * New blocks allocate and/or writing to uninitialized extent 1294 * New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1310,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1312 * We need to check for EXT4 here because migrate 1310 * We need to check for EXT4 here because migrate
1313 * could have changed the inode type in between 1311 * could have changed the inode type in between
1314 */ 1312 */
1315 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1313 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1316 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1314 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1317 bh, flags);
1318 } else { 1315 } else {
1319 retval = ext4_ind_get_blocks(handle, inode, block, 1316 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1320 max_blocks, bh, flags);
1321 1317
1322 if (retval > 0 && buffer_new(bh)) { 1318 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1323 /* 1319 /*
1324 * We allocated new blocks which will result in 1320 * We allocated new blocks which will result in
1325 * i_data's format changing. Force the migrate 1321 * i_data's format changing. Force the migrate
@@ -1342,10 +1338,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1338 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1343 1339
1344 up_write((&EXT4_I(inode)->i_data_sem)); 1340 up_write((&EXT4_I(inode)->i_data_sem));
1345 if (retval > 0 && buffer_mapped(bh)) { 1341 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1346 int ret = check_block_validity(inode, "file system " 1342 int ret = check_block_validity(inode, map);
1347 "corruption after allocation",
1348 block, bh->b_blocknr, retval);
1349 if (ret != 0) 1343 if (ret != 0)
1350 return ret; 1344 return ret;
1351 } 1345 }
@@ -1355,109 +1349,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1355/* Maximum number of blocks we map for direct IO at once. */ 1349/* Maximum number of blocks we map for direct IO at once. */
1356#define DIO_MAX_BLOCKS 4096 1350#define DIO_MAX_BLOCKS 4096
1357 1351
1358int ext4_get_block(struct inode *inode, sector_t iblock, 1352static int _ext4_get_block(struct inode *inode, sector_t iblock,
1359 struct buffer_head *bh_result, int create) 1353 struct buffer_head *bh, int flags)
1360{ 1354{
1361 handle_t *handle = ext4_journal_current_handle(); 1355 handle_t *handle = ext4_journal_current_handle();
1356 struct ext4_map_blocks map;
1362 int ret = 0, started = 0; 1357 int ret = 0, started = 0;
1363 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1364 int dio_credits; 1358 int dio_credits;
1365 1359
1366 if (create && !handle) { 1360 map.m_lblk = iblock;
1361 map.m_len = bh->b_size >> inode->i_blkbits;
1362
1363 if (flags && !handle) {
1367 /* Direct IO write... */ 1364 /* Direct IO write... */
1368 if (max_blocks > DIO_MAX_BLOCKS) 1365 if (map.m_len > DIO_MAX_BLOCKS)
1369 max_blocks = DIO_MAX_BLOCKS; 1366 map.m_len = DIO_MAX_BLOCKS;
1370 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1367 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1371 handle = ext4_journal_start(inode, dio_credits); 1368 handle = ext4_journal_start(inode, dio_credits);
1372 if (IS_ERR(handle)) { 1369 if (IS_ERR(handle)) {
1373 ret = PTR_ERR(handle); 1370 ret = PTR_ERR(handle);
1374 goto out; 1371 return ret;
1375 } 1372 }
1376 started = 1; 1373 started = 1;
1377 } 1374 }
1378 1375
1379 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1376 ret = ext4_map_blocks(handle, inode, &map, flags);
1380 create ? EXT4_GET_BLOCKS_CREATE : 0);
1381 if (ret > 0) { 1377 if (ret > 0) {
1382 bh_result->b_size = (ret << inode->i_blkbits); 1378 map_bh(bh, inode->i_sb, map.m_pblk);
1379 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1380 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1383 ret = 0; 1381 ret = 0;
1384 } 1382 }
1385 if (started) 1383 if (started)
1386 ext4_journal_stop(handle); 1384 ext4_journal_stop(handle);
1387out:
1388 return ret; 1385 return ret;
1389} 1386}
1390 1387
1388int ext4_get_block(struct inode *inode, sector_t iblock,
1389 struct buffer_head *bh, int create)
1390{
1391 return _ext4_get_block(inode, iblock, bh,
1392 create ? EXT4_GET_BLOCKS_CREATE : 0);
1393}
1394
1391/* 1395/*
1392 * `handle' can be NULL if create is zero 1396 * `handle' can be NULL if create is zero
1393 */ 1397 */
1394struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1398struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1395 ext4_lblk_t block, int create, int *errp) 1399 ext4_lblk_t block, int create, int *errp)
1396{ 1400{
1397 struct buffer_head dummy; 1401 struct ext4_map_blocks map;
1402 struct buffer_head *bh;
1398 int fatal = 0, err; 1403 int fatal = 0, err;
1399 int flags = 0;
1400 1404
1401 J_ASSERT(handle != NULL || create == 0); 1405 J_ASSERT(handle != NULL || create == 0);
1402 1406
1403 dummy.b_state = 0; 1407 map.m_lblk = block;
1404 dummy.b_blocknr = -1000; 1408 map.m_len = 1;
1405 buffer_trace_init(&dummy.b_history); 1409 err = ext4_map_blocks(handle, inode, &map,
1406 if (create) 1410 create ? EXT4_GET_BLOCKS_CREATE : 0);
1407 flags |= EXT4_GET_BLOCKS_CREATE; 1411
1408 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1412 if (err < 0)
1409 /* 1413 *errp = err;
1410 * ext4_get_blocks() returns number of blocks mapped. 0 in 1414 if (err <= 0)
1411 * case of a HOLE. 1415 return NULL;
1412 */ 1416 *errp = 0;
1413 if (err > 0) { 1417
1414 if (err > 1) 1418 bh = sb_getblk(inode->i_sb, map.m_pblk);
1415 WARN_ON(1); 1419 if (!bh) {
1416 err = 0; 1420 *errp = -EIO;
1421 return NULL;
1417 } 1422 }
1418 *errp = err; 1423 if (map.m_flags & EXT4_MAP_NEW) {
1419 if (!err && buffer_mapped(&dummy)) { 1424 J_ASSERT(create != 0);
1420 struct buffer_head *bh; 1425 J_ASSERT(handle != NULL);
1421 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1422 if (!bh) {
1423 *errp = -EIO;
1424 goto err;
1425 }
1426 if (buffer_new(&dummy)) {
1427 J_ASSERT(create != 0);
1428 J_ASSERT(handle != NULL);
1429 1426
1430 /* 1427 /*
1431 * Now that we do not always journal data, we should 1428 * Now that we do not always journal data, we should
1432 * keep in mind whether this should always journal the 1429 * keep in mind whether this should always journal the
1433 * new buffer as metadata. For now, regular file 1430 * new buffer as metadata. For now, regular file
1434 * writes use ext4_get_block instead, so it's not a 1431 * writes use ext4_get_block instead, so it's not a
1435 * problem. 1432 * problem.
1436 */ 1433 */
1437 lock_buffer(bh); 1434 lock_buffer(bh);
1438 BUFFER_TRACE(bh, "call get_create_access"); 1435 BUFFER_TRACE(bh, "call get_create_access");
1439 fatal = ext4_journal_get_create_access(handle, bh); 1436 fatal = ext4_journal_get_create_access(handle, bh);
1440 if (!fatal && !buffer_uptodate(bh)) { 1437 if (!fatal && !buffer_uptodate(bh)) {
1441 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1438 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1442 set_buffer_uptodate(bh); 1439 set_buffer_uptodate(bh);
1443 }
1444 unlock_buffer(bh);
1445 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1446 err = ext4_handle_dirty_metadata(handle, inode, bh);
1447 if (!fatal)
1448 fatal = err;
1449 } else {
1450 BUFFER_TRACE(bh, "not a new buffer");
1451 }
1452 if (fatal) {
1453 *errp = fatal;
1454 brelse(bh);
1455 bh = NULL;
1456 } 1440 }
1457 return bh; 1441 unlock_buffer(bh);
1442 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1443 err = ext4_handle_dirty_metadata(handle, inode, bh);
1444 if (!fatal)
1445 fatal = err;
1446 } else {
1447 BUFFER_TRACE(bh, "not a new buffer");
1458 } 1448 }
1459err: 1449 if (fatal) {
1460 return NULL; 1450 *errp = fatal;
1451 brelse(bh);
1452 bh = NULL;
1453 }
1454 return bh;
1461} 1455}
1462 1456
1463struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1457struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
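
_ext4_get_block() now translates the map result back into buffer-head terms; the line bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags overwrites only the mapper-owned bits and leaves the rest of b_state alone. The bit-merge idiom in isolation, with illustrative masks rather than the kernel's actual BH_* layout:

    #include <stdio.h>

    #define MAP_FLAGS  0x0fu /* low bits belong to the mapper */
    #define STATE_LOCK 0x10u /* unrelated bit that must survive */

    int main(void)
    {
            unsigned state = STATE_LOCK | 0x3u; /* lock + stale map bits */
            unsigned flags = 0x5u;              /* fresh mapper result */

            /* Clear only the mapper-owned bits, then OR in the new ones. */
            state = (state & ~MAP_FLAGS) | flags;

            printf("state = %#x\n", state); /* 0x15: lock bit survived */
            return 0;
    }
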
@@ -1538,9 +1532,25 @@ static int walk_page_buffers(handle_t *handle,
1538static int do_journal_get_write_access(handle_t *handle, 1532static int do_journal_get_write_access(handle_t *handle,
1539 struct buffer_head *bh) 1533 struct buffer_head *bh)
1540{ 1534{
1535 int dirty = buffer_dirty(bh);
1536 int ret;
1537
1541 if (!buffer_mapped(bh) || buffer_freed(bh)) 1538 if (!buffer_mapped(bh) || buffer_freed(bh))
1542 return 0; 1539 return 0;
1543 return ext4_journal_get_write_access(handle, bh); 1540 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer.
1547 */
1548 if (dirty)
1549 clear_buffer_dirty(bh);
1550 ret = ext4_journal_get_write_access(handle, bh);
1551 if (!ret && dirty)
1552 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1553 return ret;
1544} 1554}
1545 1555
1546/* 1556/*
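
do_journal_get_write_access() now snapshots the dirty bit, clears it before jbd2 takes write access (which would otherwise complain, as the new comment explains), and re-dirties the buffer through the journal once access is granted. A schematic restatement with stubbed helpers; the stubs are assumptions for illustration, not jbd2 calls:

    #include <stdio.h>

    struct buf { int dirty; };

    static int journal_get_write_access(struct buf *b)
    {
            return b->dirty ? -1 : 0; /* jbd2 would warn on a dirty buffer */
    }

    static int journal_dirty_metadata(struct buf *b)
    {
            b->dirty = 1; /* redirtied under journal control */
            return 0;
    }

    static int get_write_access(struct buf *b)
    {
            int was_dirty = b->dirty;
            int ret;

            if (was_dirty)
                    b->dirty = 0;  /* pacify the journal's check */
            ret = journal_get_write_access(b);
            if (!ret && was_dirty) /* hand the dirty state back */
                    ret = journal_dirty_metadata(b);
            return ret;
    }

    int main(void)
    {
            struct buf b = { .dirty = 1 };
            printf("ret=%d dirty=%d\n", get_write_access(&b), b.dirty);
            return 0;
    }
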
@@ -1597,11 +1607,9 @@ retry:
1597 *pagep = page; 1607 *pagep = page;
1598 1608
1599 if (ext4_should_dioread_nolock(inode)) 1609 if (ext4_should_dioread_nolock(inode))
1600 ret = block_write_begin(file, mapping, pos, len, flags, pagep, 1610 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1601 fsdata, ext4_get_block_write);
1602 else 1611 else
1603 ret = block_write_begin(file, mapping, pos, len, flags, pagep, 1612 ret = __block_write_begin(page, pos, len, ext4_get_block);
1604 fsdata, ext4_get_block);
1605 1613
1606 if (!ret && ext4_should_journal_data(inode)) { 1614 if (!ret && ext4_should_journal_data(inode)) {
1607 ret = walk_page_buffers(handle, page_buffers(page), 1615 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1612,7 +1620,7 @@ retry:
1612 unlock_page(page); 1620 unlock_page(page);
1613 page_cache_release(page); 1621 page_cache_release(page);
1614 /* 1622 /*
1615 * block_write_begin may have instantiated a few blocks 1623 * __block_write_begin may have instantiated a few blocks
1616 * outside i_size. Trim these off again. Don't need 1624 * outside i_size. Trim these off again. Don't need
1617 * i_size_read because we hold i_mutex. 1625 * i_size_read because we hold i_mutex.
1618 * 1626 *
@@ -1860,7 +1868,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1860 int retries = 0; 1868 int retries = 0;
1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1869 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1862 struct ext4_inode_info *ei = EXT4_I(inode); 1870 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved; 1871 unsigned long md_needed;
1864 int ret; 1872 int ret;
1865 1873
1866 /* 1874 /*
@@ -1870,22 +1878,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1870 */ 1878 */
1871repeat: 1879repeat:
1872 spin_lock(&ei->i_block_reservation_lock); 1880 spin_lock(&ei->i_block_reservation_lock);
1873 md_reserved = ei->i_reserved_meta_blocks;
1874 md_needed = ext4_calc_metadata_amount(inode, lblock); 1881 md_needed = ext4_calc_metadata_amount(inode, lblock);
1875 trace_ext4_da_reserve_space(inode, md_needed); 1882 trace_ext4_da_reserve_space(inode, md_needed);
1876 spin_unlock(&ei->i_block_reservation_lock); 1883 spin_unlock(&ei->i_block_reservation_lock);
1877 1884
1878 /* 1885 /*
1879 * Make quota reservation here to prevent quota overflow 1886 * We will charge metadata quota at writeout time; this saves
1880 * later. Real quota accounting is done at pages writeout 1887 * us from metadata over-estimation, though we may go over by
1881 * time. 1888 * a small amount in the end. Here we just reserve for data.
1882 */ 1889 */
1883 ret = dquot_reserve_block(inode, md_needed + 1); 1890 ret = dquot_reserve_block(inode, 1);
1884 if (ret) 1891 if (ret)
1885 return ret; 1892 return ret;
1886 1893 /*
1894 * We do still charge estimated metadata to the sb though;
1895 * we cannot afford to run out of free blocks.
1896 */
1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1897 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1888 dquot_release_reservation_block(inode, md_needed + 1); 1898 dquot_release_reservation_block(inode, 1);
1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1899 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1890 yield(); 1900 yield();
1891 goto repeat; 1901 goto repeat;
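
The reworked reservation charges quota for the single data block only, while the metadata estimate is still claimed against the filesystem's free-block count; when that claim fails, the quota reservation is rolled back before retrying. A stub sketch of that ordering, assuming invented dquot_reserve()/claim_free() helpers in place of the real quota and superblock calls:

    #include <stdio.h>

    static long free_blocks = 2;  /* toy filesystem state */
    static long quota_left = 10;

    static int dquot_reserve(int n)
    {
            if (quota_left < n)
                    return -1;
            quota_left -= n;
            return 0;
    }

    static void dquot_release(int n) { quota_left += n; }

    static int claim_free(int n)
    {
            if (free_blocks < n)
                    return -1;
            free_blocks -= n;
            return 0;
    }

    /* One data block in quota; data plus metadata against free space. */
    static int reserve_space(int md_needed)
    {
            if (dquot_reserve(1))
                    return -1;
            if (claim_free(md_needed + 1)) {
                    dquot_release(1); /* undo quota before failing */
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            printf("first: %d\n", reserve_space(1));  /* succeeds */
            printf("second: %d, quota=%ld\n",
                   reserve_space(1), quota_left);     /* fails, quota intact */
            return 0;
    }
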
@@ -1910,6 +1920,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1910 1920
1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1921 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1912 1922
1923 trace_ext4_da_release_space(inode, to_free);
1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1924 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1914 /* 1925 /*
1915 * if there aren't enough reserved blocks, then the 1926 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1943,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1932 * only when we have written all of the delayed 1943 * only when we have written all of the delayed
1933 * allocation blocks. 1944 * allocation blocks.
1934 */ 1945 */
1935 to_free += ei->i_reserved_meta_blocks; 1946 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1947 ei->i_reserved_meta_blocks);
1936 ei->i_reserved_meta_blocks = 0; 1948 ei->i_reserved_meta_blocks = 0;
1937 ei->i_da_metadata_calc_len = 0; 1949 ei->i_da_metadata_calc_len = 0;
1938 } 1950 }
1939 1951
1940 /* update fs dirty blocks counter */ 1952 /* update fs dirty data blocks counter */
1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1953 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1942 1954
1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1955 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2054,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2042/* 2054/*
2043 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2044 * 2056 *
2045 * @mpd->inode - inode to walk through
2046 * @exbh->b_blocknr - first block on a disk
2047 * @exbh->b_size - amount of space in bytes
2048 * @logical - first logical block to start assignment with
2049 *
2050 the function goes through all passed space and puts actual disk 2057 the function goes through all passed space and puts actual disk
2051 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2052 */ 2059 */
2053static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2054 struct buffer_head *exbh) 2061 struct ext4_map_blocks *map)
2055{ 2062{
2056 struct inode *inode = mpd->inode; 2063 struct inode *inode = mpd->inode;
2057 struct address_space *mapping = inode->i_mapping; 2064 struct address_space *mapping = inode->i_mapping;
2058 int blocks = exbh->b_size >> inode->i_blkbits; 2065 int blocks = map->m_len;
2059 sector_t pblock = exbh->b_blocknr, cur_logical; 2066 sector_t pblock = map->m_pblk, cur_logical;
2060 struct buffer_head *head, *bh; 2067 struct buffer_head *head, *bh;
2061 pgoff_t index, end; 2068 pgoff_t index, end;
2062 struct pagevec pvec; 2069 struct pagevec pvec;
2063 int nr_pages, i; 2070 int nr_pages, i;
2064 2071
2065 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2068 2075
2069 pagevec_init(&pvec, 0); 2076 pagevec_init(&pvec, 0);
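
The index arithmetic above converts logical block numbers into page-cache indices by shifting with (PAGE_CACHE_SHIFT - i_blkbits). A worked example with assumed values, 4 KiB pages and 1 KiB blocks:

    #include <stdio.h>

    int main(void)
    {
            unsigned page_shift = 12, blkbits = 10; /* 4K pages, 1K blocks */
            unsigned long lblk = 42, blocks = 8;

            unsigned long index = lblk >> (page_shift - blkbits);
            unsigned long end = (lblk + blocks - 1) >> (page_shift - blkbits);
            unsigned long first = index << (page_shift - blkbits);

            /* blocks 42..49 span pages 10..12; page 10 starts at block 40 */
            printf("pages %lu..%lu, page %lu starts at block %lu\n",
                   index, end, index, first);
            return 0;
    }
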
@@ -2090,17 +2097,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2090 2097
2091 /* skip blocks out of the range */ 2098 /* skip blocks out of the range */
2092 do { 2099 do {
2093 if (cur_logical >= logical) 2100 if (cur_logical >= map->m_lblk)
2094 break; 2101 break;
2095 cur_logical++; 2102 cur_logical++;
2096 } while ((bh = bh->b_this_page) != head); 2103 } while ((bh = bh->b_this_page) != head);
2097 2104
2098 do { 2105 do {
2099 if (cur_logical >= logical + blocks) 2106 if (cur_logical >= map->m_lblk + blocks)
2100 break; 2107 break;
2101 2108
2102 if (buffer_delay(bh) || 2109 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2103 buffer_unwritten(bh)) {
2104 2110
2105 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2106 2112
@@ -2119,7 +2125,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2119 } else if (buffer_mapped(bh)) 2125 } else if (buffer_mapped(bh))
2120 BUG_ON(bh->b_blocknr != pblock); 2126 BUG_ON(bh->b_blocknr != pblock);
2121 2127
2122 if (buffer_uninit(exbh)) 2128 if (map->m_flags & EXT4_MAP_UNINIT)
2123 set_buffer_uninit(bh); 2129 set_buffer_uninit(bh);
2124 cur_logical++; 2130 cur_logical++;
2125 pblock++; 2131 pblock++;
@@ -2130,21 +2136,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2130} 2136}
2131 2137
2132 2138
2133/*
2134 * __unmap_underlying_blocks - just a helper function to unmap
2135 * set of blocks described by @bh
2136 */
2137static inline void __unmap_underlying_blocks(struct inode *inode,
2138 struct buffer_head *bh)
2139{
2140 struct block_device *bdev = inode->i_sb->s_bdev;
2141 int blocks, i;
2142
2143 blocks = bh->b_size >> inode->i_blkbits;
2144 for (i = 0; i < blocks; i++)
2145 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2146}
2147
2148static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2149 sector_t logical, long blk_cnt) 2140 sector_t logical, long blk_cnt)
2150{ 2141{
@@ -2206,7 +2197,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2206static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2197static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2207{ 2198{
2208 int err, blks, get_blocks_flags; 2199 int err, blks, get_blocks_flags;
2209 struct buffer_head new; 2200 struct ext4_map_blocks map;
2210 sector_t next = mpd->b_blocknr; 2201 sector_t next = mpd->b_blocknr;
2211 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2212 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2230,7 +2221,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2230 BUG_ON(!handle); 2221 BUG_ON(!handle);
2231 2222
2232 /* 2223 /*
2233 * Call ext4_get_blocks() to allocate any delayed allocation 2224 * Call ext4_map_blocks() to allocate any delayed allocation
2234 * blocks, or to convert an uninitialized extent to be 2225 * blocks, or to convert an uninitialized extent to be
2235 * initialized (in the case where we have written into 2226 * initialized (in the case where we have written into
2236 * one or more preallocated blocks). 2227 * one or more preallocated blocks).
@@ -2239,7 +2230,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2239 * indicate that we are on the delayed allocation path. This 2230 * indicate that we are on the delayed allocation path. This
2240 * affects functions in many different parts of the allocation 2231 * affects functions in many different parts of the allocation
2241 * call path. This flag exists primarily because we don't 2232 * call path. This flag exists primarily because we don't
2242 * want to change *many* call functions, so ext4_get_blocks() 2233 * want to change *many* call functions, so ext4_map_blocks()
2243 * will set the magic i_delalloc_reserved_flag once the 2234 * will set the magic i_delalloc_reserved_flag once the
2244 * inode's allocation semaphore is taken. 2235 * inode's allocation semaphore is taken.
2245 * 2236 *
@@ -2247,16 +2238,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2247 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2238 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2248 * variables are updated after the blocks have been allocated. 2239 * variables are updated after the blocks have been allocated.
2249 */ 2240 */
2250 new.b_state = 0; 2241 map.m_lblk = next;
2242 map.m_len = max_blocks;
2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2243 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2252 if (ext4_should_dioread_nolock(mpd->inode)) 2244 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2245 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2254 if (mpd->b_state & (1 << BH_Delay)) 2246 if (mpd->b_state & (1 << BH_Delay))
2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2247 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256 2248
2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2249 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2258 &new, get_blocks_flags);
2259 if (blks < 0) { 2250 if (blks < 0) {
2251 struct super_block *sb = mpd->inode->i_sb;
2252
2260 err = blks; 2253 err = blks;
2261 /* 2254 /*
2262 * If get block returns with error we simply 2255 * If get block returns with error we simply
@@ -2267,7 +2260,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2267 return 0; 2260 return 0;
2268 2261
2269 if (err == -ENOSPC && 2262 if (err == -ENOSPC &&
2270 ext4_count_free_blocks(mpd->inode->i_sb)) { 2263 ext4_count_free_blocks(sb)) {
2271 mpd->retval = err; 2264 mpd->retval = err;
2272 return 0; 2265 return 0;
2273 } 2266 }
@@ -2279,16 +2272,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2279 * writepage and writepages will again try to write 2272 * writepage and writepages will again try to write
2280 * the same. 2273 * the same.
2281 */ 2274 */
2282 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2275 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2283 "delayed block allocation failed for inode %lu at " 2276 ext4_msg(sb, KERN_CRIT,
2284 "logical offset %llu with max blocks %zd with " 2277 "delayed block allocation failed for inode %lu "
2285 "error %d\n", mpd->inode->i_ino, 2278 "at logical offset %llu with max blocks %zd "
2286 (unsigned long long) next, 2279 "with error %d", mpd->inode->i_ino,
2287 mpd->b_size >> mpd->inode->i_blkbits, err); 2280 (unsigned long long) next,
2288 printk(KERN_CRIT "This should not happen!! " 2281 mpd->b_size >> mpd->inode->i_blkbits, err);
2289 "Data will be lost\n"); 2282 ext4_msg(sb, KERN_CRIT,
2290 if (err == -ENOSPC) { 2283 "This should not happen!! Data will be lost\n");
2291 ext4_print_free_blocks(mpd->inode); 2284 if (err == -ENOSPC)
2285 ext4_print_free_blocks(mpd->inode);
2292 } 2286 }
2293 /* invalidate all the pages */ 2287 /* invalidate all the pages */
2294 ext4_da_block_invalidatepages(mpd, next, 2288 ext4_da_block_invalidatepages(mpd, next,
@@ -2297,10 +2291,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2297 } 2291 }
2298 BUG_ON(blks == 0); 2292 BUG_ON(blks == 0);
2299 2293
2300 new.b_size = (blks << mpd->inode->i_blkbits); 2294 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i;
2301 2297
2302 if (buffer_new(&new)) 2298 for (i = 0; i < map.m_len; i++)
2303 __unmap_underlying_blocks(mpd->inode, &new); 2299 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 }
2304 2301
2305 /* 2302 /*
2306 * If blocks are delayed marked, we need to 2303 * If blocks are delayed marked, we need to
@@ -2308,7 +2305,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2308 */ 2305 */
2309 if ((mpd->b_state & (1 << BH_Delay)) || 2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2310 (mpd->b_state & (1 << BH_Unwritten))) 2307 (mpd->b_state & (1 << BH_Unwritten)))
2311 mpage_put_bnr_to_bhs(mpd, next, &new); 2308 mpage_put_bnr_to_bhs(mpd, &map);
2312 2309
2313 if (ext4_should_order_data(mpd->inode)) { 2310 if (ext4_should_order_data(mpd->inode)) {
2314 err = ext4_jbd2_file_inode(handle, mpd->inode); 2311 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2346,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2349 sector_t next; 2346 sector_t next;
2350 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2347 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2351 2348
2349 /*
2350 * XXX Don't go larger than mballoc is willing to allocate
2351 * This is a stopgap solution. We eventually need to fold
2352 * mpage_da_submit_io() into this function and then call
2353 * ext4_map_blocks() multiple times in a loop
2354 */
2355 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2356 goto flush_it;
2357
2352 /* check if the reserved journal credits might overflow */ 2358 /* check if the reserved journal credits might overflow */
2353 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2359 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2354 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2360 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2355 /* 2361 /*
2356 * With non-extent format we are limited by the journal 2362 * With non-extent format we are limited by the journal
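
The new early flush caps an extent at 8 MiB worth of blocks, i.e. 8*1024*1024 divided by the block size. What the cap works out to for common block sizes, as a quick standalone check:

    #include <stdio.h>

    int main(void)
    {
            unsigned sizes[] = { 1024, 2048, 4096 };

            for (int i = 0; i < 3; i++)
                    printf("blocksize %u -> cap %u blocks\n",
                           sizes[i], 8 * 1024 * 1024 / sizes[i]);
            /* 1024 -> 8192, 2048 -> 4096, 4096 -> 2048 */
            return 0;
    }
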
@@ -2423,17 +2429,6 @@ static int __mpage_da_writepage(struct page *page,
2423 struct buffer_head *bh, *head; 2429 struct buffer_head *bh, *head;
2424 sector_t logical; 2430 sector_t logical;
2425 2431
2426 if (mpd->io_done) {
2427 /*
2428 * Rest of the pages in the page_vec
2429 * redirty them and skip them. We will
2430 * try to write them again after
2431 * starting a new transaction
2432 */
2433 redirty_page_for_writepage(wbc, page);
2434 unlock_page(page);
2435 return MPAGE_DA_EXTENT_TAIL;
2436 }
2437 /* 2432 /*
2438 * Can we merge this page to current extent? 2433 * Can we merge this page to current extent?
2439 */ 2434 */
@@ -2528,8 +2523,9 @@ static int __mpage_da_writepage(struct page *page,
2528 * initialized properly. 2523 * initialized properly.
2529 */ 2524 */
2530static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2525static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2531 struct buffer_head *bh_result, int create) 2526 struct buffer_head *bh, int create)
2532{ 2527{
2528 struct ext4_map_blocks map;
2533 int ret = 0; 2529 int ret = 0;
2534 sector_t invalid_block = ~((sector_t) 0xffff); 2530 sector_t invalid_block = ~((sector_t) 0xffff);
2535 2531
@@ -2537,16 +2533,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2537 invalid_block = ~0; 2533 invalid_block = ~0;
2538 2534
2539 BUG_ON(create == 0); 2535 BUG_ON(create == 0);
2540 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2536 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2537
2538 map.m_lblk = iblock;
2539 map.m_len = 1;
2541 2540
2542 /* 2541 /*
2543 * first, we need to know whether the block is allocated already 2542 * first, we need to know whether the block is allocated already
2544 * preallocated blocks are unmapped but should be treated 2543 * preallocated blocks are unmapped but should be treated
2545 * the same as allocated blocks. 2544 * the same as allocated blocks.
2546 */ 2545 */
2547 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2546 ret = ext4_map_blocks(NULL, inode, &map, 0);
2548 if ((ret == 0) && !buffer_delay(bh_result)) { 2547 if (ret < 0)
2549 /* the block isn't (pre)allocated yet, let's reserve space */ 2548 return ret;
2549 if (ret == 0) {
2550 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */
2550 /* 2552 /*
2551 * XXX: __block_prepare_write() unmaps passed block, 2553 * XXX: __block_prepare_write() unmaps passed block,
2552 * is it OK? 2554 * is it OK?
@@ -2556,62 +2558,47 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2556 /* not enough space to reserve */ 2558 /* not enough space to reserve */
2557 return ret; 2559 return ret;
2558 2560
2559 map_bh(bh_result, inode->i_sb, invalid_block); 2561 map_bh(bh, inode->i_sb, invalid_block);
2560 set_buffer_new(bh_result); 2562 set_buffer_new(bh);
2561 set_buffer_delay(bh_result); 2563 set_buffer_delay(bh);
2562 } else if (ret > 0) { 2564 return 0;
2563 bh_result->b_size = (ret << inode->i_blkbits);
2564 if (buffer_unwritten(bh_result)) {
2565 /* A delayed write to unwritten bh should
2566 * be marked new and mapped. Mapped ensures
2567 * that we don't do get_block multiple times
2568 * when we write to the same offset and new
2569 * ensures that we do proper zero out for
2570 * partial write.
2571 */
2572 set_buffer_new(bh_result);
2573 set_buffer_mapped(bh_result);
2574 }
2575 ret = 0;
2576 } 2565 }
2577 2566
2578 return ret; 2567 map_bh(bh, inode->i_sb, map.m_pblk);
2568 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2569
2570 if (buffer_unwritten(bh)) {
2571 /* A delayed write to unwritten bh should be marked
2572 * new and mapped. Mapped ensures that we don't do
2573 * get_block multiple times when we write to the same
2574 * offset and new ensures that we do proper zero out
2575 * for partial write.
2576 */
2577 set_buffer_new(bh);
2578 set_buffer_mapped(bh);
2579 }
2580 return 0;
2579} 2581}
2580 2582
2581/* 2583/*
2582 * This function is used as a standard get_block_t callback function 2584 * This function is used as a standard get_block_t callback function
2583 * when there is no desire to allocate any blocks. It is used as a 2585 * when there is no desire to allocate any blocks. It is used as a
2584 * callback function for block_prepare_write(), nobh_writepage(), and 2586 * callback function for block_prepare_write() and block_write_full_page().
2585 * block_write_full_page(). These functions should only try to map a 2587 * These functions should only try to map a single block at a time.
2586 * single block at a time.
2587 * 2588 *
2588 * Since this function doesn't do block allocations even if the caller 2589 * Since this function doesn't do block allocations even if the caller
2589 * requests it by passing in create=1, it is critically important that 2590 * requests it by passing in create=1, it is critically important that
2590 * any caller checks to make sure that any buffer heads returned 2591 * any caller checks to make sure that any buffer heads returned
2591 * by this function are either all already mapped or marked for 2592 * by this function are either all already mapped or marked for
2592 * delayed allocation before calling nobh_writepage() or 2593 * delayed allocation before calling block_write_full_page(). Otherwise,
2593 * block_write_full_page(). Otherwise, b_blocknr could be left 2594 * b_blocknr could be left uninitialized, and the page write functions will
2594 * uninitialized, and the page write functions will be taken by 2595 * be taken by surprise.
2595 * surprise.
2596 */ 2596 */
2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2598 struct buffer_head *bh_result, int create) 2598 struct buffer_head *bh_result, int create)
2599{ 2599{
2600 int ret = 0;
2601 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2602
2603 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2600 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2604 2601 return _ext4_get_block(inode, iblock, bh_result, 0);
2605 /*
2606 * we don't want to do block allocation in writepage
2607 * so call get_block_wrap with create = 0
2608 */
2609 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2610 if (ret > 0) {
2611 bh_result->b_size = (ret << inode->i_blkbits);
2612 ret = 0;
2613 }
2614 return ret;
2615} 2602}
2616 2603
2617static int bget_one(handle_t *handle, struct buffer_head *bh) 2604static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2790,9 +2777,7 @@ static int ext4_writepage(struct page *page,
2790 return __ext4_journalled_writepage(page, len); 2777 return __ext4_journalled_writepage(page, len);
2791 } 2778 }
2792 2779
2793 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2780 if (page_bufs && buffer_uninit(page_bufs)) {
2794 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2795 else if (page_bufs && buffer_uninit(page_bufs)) {
2796 ext4_set_bh_endio(page_bufs, inode); 2781 ext4_set_bh_endio(page_bufs, inode);
2797 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2782 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2798 wbc, ext4_end_io_buffer_write); 2783 wbc, ext4_end_io_buffer_write);
@@ -2821,13 +2806,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2821 * number of contiguous block. So we will limit 2806 * number of contiguous block. So we will limit
2822 * number of contiguous block to a sane value 2807 * number of contiguous block to a sane value
2823 */ 2808 */
2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2809 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2825 (max_blocks > EXT4_MAX_TRANS_DATA)) 2810 (max_blocks > EXT4_MAX_TRANS_DATA))
2826 max_blocks = EXT4_MAX_TRANS_DATA; 2811 max_blocks = EXT4_MAX_TRANS_DATA;
2827 2812
2828 return ext4_chunk_trans_blocks(inode, max_blocks); 2813 return ext4_chunk_trans_blocks(inode, max_blocks);
2829} 2814}
2830 2815
2816/*
2817 * write_cache_pages_da - walk the list of dirty pages of the given
2818 * address space and call the callback function (which usually writes
2819 * the pages).
2820 *
2821 * This is a forked version of write_cache_pages(). Differences:
2822 * Range cyclic is ignored.
2823 * no_nrwrite_index_update is always presumed true
2824 */
2825static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd)
2828{
2829 int ret = 0;
2830 int done = 0;
2831 struct pagevec pvec;
2832 int nr_pages;
2833 pgoff_t index;
2834 pgoff_t end; /* Inclusive */
2835 long nr_to_write = wbc->nr_to_write;
2836
2837 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840
2841 while (!done && (index <= end)) {
2842 int i;
2843
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2845 PAGECACHE_TAG_DIRTY,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0)
2848 break;
2849
2850 for (i = 0; i < nr_pages; i++) {
2851 struct page *page = pvec.pages[i];
2852
2853 /*
2854 * At this point, the page may be truncated or
2855 * invalidated (changing page->mapping to NULL), or
2856 * even swizzled back from swapper_space to tmpfs file
2857 * mapping. However, page->index will not change
2858 * because we have a reference on the page.
2859 */
2860 if (page->index > end) {
2861 done = 1;
2862 break;
2863 }
2864
2865 lock_page(page);
2866
2867 /*
2868 * Page truncated or invalidated. We can freely skip it
2869 * then, even for data integrity operations: the page
2870 * has disappeared concurrently, so there could be no
2871 * real expectation of this data integrity operation
2872 * even if there is now a new, dirty page at the same
2873 * pagecache address.
2874 */
2875 if (unlikely(page->mapping != mapping)) {
2876continue_unlock:
2877 unlock_page(page);
2878 continue;
2879 }
2880
2881 if (!PageDirty(page)) {
2882 /* someone wrote it for us */
2883 goto continue_unlock;
2884 }
2885
2886 if (PageWriteback(page)) {
2887 if (wbc->sync_mode != WB_SYNC_NONE)
2888 wait_on_page_writeback(page);
2889 else
2890 goto continue_unlock;
2891 }
2892
2893 BUG_ON(PageWriteback(page));
2894 if (!clear_page_dirty_for_io(page))
2895 goto continue_unlock;
2896
2897 ret = __mpage_da_writepage(page, wbc, mpd);
2898 if (unlikely(ret)) {
2899 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2900 unlock_page(page);
2901 ret = 0;
2902 } else {
2903 done = 1;
2904 break;
2905 }
2906 }
2907
2908 if (nr_to_write > 0) {
2909 nr_to_write--;
2910 if (nr_to_write == 0 &&
2911 wbc->sync_mode == WB_SYNC_NONE) {
2912 /*
2913 * We stop writing back only if we are
2914 * not doing integrity sync. In case of
2915 * integrity sync we have to keep going
2916 * because someone may be concurrently
2917 * dirtying pages, and we might have
2918 * synced a lot of newly appeared dirty
2919 * pages, but have not synced all of the
2920 * old dirty pages.
2921 */
2922 done = 1;
2923 break;
2924 }
2925 }
2926 }
2927 pagevec_release(&pvec);
2928 cond_resched();
2929 }
2930 return ret;
2931}
2932
2933
2831static int ext4_da_writepages(struct address_space *mapping, 2934static int ext4_da_writepages(struct address_space *mapping,
2832 struct writeback_control *wbc) 2935 struct writeback_control *wbc)
2833{ 2936{
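
write_cache_pages_da() keeps the shape of generic write_cache_pages(): grab a batch of dirty pages by tag, process each under its page lock, release the batch, reschedule, and repeat until the range or the nr_to_write budget is exhausted. That loop skeleton, reduced to plain C over an integer range; lookup_batch() is a made-up stand-in for pagevec_lookup_tag():

    #include <stdio.h>

    #define BATCH 14 /* PAGEVEC_SIZE in the kernel */

    /* Pretend every index in [*start, end] is a dirty page. */
    static int lookup_batch(unsigned long *start, unsigned long end,
                            unsigned long *out, int max)
    {
            int n = 0;

            while (n < max && *start <= end)
                    out[n++] = (*start)++;
            return n;
    }

    int main(void)
    {
            unsigned long index = 0, end = 40, pages[BATCH];
            long nr_to_write = 25;
            int done = 0;

            while (!done && index <= end) {
                    int n = lookup_batch(&index, end, pages, BATCH);

                    if (n == 0)
                            break;
                    for (int i = 0; i < n; i++) {
                            printf("write page %lu\n", pages[i]);
                            if (--nr_to_write == 0) { /* budget spent */
                                    done = 1;
                                    break;
                            }
                    }
                    /* pagevec_release() and cond_resched() go here */
            }
            return 0;
    }
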
@@ -2836,7 +2939,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2836 handle_t *handle = NULL; 2939 handle_t *handle = NULL;
2837 struct mpage_da_data mpd; 2940 struct mpage_da_data mpd;
2838 struct inode *inode = mapping->host; 2941 struct inode *inode = mapping->host;
2839 int no_nrwrite_index_update;
2840 int pages_written = 0; 2942 int pages_written = 0;
2841 long pages_skipped; 2943 long pages_skipped;
2842 unsigned int max_pages; 2944 unsigned int max_pages;
@@ -2916,12 +3018,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2916 mpd.wbc = wbc; 3018 mpd.wbc = wbc;
2917 mpd.inode = mapping->host; 3019 mpd.inode = mapping->host;
2918 3020
2919 /*
2920 * we don't want write_cache_pages to update
2921 * nr_to_write and writeback_index
2922 */
2923 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2924 wbc->no_nrwrite_index_update = 1;
2925 pages_skipped = wbc->pages_skipped; 3021 pages_skipped = wbc->pages_skipped;
2926 3022
2927retry: 3023retry:
@@ -2941,7 +3037,7 @@ retry:
2941 if (IS_ERR(handle)) { 3037 if (IS_ERR(handle)) {
2942 ret = PTR_ERR(handle); 3038 ret = PTR_ERR(handle);
2943 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3039 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2944 "%ld pages, ino %lu; err %d\n", __func__, 3040 "%ld pages, ino %lu; err %d", __func__,
2945 wbc->nr_to_write, inode->i_ino, ret); 3041 wbc->nr_to_write, inode->i_ino, ret);
2946 goto out_writepages; 3042 goto out_writepages;
2947 } 3043 }
@@ -2963,8 +3059,7 @@ retry:
2963 mpd.io_done = 0; 3059 mpd.io_done = 0;
2964 mpd.pages_written = 0; 3060 mpd.pages_written = 0;
2965 mpd.retval = 0; 3061 mpd.retval = 0;
2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3062 ret = write_cache_pages_da(mapping, wbc, &mpd);
2967 &mpd);
2968 /* 3063 /*
2969 * If we have a contiguous extent of pages and we 3064 * If we have a contiguous extent of pages and we
2970 * haven't done the I/O yet, map the blocks and submit 3065 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3111,7 @@ retry:
3016 if (pages_skipped != wbc->pages_skipped) 3111 if (pages_skipped != wbc->pages_skipped)
3017 ext4_msg(inode->i_sb, KERN_CRIT, 3112 ext4_msg(inode->i_sb, KERN_CRIT,
3018 "This should not happen leaving %s " 3113 "This should not happen leaving %s "
3019 "with nr_to_write = %ld ret = %d\n", 3114 "with nr_to_write = %ld ret = %d",
3020 __func__, wbc->nr_to_write, ret); 3115 __func__, wbc->nr_to_write, ret);
3021 3116
3022 /* Update index */ 3117 /* Update index */
@@ -3030,8 +3125,6 @@ retry:
3030 mapping->writeback_index = index; 3125 mapping->writeback_index = index;
3031 3126
3032out_writepages: 3127out_writepages:
3033 if (!no_nrwrite_index_update)
3034 wbc->no_nrwrite_index_update = 0;
3035 wbc->nr_to_write -= nr_to_writebump; 3128 wbc->nr_to_write -= nr_to_writebump;
3036 wbc->range_start = range_start; 3129 wbc->range_start = range_start;
3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3130 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,16 +3169,13 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3076 loff_t pos, unsigned len, unsigned flags, 3169 loff_t pos, unsigned len, unsigned flags,
3077 struct page **pagep, void **fsdata) 3170 struct page **pagep, void **fsdata)
3078{ 3171{
3079 int ret, retries = 0, quota_retries = 0; 3172 int ret, retries = 0;
3080 struct page *page; 3173 struct page *page;
3081 pgoff_t index; 3174 pgoff_t index;
3082 unsigned from, to;
3083 struct inode *inode = mapping->host; 3175 struct inode *inode = mapping->host;
3084 handle_t *handle; 3176 handle_t *handle;
3085 3177
3086 index = pos >> PAGE_CACHE_SHIFT; 3178 index = pos >> PAGE_CACHE_SHIFT;
3087 from = pos & (PAGE_CACHE_SIZE - 1);
3088 to = from + len;
3089 3179
3090 if (ext4_nonda_switch(inode->i_sb)) { 3180 if (ext4_nonda_switch(inode->i_sb)) {
3091 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 3181 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -3118,8 +3208,7 @@ retry:
3118 } 3208 }
3119 *pagep = page; 3209 *pagep = page;
3120 3210
3121 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 3211 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
3122 ext4_da_get_block_prep);
3123 if (ret < 0) { 3212 if (ret < 0) {
3124 unlock_page(page); 3213 unlock_page(page);
3125 ext4_journal_stop(handle); 3214 ext4_journal_stop(handle);
@@ -3135,22 +3224,6 @@ retry:
3135 3224
3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3225 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3137 goto retry; 3226 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurios out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3154out: 3227out:
3155 return ret; 3228 return ret;
3156} 3229}
@@ -3494,15 +3567,24 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3494 3567
3495retry: 3568retry:
3496 if (rw == READ && ext4_should_dioread_nolock(inode)) 3569 if (rw == READ && ext4_should_dioread_nolock(inode))
3497 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 3570 ret = __blockdev_direct_IO(rw, iocb, inode,
3498 inode->i_sb->s_bdev, iov, 3571 inode->i_sb->s_bdev, iov,
3499 offset, nr_segs, 3572 offset, nr_segs,
3500 ext4_get_block, NULL); 3573 ext4_get_block, NULL, NULL, 0);
3501 else 3574 else {
3502 ret = blockdev_direct_IO(rw, iocb, inode, 3575 ret = blockdev_direct_IO(rw, iocb, inode,
3503 inode->i_sb->s_bdev, iov, 3576 inode->i_sb->s_bdev, iov,
3504 offset, nr_segs, 3577 offset, nr_segs,
3505 ext4_get_block, NULL); 3578 ext4_get_block, NULL);
3579
3580 if (unlikely((rw & WRITE) && ret < 0)) {
3581 loff_t isize = i_size_read(inode);
3582 loff_t end = offset + iov_length(iov, nr_segs);
3583
3584 if (end > isize)
3585 vmtruncate(inode, isize);
3586 }
3587 }
3506 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3588 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3507 goto retry; 3589 goto retry;
3508 3590
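For context, the dioread_nolock branch above switches from the blockdev_direct_IO_no_locking() convenience wrapper to the generic __blockdev_direct_IO() entry point. Reading the two calls side by side (argument lists as in the hunk; the meaning of the trailing parameters is inferred from the new call and should be treated as an assumption):

        /* old: wrapper that implied no DIO locking */
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs, get_block, NULL);

        /* new: same effect via the generic helper; the trailing NULL, NULL, 0
         * are the end_io callback, the submit_io hook, and a flags word of 0
         * (i.e. no DIO_LOCKING). */
        ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
                                   nr_segs, get_block, NULL, NULL, 0);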
@@ -3546,46 +3628,18 @@ out:
3546 return ret; 3628 return ret;
3547} 3629}
3548 3630
3631/*
3632 * ext4_get_block used when preparing for a DIO write or buffer write.
3633 * We allocate an uninitialized extent if blocks haven't been allocated.
3634 * The extent will be converted to initialized after the IO is complete.
3635 */
3549static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3636static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3550 struct buffer_head *bh_result, int create) 3637 struct buffer_head *bh_result, int create)
3551{ 3638{
3552 handle_t *handle = ext4_journal_current_handle();
3553 int ret = 0;
3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3555 int dio_credits;
3556 int started = 0;
3557
3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3639 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3559 inode->i_ino, create); 3640 inode->i_ino, create);
3560 /* 3641 return _ext4_get_block(inode, iblock, bh_result,
3561 * ext4_get_block in prepare for a DIO write or buffer write. 3642 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3562 * We allocate an uinitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after IO complete.
3564 */
3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3566
3567 if (!handle) {
3568 if (max_blocks > DIO_MAX_BLOCKS)
3569 max_blocks = DIO_MAX_BLOCKS;
3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3571 handle = ext4_journal_start(inode, dio_credits);
3572 if (IS_ERR(handle)) {
3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3577 }
3578
3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3580 create);
3581 if (ret > 0) {
3582 bh_result->b_size = (ret << inode->i_blkbits);
3583 ret = 0;
3584 }
3585 if (started)
3586 ext4_journal_stop(handle);
3587out:
3588 return ret;
3589} 3643}
3590 3644
3591static void dump_completed_IO(struct inode * inode) 3645static void dump_completed_IO(struct inode * inode)
@@ -3645,6 +3699,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
3645 return ret; 3699 return ret;
3646 } 3700 }
3647 3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3648 /* clear the DIO AIO unwritten flag */ 3704 /* clear the DIO AIO unwritten flag */
3649 io->flag = 0; 3705 io->flag = 0;
3650 return ret; 3706 return ret;
@@ -3744,6 +3800,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3744 io->offset = 0; 3800 io->offset = 0;
3745 io->size = 0; 3801 io->size = 0;
3746 io->page = NULL; 3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3747 INIT_WORK(&io->work, ext4_end_io_work); 3805 INIT_WORK(&io->work, ext4_end_io_work);
3748 INIT_LIST_HEAD(&io->list); 3806 INIT_LIST_HEAD(&io->list);
3749 } 3807 }
@@ -3752,7 +3810,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3752} 3810}
3753 3811
3754static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3755 ssize_t size, void *private) 3813 ssize_t size, void *private, int ret,
3814 bool is_async)
3756{ 3815{
3757 ext4_io_end_t *io_end = iocb->private; 3816 ext4_io_end_t *io_end = iocb->private;
3758 struct workqueue_struct *wq; 3817 struct workqueue_struct *wq;
@@ -3761,7 +3820,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3761 3820
3762 /* if not async direct IO or a dio write of 0 bytes, just return */ 3821
3763 if (!io_end || !size) 3822 if (!io_end || !size)
3764 return; 3823 goto out;
3765 3824
3766 ext_debug("ext4_end_io_dio(): io_end 0x%p" 3825 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3767 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 3826 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
@@ -3772,12 +3831,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3772 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3831 if (io_end->flag != EXT4_IO_UNWRITTEN){
3773 ext4_free_io_end(io_end); 3832 ext4_free_io_end(io_end);
3774 iocb->private = NULL; 3833 iocb->private = NULL;
3834out:
3835 if (is_async)
3836 aio_complete(iocb, ret, 0);
3775 return; 3837 return;
3776 } 3838 }
3777 3839
3778 io_end->offset = offset; 3840 io_end->offset = offset;
3779 io_end->size = size; 3841 io_end->size = size;
3780 io_end->flag = EXT4_IO_UNWRITTEN; 3842 if (is_async) {
3843 io_end->iocb = iocb;
3844 io_end->result = ret;
3845 }
3781 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3782 3847
3783 /* queue the work to convert unwritten extents to written */ 3848 /* queue the work to convert unwritten extents to written */
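With the dio_iodone_t signature extended as above, an end_io handler now learns whether it was invoked from an AIO submission and, when it finishes the work itself, must complete the iocb. A minimal conforming callback might look like this (sketch; only the signature and the aio_complete() call are taken from this patch, the body is hypothetical):

        static void example_end_io(struct kiocb *iocb, loff_t offset,
                                   ssize_t size, void *private, int ret,
                                   bool is_async)
        {
                /* ...filesystem-specific completion work... */

                /* for async submissions the callback owns iocb completion */
                if (is_async)
                        aio_complete(iocb, ret, 0);
        }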
@@ -3914,7 +3979,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3914 return -ENOMEM; 3979 return -ENOMEM;
3915 /* 3980 /*
3916 * we save the io structure for current async 3981 * we save the io structure for current async
3917 * direct IO, so that later ext4_get_blocks() 3982 * direct IO, so that later ext4_map_blocks()
3918 * could flag the io structure to indicate whether 3983
3919 * there are unwritten extents that need to be converted 3984
3920 * when IO is completed. 3985 * when IO is completed.
@@ -3973,7 +4038,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3973 struct file *file = iocb->ki_filp; 4038 struct file *file = iocb->ki_filp;
3974 struct inode *inode = file->f_mapping->host; 4039 struct inode *inode = file->f_mapping->host;
3975 4040
3976 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 4041 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4042 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3978 4043
3979 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4044 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4105,17 +4170,6 @@ int ext4_block_truncate_page(handle_t *handle,
4105 length = blocksize - (offset & (blocksize - 1)); 4170 length = blocksize - (offset & (blocksize - 1));
4106 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 4171 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4107 4172
4108 /*
4109 * For "nobh" option, we can only work if we don't need to
4110 * read-in the page - otherwise we create buffers to do the IO.
4111 */
4112 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
4113 ext4_should_writeback_data(inode) && PageUptodate(page)) {
4114 zero_user(page, offset, length);
4115 set_page_dirty(page);
4116 goto unlock;
4117 }
4118
4119 if (!page_has_buffers(page)) 4173 if (!page_has_buffers(page))
4120 create_empty_buffers(page, blocksize, 0); 4174 create_empty_buffers(page, blocksize, 0);
4121 4175
@@ -4302,10 +4356,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4302 4356
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4357 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) { 4358 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: " 4359 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4306 "attempt to clear blocks %llu len %lu, invalid", 4360 "blocks %llu len %lu",
4307 inode->i_ino, (unsigned long long) block_to_free, 4361 (unsigned long long) block_to_free, count);
4308 count);
4309 return 1; 4362 return 1;
4310 } 4363 }
4311 4364
@@ -4410,11 +4463,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4463 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4411 ext4_handle_dirty_metadata(handle, inode, this_bh); 4464 ext4_handle_dirty_metadata(handle, inode, this_bh);
4412 else 4465 else
4413 ext4_error(inode->i_sb, 4466 EXT4_ERROR_INODE(inode,
4414 "circular indirect block detected, " 4467 "circular indirect block detected at "
4415 "inode=%lu, block=%llu", 4468 "block %llu",
4416 inode->i_ino, 4469 (unsigned long long) this_bh->b_blocknr);
4417 (unsigned long long) this_bh->b_blocknr);
4418 } 4470 }
4419} 4471}
4420 4472
@@ -4452,11 +4504,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4452 4504
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4505 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) { 4506 nr, 1)) {
4455 ext4_error(inode->i_sb, 4507 EXT4_ERROR_INODE(inode,
4456 "indirect mapped block in inode " 4508 "invalid indirect mapped "
4457 "#%lu invalid (level %d, blk #%lu)", 4509 "block %lu (level %d)",
4458 inode->i_ino, depth, 4510 (unsigned long) nr, depth);
4459 (unsigned long) nr);
4460 break; 4511 break;
4461 } 4512 }
4462 4513
@@ -4468,9 +4519,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4468 * (should be rare). 4519 * (should be rare).
4469 */ 4520 */
4470 if (!bh) { 4521 if (!bh) {
4471 ext4_error(inode->i_sb, 4522 EXT4_ERROR_INODE_BLOCK(inode, nr,
4472 "Read failure, inode=%lu, block=%llu", 4523 "Read failure");
4473 inode->i_ino, nr);
4474 continue; 4524 continue;
4475 } 4525 }
4476 4526
@@ -4482,27 +4532,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4482 depth); 4532 depth);
4483 4533
4484 /* 4534 /*
4485 * We've probably journalled the indirect block several
4486 * times during the truncate. But it's no longer
4487 * needed and we now drop it from the transaction via
4488 * jbd2_journal_revoke().
4489 *
4490 * That's easy if it's exclusively part of this
4491 * transaction. But if it's part of the committing
4492 * transaction then jbd2_journal_forget() will simply
4493 * brelse() it. That means that if the underlying
4494 * block is reallocated in ext4_get_block(),
4495 * unmap_underlying_metadata() will find this block
4496 * and will try to get rid of it. damn, damn.
4497 *
4498 * If this block has already been committed to the
4499 * journal, a revoke record will be written. And
4500 * revoke records must be emitted *before* clearing
4501 * this block's bit in the bitmaps.
4502 */
4503 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
4504
4505 /*
4506 * Everything below this pointer has been 4535
4507 * released. Now let this top-of-subtree go. 4536 * released. Now let this top-of-subtree go.
4508 * 4537 *
@@ -4526,8 +4555,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4526 blocks_for_truncate(inode)); 4555 blocks_for_truncate(inode));
4527 } 4556 }
4528 4557
4558 /*
4559 * The forget flag here is critical because if
4560 * we are journaling (and not doing data
4561 * journaling), we have to make sure a revoke
4562 * record is written to prevent the journal
4563 * replay from overwriting the (former)
4564 * indirect block if it gets reallocated as a
4565 * data block. This must happen in the same
4566 * transaction where the data blocks are
4567 * actually freed.
4568 */
4529 ext4_free_blocks(handle, inode, 0, nr, 1, 4569 ext4_free_blocks(handle, inode, 0, nr, 1,
4530 EXT4_FREE_BLOCKS_METADATA); 4570 EXT4_FREE_BLOCKS_METADATA|
4571 EXT4_FREE_BLOCKS_FORGET);
4531 4572
4532 if (parent_bh) { 4573 if (parent_bh) {
4533 /* 4574 /*
@@ -4612,12 +4653,12 @@ void ext4_truncate(struct inode *inode)
4612 if (!ext4_can_truncate(inode)) 4653 if (!ext4_can_truncate(inode))
4613 return; 4654 return;
4614 4655
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4656 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4616 4657
4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4658 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4659 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4619 4660
4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4661 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4621 ext4_ext_truncate(inode); 4662 ext4_ext_truncate(inode);
4622 return; 4663 return;
4623 } 4664 }
@@ -4785,8 +4826,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4785 4826
4786 bh = sb_getblk(sb, block); 4827 bh = sb_getblk(sb, block);
4787 if (!bh) { 4828 if (!bh) {
4788 ext4_error(sb, "unable to read inode block - " 4829 EXT4_ERROR_INODE_BLOCK(inode, block,
4789 "inode=%lu, block=%llu", inode->i_ino, block); 4830 "unable to read itable block");
4790 return -EIO; 4831 return -EIO;
4791 } 4832 }
4792 if (!buffer_uptodate(bh)) { 4833 if (!buffer_uptodate(bh)) {
@@ -4884,8 +4925,8 @@ make_io:
4884 submit_bh(READ_META, bh); 4925 submit_bh(READ_META, bh);
4885 wait_on_buffer(bh); 4926 wait_on_buffer(bh);
4886 if (!buffer_uptodate(bh)) { 4927 if (!buffer_uptodate(bh)) {
4887 ext4_error(sb, "unable to read inode block - inode=%lu," 4928 EXT4_ERROR_INODE_BLOCK(inode, block,
4888 " block=%llu", inode->i_ino, block); 4929 "unable to read itable block");
4889 brelse(bh); 4930 brelse(bh);
4890 return -EIO; 4931 return -EIO;
4891 } 4932 }
@@ -4922,20 +4963,26 @@ void ext4_set_inode_flags(struct inode *inode)
4922/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4963/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4923void ext4_get_inode_flags(struct ext4_inode_info *ei) 4964void ext4_get_inode_flags(struct ext4_inode_info *ei)
4924{ 4965{
4925 unsigned int flags = ei->vfs_inode.i_flags; 4966 unsigned int vfs_fl;
4926 4967 unsigned long old_fl, new_fl;
4927 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4968
4928 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4969 do {
4929 if (flags & S_SYNC) 4970 vfs_fl = ei->vfs_inode.i_flags;
4930 ei->i_flags |= EXT4_SYNC_FL; 4971 old_fl = ei->i_flags;
4931 if (flags & S_APPEND) 4972 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4932 ei->i_flags |= EXT4_APPEND_FL; 4973 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4933 if (flags & S_IMMUTABLE) 4974 EXT4_DIRSYNC_FL);
4934 ei->i_flags |= EXT4_IMMUTABLE_FL; 4975 if (vfs_fl & S_SYNC)
4935 if (flags & S_NOATIME) 4976 new_fl |= EXT4_SYNC_FL;
4936 ei->i_flags |= EXT4_NOATIME_FL; 4977 if (vfs_fl & S_APPEND)
4937 if (flags & S_DIRSYNC) 4978 new_fl |= EXT4_APPEND_FL;
4938 ei->i_flags |= EXT4_DIRSYNC_FL; 4979 if (vfs_fl & S_IMMUTABLE)
4980 new_fl |= EXT4_IMMUTABLE_FL;
4981 if (vfs_fl & S_NOATIME)
4982 new_fl |= EXT4_NOATIME_FL;
4983 if (vfs_fl & S_DIRSYNC)
4984 new_fl |= EXT4_DIRSYNC_FL;
4985 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4939} 4986}
4940 4987
4941static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4988static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
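The ext4_get_inode_flags() rewrite above replaces a non-atomic read-modify-write of ei->i_flags with a lock-free cmpxchg() retry loop. The underlying pattern, reduced to its essentials (compute_new() is a hypothetical stand-in for the flag translation):

        unsigned long old_fl, new_fl;

        do {
                old_fl = ei->i_flags;          /* snapshot current value */
                new_fl = compute_new(old_fl);  /* derive update from snapshot */
        } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);

cmpxchg() stores new_fl only if i_flags still equals old_fl; if another CPU changed the flags in the meantime, the loop re-reads and recomputes, so no concurrent update is lost without taking a lock.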
@@ -4950,7 +4997,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4950 /* we are using combined 48 bit field */ 4997 /* we are using combined 48 bit field */
4951 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4998 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4952 le32_to_cpu(raw_inode->i_blocks_lo); 4999 le32_to_cpu(raw_inode->i_blocks_lo);
4953 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 5000 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4954 /* i_blocks is in units of the file system block size */ 5001
4955 return i_blocks << (inode->i_blkbits - 9); 5002 return i_blocks << (inode->i_blkbits - 9);
4956 } else { 5003 } else {
@@ -5046,7 +5093,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5046 transaction_t *transaction; 5093 transaction_t *transaction;
5047 tid_t tid; 5094 tid_t tid;
5048 5095
5049 spin_lock(&journal->j_state_lock); 5096 read_lock(&journal->j_state_lock);
5050 if (journal->j_running_transaction) 5097 if (journal->j_running_transaction)
5051 transaction = journal->j_running_transaction; 5098 transaction = journal->j_running_transaction;
5052 else 5099 else
@@ -5055,7 +5102,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5055 tid = transaction->t_tid; 5102 tid = transaction->t_tid;
5056 else 5103 else
5057 tid = journal->j_commit_sequence; 5104 tid = journal->j_commit_sequence;
5058 spin_unlock(&journal->j_state_lock); 5105 read_unlock(&journal->j_state_lock);
5059 ei->i_sync_tid = tid; 5106 ei->i_sync_tid = tid;
5060 ei->i_datasync_tid = tid; 5107 ei->i_datasync_tid = tid;
5061 } 5108 }
@@ -5096,11 +5143,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5096 ret = 0; 5143 ret = 0;
5097 if (ei->i_file_acl && 5144 if (ei->i_file_acl &&
5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5145 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5146 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5100 ei->i_file_acl, inode->i_ino); 5147 ei->i_file_acl);
5101 ret = -EIO; 5148 ret = -EIO;
5102 goto bad_inode; 5149 goto bad_inode;
5103 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5150 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5104 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5151 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5105 (S_ISLNK(inode->i_mode) && 5152 (S_ISLNK(inode->i_mode) &&
5106 !ext4_inode_is_fast_symlink(inode))) 5153 !ext4_inode_is_fast_symlink(inode)))
@@ -5142,8 +5189,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5189 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5143 } else { 5190 } else {
5144 ret = -EIO; 5191 ret = -EIO;
5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5192 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5146 inode->i_mode, inode->i_ino);
5147 goto bad_inode; 5193 goto bad_inode;
5148 } 5194 }
5149 brelse(iloc.bh); 5195 brelse(iloc.bh);
@@ -5172,7 +5218,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
5172 */ 5218 */
5173 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5219 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5174 raw_inode->i_blocks_high = 0; 5220 raw_inode->i_blocks_high = 0;
5175 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5221 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5176 return 0; 5222 return 0;
5177 } 5223 }
5178 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5224 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5185,9 +5231,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
5185 */ 5231 */
5186 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5232 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5187 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5233 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5188 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5234 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5189 } else { 5235 } else {
5190 ei->i_flags |= EXT4_HUGE_FILE_FL; 5236 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5191 /* i_blocks is stored in units of the file system block size */ 5237
5192 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5238 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5193 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5239 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5381,9 +5427,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5381 if (wbc->sync_mode == WB_SYNC_ALL) 5427 if (wbc->sync_mode == WB_SYNC_ALL)
5382 sync_dirty_buffer(iloc.bh); 5428 sync_dirty_buffer(iloc.bh);
5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5429 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5384 ext4_error(inode->i_sb, "IO error syncing inode, " 5430 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5385 "inode=%lu, block=%llu", inode->i_ino, 5431 "IO error syncing inode");
5386 (unsigned long long)iloc.bh->b_blocknr);
5387 err = -EIO; 5432 err = -EIO;
5388 } 5433 }
5389 brelse(iloc.bh); 5434 brelse(iloc.bh);
@@ -5425,7 +5470,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5425 if (error) 5470 if (error)
5426 return error; 5471 return error;
5427 5472
5428 if (ia_valid & ATTR_SIZE) 5473 if (is_quota_modification(inode, attr))
5429 dquot_initialize(inode); 5474 dquot_initialize(inode);
5430 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5475 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5431 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5476 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
@@ -5455,20 +5500,18 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5455 } 5500 }
5456 5501
5457 if (attr->ia_valid & ATTR_SIZE) { 5502 if (attr->ia_valid & ATTR_SIZE) {
5458 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5503 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5459 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5504 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5460 5505
5461 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5506 if (attr->ia_size > sbi->s_bitmap_maxbytes)
5462 error = -EFBIG; 5507 return -EFBIG;
5463 goto err_out;
5464 }
5465 } 5508 }
5466 } 5509 }
5467 5510
5468 if (S_ISREG(inode->i_mode) && 5511 if (S_ISREG(inode->i_mode) &&
5469 attr->ia_valid & ATTR_SIZE && 5512 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size || 5513 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5514 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5472 handle_t *handle; 5515 handle_t *handle;
5473 5516
5474 handle = ext4_journal_start(inode, 3); 5517 handle = ext4_journal_start(inode, 3);
@@ -5500,15 +5543,23 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5500 } 5543 }
5501 } 5544 }
5502 /* ext4_truncate will clear the flag */ 5545 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5546 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5504 ext4_truncate(inode); 5547 ext4_truncate(inode);
5505 } 5548 }
5506 5549
5507 rc = inode_setattr(inode, attr); 5550 if ((attr->ia_valid & ATTR_SIZE) &&
5551 attr->ia_size != i_size_read(inode))
5552 rc = vmtruncate(inode, attr->ia_size);
5553
5554 if (!rc) {
5555 setattr_copy(inode, attr);
5556 mark_inode_dirty(inode);
5557 }
5508 5558
5509 /* If inode_setattr's call to ext4_truncate failed to get a 5559 /*
5510 * transaction handle at all, we need to clean up the in-core 5560 * If the call to ext4_truncate failed to get a transaction handle at
5511 * orphan list manually. */ 5561 * all, we need to clean up the in-core orphan list manually.
5562 */
5512 if (inode->i_nlink) 5563 if (inode->i_nlink)
5513 ext4_orphan_del(NULL, inode); 5564 ext4_orphan_del(NULL, inode);
5514 5565
@@ -5576,7 +5627,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5576 5627
5577static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5628static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5578{ 5629{
5579 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5630 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5580 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5631 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5581 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5632 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5582} 5633}
@@ -5663,7 +5714,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
5663 * Calculate the journal credits for a chunk of data modification. 5714 * Calculate the journal credits for a chunk of data modification.
5664 * 5715 *
5665 * This is called from DIO, fallocate or whatever else calls 5716
5666 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks. 5717 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5667 * 5718 *
5668 * journal buffers for data blocks are not included here, as DIO 5719 * journal buffers for data blocks are not included here, as DIO
5669 * and fallocate do no need to journal data buffers. 5720 * and fallocate do no need to journal data buffers.
@@ -5729,7 +5780,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
5729{ 5780{
5730 struct ext4_inode *raw_inode; 5781 struct ext4_inode *raw_inode;
5731 struct ext4_xattr_ibody_header *header; 5782 struct ext4_xattr_ibody_header *header;
5732 struct ext4_xattr_entry *entry;
5733 5783
5734 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5784 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5735 return 0; 5785 return 0;
@@ -5737,7 +5787,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
5737 raw_inode = ext4_raw_inode(&iloc); 5787 raw_inode = ext4_raw_inode(&iloc);
5738 5788
5739 header = IHDR(inode, raw_inode); 5789 header = IHDR(inode, raw_inode);
5740 entry = IFIRST(header);
5741 5790
5742 /* No extended attributes present */ 5791 /* No extended attributes present */
5743 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5792 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
@@ -5911,9 +5960,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5911 */ 5960 */
5912 5961
5913 if (val) 5962 if (val)
5914 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5963 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 else 5964 else
5916 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5965 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5917 ext4_set_aops(inode); 5966 ext4_set_aops(inode);
5918 5967
5919 jbd2_journal_unlock_updates(journal); 5968 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
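The new EXT4_IOC32_GROUP_ADD case follows the usual compat-ioctl thunk pattern: each 32-bit field is fetched with get_user(), a native ext4_new_group_input is assembled, and the address-limit check is temporarily widened so the native handler will accept a pointer into kernel memory. The core of the pattern, condensed from the hunk above:

        mm_segment_t old_fs = get_fs();

        set_fs(KERNEL_DS);      /* let ext4_ioctl() take a kernel pointer */
        err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, (unsigned long) &input);
        set_fs(old_fs);         /* always restore the old limit */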
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..4b4ad4b7ce57 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -446,10 +446,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
447 blocknr += first + i; 447 blocknr += first + i;
448 ext4_grp_locked_error(sb, e4b->bd_group, 448 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 449 inode ? inode->i_ino : 0,
450 " %lu's block %llu(bit %u in group %u)", 450 blocknr,
451 inode ? inode->i_ino : 0, blocknr, 451 "freeing block already freed "
452 first + i, e4b->bd_group); 452 "(bit %u)",
453 first + i);
453 } 454 }
454 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 455 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
455 } 456 }
@@ -658,6 +659,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
658 } 659 }
659} 660}
660 661
662/*
663 * Cache the order of the largest free extent we have available in this block
664 * group.
665 */
666static void
667mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
668{
669 int i;
670 int bits;
671
672 grp->bb_largest_free_order = -1; /* uninit */
673
674 bits = sb->s_blocksize_bits + 1;
675 for (i = bits; i >= 0; i--) {
676 if (grp->bb_counters[i] > 0) {
677 grp->bb_largest_free_order = i;
678 break;
679 }
680 }
681}
682
661static noinline_for_stack 683static noinline_for_stack
662void ext4_mb_generate_buddy(struct super_block *sb, 684void ext4_mb_generate_buddy(struct super_block *sb,
663 void *buddy, void *bitmap, ext4_group_t group) 685 void *buddy, void *bitmap, ext4_group_t group)
@@ -691,15 +713,16 @@ void ext4_mb_generate_buddy(struct super_block *sb,
691 grp->bb_fragments = fragments; 713 grp->bb_fragments = fragments;
692 714
693 if (free != grp->bb_free) { 715 if (free != grp->bb_free) {
694 ext4_grp_locked_error(sb, group, __func__, 716 ext4_grp_locked_error(sb, group, 0, 0,
695 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", 717 "%u blocks in bitmap, %u in gd",
696 group, free, grp->bb_free); 718 free, grp->bb_free);
697 /* 719 /*
698 * If we intent to continue, we consider group descritor 720 * If we intent to continue, we consider group descritor
699 * corrupt and update bb_free using bitmap value 721 * corrupt and update bb_free using bitmap value
700 */ 722 */
701 grp->bb_free = free; 723 grp->bb_free = free;
702 } 724 }
725 mb_set_largest_free_order(sb, grp);
703 726
704 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 727 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
705 728
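mb_set_largest_free_order() is called here and again from mb_free_blocks() and mb_mark_used() below, so grp->bb_largest_free_order tracks bb_counters[] wherever the set of free extents changes. The payoff comes in ext4_mb_good_group(), which can then reject a group with a single comparison instead of scanning the per-order counters (as shown later in this patch):

        /* cr == 0: we need a free extent of order >= ac->ac_2order */
        if (grp->bb_largest_free_order < ac->ac_2order)
                return 0;       /* group cannot satisfy the request */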
@@ -725,6 +748,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
725 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 748 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
726 * So it can have information regarding groups_per_page which 749 * So it can have information regarding groups_per_page which
727 * is blocks_per_page/2 750 * is blocks_per_page/2
751 *
752 * Locking note: This routine takes the block group lock of all groups
753 * for this page; do not hold this lock when calling this routine!
728 */ 754 */
729 755
730static int ext4_mb_init_cache(struct page *page, char *incore) 756static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +891,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
865 BUG_ON(incore == NULL); 891 BUG_ON(incore == NULL);
866 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 892 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
867 group, page->index, i * blocksize); 893 group, page->index, i * blocksize);
894 trace_ext4_mb_buddy_bitmap_load(sb, group);
868 grinfo = ext4_get_group_info(sb, group); 895 grinfo = ext4_get_group_info(sb, group);
869 grinfo->bb_fragments = 0; 896 grinfo->bb_fragments = 0;
870 memset(grinfo->bb_counters, 0, 897 memset(grinfo->bb_counters, 0,
@@ -882,6 +909,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
882 BUG_ON(incore != NULL); 909 BUG_ON(incore != NULL);
883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 910 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
884 group, page->index, i * blocksize); 911 group, page->index, i * blocksize);
912 trace_ext4_mb_bitmap_load(sb, group);
885 913
886 /* see comments in ext4_mb_put_pa() */ 914 /* see comments in ext4_mb_put_pa() */
887 ext4_lock_group(sb, group); 915 ext4_lock_group(sb, group);
@@ -910,6 +938,11 @@ out:
910 return err; 938 return err;
911} 939}
912 940
941/*
942 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
943 * block group lock of all groups for this page; do not hold the BG lock when
944 * calling this routine!
945 */
913static noinline_for_stack 946static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 947int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{ 948{
@@ -1004,6 +1037,11 @@ err:
1004 return ret; 1037 return ret;
1005} 1038}
1006 1039
1040/*
1041 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1042 * block group lock of all groups for this page; do not hold the BG lock when
1043 * calling this routine!
1044 */
1007static noinline_for_stack int 1045static noinline_for_stack int
1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1046ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1009 struct ext4_buddy *e4b) 1047 struct ext4_buddy *e4b)
@@ -1150,7 +1188,7 @@ err:
1150 return ret; 1188 return ret;
1151} 1189}
1152 1190
1153static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1191static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1154{ 1192{
1155 if (e4b->bd_bitmap_page) 1193 if (e4b->bd_bitmap_page)
1156 page_cache_release(e4b->bd_bitmap_page); 1194 page_cache_release(e4b->bd_bitmap_page);
@@ -1259,10 +1297,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1259 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1297 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1260 blocknr += block; 1298 blocknr += block;
1261 ext4_grp_locked_error(sb, e4b->bd_group, 1299 ext4_grp_locked_error(sb, e4b->bd_group,
1262 __func__, "double-free of inode" 1300 inode ? inode->i_ino : 0,
1263 " %lu's block %llu(bit %u in group %u)", 1301 blocknr,
1264 inode ? inode->i_ino : 0, blocknr, block, 1302 "freeing already freed block "
1265 e4b->bd_group); 1303 "(bit %u)", block);
1266 } 1304 }
1267 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1305 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1268 e4b->bd_info->bb_counters[order]++; 1306 e4b->bd_info->bb_counters[order]++;
@@ -1299,6 +1337,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1299 buddy = buddy2; 1337 buddy = buddy2;
1300 } while (1); 1338 } while (1);
1301 } 1339 }
1340 mb_set_largest_free_order(sb, e4b->bd_info);
1302 mb_check_buddy(e4b); 1341 mb_check_buddy(e4b);
1303} 1342}
1304 1343
@@ -1427,6 +1466,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1427 e4b->bd_info->bb_counters[ord]++; 1466 e4b->bd_info->bb_counters[ord]++;
1428 e4b->bd_info->bb_counters[ord]++; 1467 e4b->bd_info->bb_counters[ord]++;
1429 } 1468 }
1469 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1430 1470
1431 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1471 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1432 mb_check_buddy(e4b); 1472 mb_check_buddy(e4b);
@@ -1617,7 +1657,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1617 } 1657 }
1618 1658
1619 ext4_unlock_group(ac->ac_sb, group); 1659 ext4_unlock_group(ac->ac_sb, group);
1620 ext4_mb_release_desc(e4b); 1660 ext4_mb_unload_buddy(e4b);
1621 1661
1622 return 0; 1662 return 0;
1623} 1663}
@@ -1672,7 +1712,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1672 ext4_mb_use_best_found(ac, e4b); 1712 ext4_mb_use_best_found(ac, e4b);
1673 } 1713 }
1674 ext4_unlock_group(ac->ac_sb, group); 1714 ext4_unlock_group(ac->ac_sb, group);
1675 ext4_mb_release_desc(e4b); 1715 ext4_mb_unload_buddy(e4b);
1676 1716
1677 return 0; 1717 return 0;
1678} 1718}
@@ -1749,8 +1789,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1749 * free blocks even though group info says we 1789
1750 * have free blocks 1790
1751 */ 1791 */
1752 ext4_grp_locked_error(sb, e4b->bd_group, 1792 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1753 __func__, "%d free blocks as per " 1793 "%d free blocks as per "
1754 "group info. But bitmap says 0", 1794 "group info. But bitmap says 0",
1755 free); 1795 free);
1756 break; 1796 break;
@@ -1759,8 +1799,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1759 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1799 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1760 BUG_ON(ex.fe_len <= 0); 1800 BUG_ON(ex.fe_len <= 0);
1761 if (free < ex.fe_len) { 1801 if (free < ex.fe_len) {
1762 ext4_grp_locked_error(sb, e4b->bd_group, 1802 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1763 __func__, "%d free blocks as per " 1803 "%d free blocks as per "
1764 "group info. But got %d blocks", 1804 "group info. But got %d blocks",
1765 free, ex.fe_len); 1805 free, ex.fe_len);
1766 /* 1806 /*
@@ -1782,8 +1822,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1782 1822
1783/* 1823/*
1784 * This is a special case for storage like raid5 1824
1785 * we try to find stripe-aligned chunks for stripe-size requests 1825 * we try to find stripe-aligned chunks for stripe-size-multiple requests
1786 * XXX should do so at least for multiples of stripe size as well
1787 */ 1826 */
1788static noinline_for_stack 1827static noinline_for_stack
1789void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1828void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1821 } 1860 }
1822} 1861}
1823 1862
1863/* This is now called BEFORE we load the buddy bitmap. */
1824static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1864static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1825 ext4_group_t group, int cr) 1865 ext4_group_t group, int cr)
1826{ 1866{
1827 unsigned free, fragments; 1867 unsigned free, fragments;
1828 unsigned i, bits;
1829 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1868 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1830 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1869 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1831 1870
1832 BUG_ON(cr < 0 || cr >= 4); 1871 BUG_ON(cr < 0 || cr >= 4);
1833 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1872
1873 /* We only do this if the grp has never been initialized */
1874 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1875 int ret = ext4_mb_init_group(ac->ac_sb, group);
1876 if (ret)
1877 return 0;
1878 }
1834 1879
1835 free = grp->bb_free; 1880 free = grp->bb_free;
1836 fragments = grp->bb_fragments; 1881 fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1843 case 0: 1888 case 0:
1844 BUG_ON(ac->ac_2order == 0); 1889 BUG_ON(ac->ac_2order == 0);
1845 1890
1891 if (grp->bb_largest_free_order < ac->ac_2order)
1892 return 0;
1893
1846 /* Avoid using the first bg of a flexgroup for data files */ 1894 /* Avoid using the first bg of a flexgroup for data files */
1847 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1895 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1848 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1896 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1849 ((group % flex_size) == 0)) 1897 ((group % flex_size) == 0))
1850 return 0; 1898 return 0;
1851 1899
1852 bits = ac->ac_sb->s_blocksize_bits + 1; 1900 return 1;
1853 for (i = ac->ac_2order; i <= bits; i++)
1854 if (grp->bb_counters[i] > 0)
1855 return 1;
1856 break;
1857 case 1: 1901 case 1:
1858 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1902 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1859 return 1; 1903 return 1;
@@ -1955,7 +1999,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1955 ext4_group_t ngroups, group, i; 1999 ext4_group_t ngroups, group, i;
1956 int cr; 2000 int cr;
1957 int err = 0; 2001 int err = 0;
1958 int bsbits;
1959 struct ext4_sb_info *sbi; 2002 struct ext4_sb_info *sbi;
1960 struct super_block *sb; 2003 struct super_block *sb;
1961 struct ext4_buddy e4b; 2004 struct ext4_buddy e4b;
@@ -1964,7 +2007,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1964 sbi = EXT4_SB(sb); 2007 sbi = EXT4_SB(sb);
1965 ngroups = ext4_get_groups_count(sb); 2008 ngroups = ext4_get_groups_count(sb);
1966 /* non-extent files are limited to low blocks/groups */ 2009 /* non-extent files are limited to low blocks/groups */
1967 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) 2010 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
1968 ngroups = sbi->s_blockfile_groups; 2011 ngroups = sbi->s_blockfile_groups;
1969 2012
1970 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2013 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -1997,8 +2040,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1997 ac->ac_2order = i - 1; 2040 ac->ac_2order = i - 1;
1998 } 2041 }
1999 2042
2000 bsbits = ac->ac_sb->s_blocksize_bits;
2001
2002 /* if stream allocation is enabled, use global goal */ 2043 /* if stream allocation is enabled, use global goal */
2003 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2044 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2004 /* TBD: may be hot point */ 2045 /* TBD: may be hot point */
@@ -2024,15 +2065,11 @@ repeat:
2024 group = ac->ac_g_ex.fe_group; 2065 group = ac->ac_g_ex.fe_group;
2025 2066
2026 for (i = 0; i < ngroups; group++, i++) { 2067 for (i = 0; i < ngroups; group++, i++) {
2027 struct ext4_group_info *grp;
2028 struct ext4_group_desc *desc;
2029
2030 if (group == ngroups) 2068 if (group == ngroups)
2031 group = 0; 2069 group = 0;
2032 2070
2033 /* quick check to skip empty groups */ 2071 /* This now checks without needing the buddy page */
2034 grp = ext4_get_group_info(sb, group); 2072 if (!ext4_mb_good_group(ac, group, cr))
2035 if (grp->bb_free == 0)
2036 continue; 2073 continue;
2037 2074
2038 err = ext4_mb_load_buddy(sb, group, &e4b); 2075 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,25 +2077,28 @@ repeat:
2040 goto out; 2077 goto out;
2041 2078
2042 ext4_lock_group(sb, group); 2079 ext4_lock_group(sb, group);
2080
2081 /*
2082 * We need to check again after locking the
2083 * block group
2084 */
2043 if (!ext4_mb_good_group(ac, group, cr)) { 2085 if (!ext4_mb_good_group(ac, group, cr)) {
2044 /* someone did allocation from this group */
2045 ext4_unlock_group(sb, group); 2086 ext4_unlock_group(sb, group);
2046 ext4_mb_release_desc(&e4b); 2087 ext4_mb_unload_buddy(&e4b);
2047 continue; 2088 continue;
2048 } 2089 }
2049 2090
2050 ac->ac_groups_scanned++; 2091 ac->ac_groups_scanned++;
2051 desc = ext4_get_group_desc(sb, group, NULL);
2052 if (cr == 0) 2092 if (cr == 0)
2053 ext4_mb_simple_scan_group(ac, &e4b); 2093 ext4_mb_simple_scan_group(ac, &e4b);
2054 else if (cr == 1 && 2094 else if (cr == 1 && sbi->s_stripe &&
2055 ac->ac_g_ex.fe_len == sbi->s_stripe) 2095 !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2056 ext4_mb_scan_aligned(ac, &e4b); 2096 ext4_mb_scan_aligned(ac, &e4b);
2057 else 2097 else
2058 ext4_mb_complex_scan_group(ac, &e4b); 2098 ext4_mb_complex_scan_group(ac, &e4b);
2059 2099
2060 ext4_unlock_group(sb, group); 2100 ext4_unlock_group(sb, group);
2061 ext4_mb_release_desc(&e4b); 2101 ext4_mb_unload_buddy(&e4b);
2062 2102
2063 if (ac->ac_status != AC_STATUS_CONTINUE) 2103 if (ac->ac_status != AC_STATUS_CONTINUE)
2064 break; 2104 break;
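The allocator loop above now follows a double-checked pattern: ext4_mb_good_group() is first called without the group lock (a cheap rejection that also initializes the group info on demand), then re-checked under ext4_lock_group() before scanning, since another CPU may have allocated from the group in between. Schematically:

        if (!ext4_mb_good_group(ac, group, cr))   /* lockless pre-check */
                continue;
        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err)
                goto out;
        ext4_lock_group(sb, group);
        if (!ext4_mb_good_group(ac, group, cr)) { /* re-check under the lock */
                ext4_unlock_group(sb, group);
                ext4_mb_unload_buddy(&e4b);
                continue;
        }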
@@ -2148,7 +2188,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2148 ext4_lock_group(sb, group); 2188 ext4_lock_group(sb, group);
2149 memcpy(&sg, ext4_get_group_info(sb, group), i); 2189 memcpy(&sg, ext4_get_group_info(sb, group), i);
2150 ext4_unlock_group(sb, group); 2190 ext4_unlock_group(sb, group);
2151 ext4_mb_release_desc(&e4b); 2191 ext4_mb_unload_buddy(&e4b);
2152 2192
2153 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2193 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2154 sg.info.bb_fragments, sg.info.bb_first_free); 2194 sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2178,7 +2218,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2178 2218
2179 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2219 rc = seq_open(file, &ext4_mb_seq_groups_ops);
2180 if (rc == 0) { 2220 if (rc == 0) {
2181 struct seq_file *m = (struct seq_file *)file->private_data; 2221 struct seq_file *m = file->private_data;
2182 m->private = sb; 2222 m->private = sb;
2183 } 2223 }
2184 return rc; 2224 return rc;
@@ -2255,6 +2295,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2255 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2295 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2256 init_rwsem(&meta_group_info[i]->alloc_sem); 2296 init_rwsem(&meta_group_info[i]->alloc_sem);
2257 meta_group_info[i]->bb_free_root = RB_ROOT; 2297 meta_group_info[i]->bb_free_root = RB_ROOT;
2298 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2258 2299
2259#ifdef DOUBLE_CHECK 2300#ifdef DOUBLE_CHECK
2260 { 2301 {
@@ -2516,6 +2557,22 @@ int ext4_mb_release(struct super_block *sb)
2516 return 0; 2557 return 0;
2517} 2558}
2518 2559
2560static inline void ext4_issue_discard(struct super_block *sb,
2561 ext4_group_t block_group, ext4_grpblk_t block, int count)
2562{
2563 int ret;
2564 ext4_fsblk_t discard_block;
2565
2566 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count);
2570 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2573 }
2574}
2575
2519/* 2576/*
2520 * This function is called by the jbd2 layer once the commit has finished, 2577 * This function is called by the jbd2 layer once the commit has finished,
2521 * so we know we can free the blocks that were released with that commit. 2578 * so we know we can free the blocks that were released with that commit.
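The new ext4_issue_discard() helper centralizes the discard path and, unlike the open-coded version it replaces in the next hunk, reacts to an unsupported-discard result by clearing the DISCARD mount option so no further discard requests are sent to the device. Its call site then shrinks to:

        if (test_opt(sb, DISCARD))
                ext4_issue_discard(sb, entry->group,
                                   entry->start_blk, entry->count);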
@@ -2535,16 +2592,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2535 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2592 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2536 entry->count, entry->group, entry); 2593 entry->count, entry->group, entry);
2537 2594
2538 if (test_opt(sb, DISCARD)) { 2595 if (test_opt(sb, DISCARD))
2539 ext4_fsblk_t discard_block; 2596 ext4_issue_discard(sb, entry->group,
2540 2597 entry->start_blk, entry->count);
2541 discard_block = entry->start_blk +
2542 ext4_group_first_block_no(sb, entry->group);
2543 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block,
2545 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count);
2547 }
2548 2598
2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2550 /* we expect to find existing buddy because it's pinned */ 2600 /* we expect to find existing buddy because it's pinned */
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2568 } 2618 }
2569 ext4_unlock_group(sb, entry->group); 2619 ext4_unlock_group(sb, entry->group);
2570 kmem_cache_free(ext4_free_ext_cachep, entry); 2620 kmem_cache_free(ext4_free_ext_cachep, entry);
2571 ext4_mb_release_desc(&e4b); 2621 ext4_mb_unload_buddy(&e4b);
2572 } 2622 }
2573 2623
2574 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2624 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
2641 2691
2642void exit_ext4_mballoc(void) 2692void exit_ext4_mballoc(void)
2643{ 2693{
2644 /* 2694 /*
2645 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2646 * before destroying the slab cache. 2696 * before destroying the slab cache.
2647 */ 2697 */
@@ -2654,7 +2704,7 @@ void exit_ext4_mballoc(void)
2654 2704
2655 2705
2656/* 2706/*
2657 * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps 2707 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2658 * Returns 0 if success or error code 2708 * Returns 0 if success or error code
2659 */ 2709 */
2660static noinline_for_stack int 2710static noinline_for_stack int
@@ -2662,7 +2712,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2662 handle_t *handle, unsigned int reserv_blks) 2712 handle_t *handle, unsigned int reserv_blks)
2663{ 2713{
2664 struct buffer_head *bitmap_bh = NULL; 2714 struct buffer_head *bitmap_bh = NULL;
2665 struct ext4_super_block *es;
2666 struct ext4_group_desc *gdp; 2715 struct ext4_group_desc *gdp;
2667 struct buffer_head *gdp_bh; 2716 struct buffer_head *gdp_bh;
2668 struct ext4_sb_info *sbi; 2717 struct ext4_sb_info *sbi;
@@ -2675,8 +2724,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2675 2724
2676 sb = ac->ac_sb; 2725 sb = ac->ac_sb;
2677 sbi = EXT4_SB(sb); 2726 sbi = EXT4_SB(sb);
2678 es = sbi->s_es;
2679
2680 2727
2681 err = -EIO; 2728 err = -EIO;
2682 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2729 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
@@ -2762,7 +2809,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2762 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 2809 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2763 2810
2764out_err: 2811out_err:
2765 sb->s_dirt = 1; 2812 ext4_mark_super_dirty(sb);
2766 brelse(bitmap_bh); 2813 brelse(bitmap_bh);
2767 return err; 2814 return err;
2768} 2815}
@@ -2800,7 +2847,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2800 int bsbits, max; 2847 int bsbits, max;
2801 ext4_lblk_t end; 2848 ext4_lblk_t end;
2802 loff_t size, orig_size, start_off; 2849 loff_t size, orig_size, start_off;
2803 ext4_lblk_t start, orig_start; 2850 ext4_lblk_t start;
2804 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2851 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2805 struct ext4_prealloc_space *pa; 2852 struct ext4_prealloc_space *pa;
2806 2853
@@ -2831,6 +2878,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2831 size = size << bsbits; 2878 size = size << bsbits;
2832 if (size < i_size_read(ac->ac_inode)) 2879 if (size < i_size_read(ac->ac_inode))
2833 size = i_size_read(ac->ac_inode); 2880 size = i_size_read(ac->ac_inode);
2881 orig_size = size;
2834 2882
2835 /* max size of free chunks */ 2883 /* max size of free chunks */
2836 max = 2 << bsbits; 2884 max = 2 << bsbits;
@@ -2872,8 +2920,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2872 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 2920 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
2873 size = ac->ac_o_ex.fe_len << bsbits; 2921 size = ac->ac_o_ex.fe_len << bsbits;
2874 } 2922 }
2875 orig_size = size = size >> bsbits; 2923 size = size >> bsbits;
2876 orig_start = start = start_off >> bsbits; 2924 start = start_off >> bsbits;
2877 2925
2878 /* don't cover already allocated blocks in selected range */ 2926 /* don't cover already allocated blocks in selected range */
2879 if (ar->pleft && start <= ar->lleft) { 2927 if (ar->pleft && start <= ar->lleft) {
@@ -2981,7 +3029,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2981 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3029 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
2982 atomic_inc(&sbi->s_bal_reqs); 3030 atomic_inc(&sbi->s_bal_reqs);
2983 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3031 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
2984 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3032 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
2985 atomic_inc(&sbi->s_bal_success); 3033 atomic_inc(&sbi->s_bal_success);
2986 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3034 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
2987 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3035 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3171,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3123 continue; 3171 continue;
3124 3172
3125 /* non-extent files can't have physical blocks past 2^32 */ 3173 /* non-extent files can't have physical blocks past 2^32 */
3126 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && 3174 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3127 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3175 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3128 continue; 3176 continue;
3129 3177
@@ -3280,7 +3328,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3280 spin_unlock(&pa->pa_lock); 3328 spin_unlock(&pa->pa_lock);
3281 3329
3282 grp_blk = pa->pa_pstart; 3330 grp_blk = pa->pa_pstart;
3283 /* 3331 /*
3284 * If doing group-based preallocation, pa_pstart may be in the 3332 * If doing group-based preallocation, pa_pstart may be in the
3285 * next group when pa is used up 3333 * next group when pa is used up
3286 */ 3334 */
@@ -3497,7 +3545,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3497 ext4_group_t group; 3545 ext4_group_t group;
3498 ext4_grpblk_t bit; 3546 ext4_grpblk_t bit;
3499 unsigned long long grp_blk_start; 3547 unsigned long long grp_blk_start;
3500 sector_t start;
3501 int err = 0; 3548 int err = 0;
3502 int free = 0; 3549 int free = 0;
3503 3550
@@ -3517,10 +3564,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3517 if (bit >= end) 3564 if (bit >= end)
3518 break; 3565 break;
3519 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3566 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3520 start = ext4_group_first_block_no(sb, group) + bit;
3521 mb_debug(1, " free preallocated %u/%u in group %u\n", 3567 mb_debug(1, " free preallocated %u/%u in group %u\n",
3522 (unsigned) start, (unsigned) next - bit, 3568 (unsigned) ext4_group_first_block_no(sb, group) + bit,
3523 (unsigned) group); 3569 (unsigned) next - bit, (unsigned) group);
3524 free += next - bit; 3570 free += next - bit;
3525 3571
3526 if (ac) { 3572 if (ac) {
@@ -3531,7 +3577,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3531 trace_ext4_mballoc_discard(ac); 3577 trace_ext4_mballoc_discard(ac);
3532 } 3578 }
3533 3579
3534 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3580 trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
3535 next - bit); 3581 next - bit);
3536 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3582 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3537 bit = next + 1; 3583 bit = next + 1;
@@ -3541,8 +3587,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3541 pa, (unsigned long) pa->pa_lstart, 3587 pa, (unsigned long) pa->pa_lstart,
3542 (unsigned long) pa->pa_pstart, 3588 (unsigned long) pa->pa_pstart,
3543 (unsigned long) pa->pa_len); 3589 (unsigned long) pa->pa_len);
3544 ext4_grp_locked_error(sb, group, 3590 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3545 __func__, "free %u, pa_free %u",
3546 free, pa->pa_free); 3591 free, pa->pa_free);
3547 /* 3592 /*
3548 * pa is already deleted so we use the value obtained 3593 * pa is already deleted so we use the value obtained
@@ -3563,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3563 ext4_group_t group; 3608 ext4_group_t group;
3564 ext4_grpblk_t bit; 3609 ext4_grpblk_t bit;
3565 3610
3566 trace_ext4_mb_release_group_pa(ac, pa); 3611 trace_ext4_mb_release_group_pa(sb, ac, pa);
3567 BUG_ON(pa->pa_deleted == 0); 3612 BUG_ON(pa->pa_deleted == 0);
3568 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3569 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3697,7 +3742,7 @@ out:
3697 ext4_unlock_group(sb, group); 3742 ext4_unlock_group(sb, group);
3698 if (ac) 3743 if (ac)
3699 kmem_cache_free(ext4_ac_cachep, ac); 3744 kmem_cache_free(ext4_ac_cachep, ac);
3700 ext4_mb_release_desc(&e4b); 3745 ext4_mb_unload_buddy(&e4b);
3701 put_bh(bitmap_bh); 3746 put_bh(bitmap_bh);
3702 return free; 3747 return free;
3703} 3748}
@@ -3801,7 +3846,7 @@ repeat:
3801 if (bitmap_bh == NULL) { 3846 if (bitmap_bh == NULL) {
3802 ext4_error(sb, "Error reading block bitmap for %u", 3847 ext4_error(sb, "Error reading block bitmap for %u",
3803 group); 3848 group);
3804 ext4_mb_release_desc(&e4b); 3849 ext4_mb_unload_buddy(&e4b);
3805 continue; 3850 continue;
3806 } 3851 }
3807 3852
@@ -3810,7 +3855,7 @@ repeat:
3810 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3855 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
3811 ext4_unlock_group(sb, group); 3856 ext4_unlock_group(sb, group);
3812 3857
3813 ext4_mb_release_desc(&e4b); 3858 ext4_mb_unload_buddy(&e4b);
3814 put_bh(bitmap_bh); 3859 put_bh(bitmap_bh);
3815 3860
3816 list_del(&pa->u.pa_tmp_list); 3861 list_del(&pa->u.pa_tmp_list);
@@ -3839,6 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3839 struct super_block *sb = ac->ac_sb; 3884 struct super_block *sb = ac->ac_sb;
3840 ext4_group_t ngroups, i; 3885 ext4_group_t ngroups, i;
3841 3886
3887 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
3888 return;
3889
3842 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3890 printk(KERN_ERR "EXT4-fs: Can't allocate:"
3843 " Allocation context details:\n"); 3891 " Allocation context details:\n");
3844 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 3892 printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
@@ -4074,7 +4122,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4074 ext4_mb_release_group_pa(&e4b, pa, ac); 4122 ext4_mb_release_group_pa(&e4b, pa, ac);
4075 ext4_unlock_group(sb, group); 4123 ext4_unlock_group(sb, group);
4076 4124
4077 ext4_mb_release_desc(&e4b); 4125 ext4_mb_unload_buddy(&e4b);
4078 list_del(&pa->u.pa_tmp_list); 4126 list_del(&pa->u.pa_tmp_list);
4079 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4127 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4080 } 4128 }
@@ -4205,7 +4253,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4205 * to usual allocation 4253 * to usual allocation
4206 */ 4254 */
4207ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4255ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4208 struct ext4_allocation_request *ar, int *errp) 4256 struct ext4_allocation_request *ar, int *errp)
4209{ 4257{
4210 int freed; 4258 int freed;
4211 struct ext4_allocation_context *ac = NULL; 4259 struct ext4_allocation_context *ac = NULL;
@@ -4249,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4249 inquota = ar->len; 4297 inquota = ar->len;
4250 if (ar->len == 0) { 4298 if (ar->len == 0) {
4251 *errp = -EDQUOT; 4299 *errp = -EDQUOT;
4252 goto out3; 4300 goto out;
4253 } 4301 }
4254 } 4302 }
4255 4303
@@ -4257,13 +4305,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4257 if (!ac) { 4305 if (!ac) {
4258 ar->len = 0; 4306 ar->len = 0;
4259 *errp = -ENOMEM; 4307 *errp = -ENOMEM;
4260 goto out1; 4308 goto out;
4261 } 4309 }
4262 4310
4263 *errp = ext4_mb_initialize_context(ac, ar); 4311 *errp = ext4_mb_initialize_context(ac, ar);
4264 if (*errp) { 4312 if (*errp) {
4265 ar->len = 0; 4313 ar->len = 0;
4266 goto out2; 4314 goto out;
4267 } 4315 }
4268 4316
4269 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4317 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
@@ -4272,7 +4320,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4272 ext4_mb_normalize_request(ac, ar); 4320 ext4_mb_normalize_request(ac, ar);
4273repeat: 4321repeat:
4274 /* allocate space in core */ 4322 /* allocate space in core */
4275 ext4_mb_regular_allocator(ac); 4323 *errp = ext4_mb_regular_allocator(ac);
4324 if (*errp)
4325 goto errout;
4276 4326
4277 /* as we've just preallocated more space than 4327 /* as we've just preallocated more space than
4278 * user requested originally, we store allocated 4328 * user requested originally, we store allocated
@@ -4283,7 +4333,7 @@ repeat:
4283 } 4333 }
4284 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4334 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4285 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4335 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4286 if (*errp == -EAGAIN) { 4336 if (*errp == -EAGAIN) {
4287 /* 4337 /*
4288 * drop the reference that we took 4338 * drop the reference that we took
4289 * in ext4_mb_use_best_found 4339 * in ext4_mb_use_best_found
@@ -4294,12 +4344,10 @@ repeat:
4294 ac->ac_b_ex.fe_len = 0; 4344 ac->ac_b_ex.fe_len = 0;
4295 ac->ac_status = AC_STATUS_CONTINUE; 4345 ac->ac_status = AC_STATUS_CONTINUE;
4296 goto repeat; 4346 goto repeat;
4297 } else if (*errp) { 4347 } else if (*errp)
4348 errout:
4298 ext4_discard_allocated_blocks(ac); 4349 ext4_discard_allocated_blocks(ac);
4299 ac->ac_b_ex.fe_len = 0; 4350 else {
4300 ar->len = 0;
4301 ext4_mb_show_ac(ac);
4302 } else {
4303 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4351 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4304 ar->len = ac->ac_b_ex.fe_len; 4352 ar->len = ac->ac_b_ex.fe_len;
4305 } 4353 }
@@ -4308,19 +4356,19 @@ repeat:
4308 if (freed) 4356 if (freed)
4309 goto repeat; 4357 goto repeat;
4310 *errp = -ENOSPC; 4358 *errp = -ENOSPC;
4359 }
4360
4361 if (*errp) {
4311 ac->ac_b_ex.fe_len = 0; 4362 ac->ac_b_ex.fe_len = 0;
4312 ar->len = 0; 4363 ar->len = 0;
4313 ext4_mb_show_ac(ac); 4364 ext4_mb_show_ac(ac);
4314 } 4365 }
4315
4316 ext4_mb_release_context(ac); 4366 ext4_mb_release_context(ac);
4317 4367out:
4318out2: 4368 if (ac)
4319 kmem_cache_free(ext4_ac_cachep, ac); 4369 kmem_cache_free(ext4_ac_cachep, ac);
4320out1:
4321 if (inquota && ar->len < inquota) 4370 if (inquota && ar->len < inquota)
4322 dquot_free_block(ar->inode, inquota - ar->len); 4371 dquot_free_block(ar->inode, inquota - ar->len);
4323out3:
4324 if (!ar->len) { 4372 if (!ar->len) {
4325 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4373 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4326 /* release all the reserved blocks if non delalloc */ 4374 /* release all the reserved blocks if non delalloc */
@@ -4352,6 +4400,7 @@ static noinline_for_stack int
4352ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4400ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4353 struct ext4_free_data *new_entry) 4401 struct ext4_free_data *new_entry)
4354{ 4402{
4403 ext4_group_t group = e4b->bd_group;
4355 ext4_grpblk_t block; 4404 ext4_grpblk_t block;
4356 struct ext4_free_data *entry; 4405 struct ext4_free_data *entry;
4357 struct ext4_group_info *db = e4b->bd_info; 4406 struct ext4_group_info *db = e4b->bd_info;
@@ -4384,9 +4433,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4384 else if (block >= (entry->start_blk + entry->count)) 4433 else if (block >= (entry->start_blk + entry->count))
4385 n = &(*n)->rb_right; 4434 n = &(*n)->rb_right;
4386 else { 4435 else {
4387 ext4_grp_locked_error(sb, e4b->bd_group, __func__, 4436 ext4_grp_locked_error(sb, group, 0,
4388 "Double free of blocks %d (%d %d)", 4437 ext4_group_first_block_no(sb, group) + block,
4389 block, entry->start_blk, entry->count); 4438 "Block already on to-be-freed list");
4390 return 0; 4439 return 0;
4391 } 4440 }
4392 } 4441 }
@@ -4444,7 +4493,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4444 struct super_block *sb = inode->i_sb; 4493 struct super_block *sb = inode->i_sb;
4445 struct ext4_allocation_context *ac = NULL; 4494 struct ext4_allocation_context *ac = NULL;
4446 struct ext4_group_desc *gdp; 4495 struct ext4_group_desc *gdp;
4447 struct ext4_super_block *es;
4448 unsigned long freed = 0; 4496 unsigned long freed = 0;
4449 unsigned int overflow; 4497 unsigned int overflow;
4450 ext4_grpblk_t bit; 4498 ext4_grpblk_t bit;
@@ -4463,7 +4511,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4463 } 4511 }
4464 4512
4465 sbi = EXT4_SB(sb); 4513 sbi = EXT4_SB(sb);
4466 es = EXT4_SB(sb)->s_es;
4467 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 4514 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4468 !ext4_data_block_valid(sbi, block, count)) { 4515 !ext4_data_block_valid(sbi, block, count)) {
4469 ext4_error(sb, "Freeing blocks not in datazone - " 4516 ext4_error(sb, "Freeing blocks not in datazone - "
@@ -4484,12 +4531,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4484 if (!bh) 4531 if (!bh)
4485 tbh = sb_find_get_block(inode->i_sb, 4532 tbh = sb_find_get_block(inode->i_sb,
4486 block + i); 4533 block + i);
4487 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4534 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4488 inode, tbh, block + i); 4535 inode, tbh, block + i);
4489 } 4536 }
4490 } 4537 }
4491 4538
4492 /* 4539 /*
4493 * We need to make sure we don't reuse the freed block until 4540 * We need to make sure we don't reuse the freed block until
4494 * after the transaction is committed, which we can do by 4541 * after the transaction is committed, which we can do by
4495 * treating the block as metadata, below. We make an 4542 * treating the block as metadata, below. We make an
@@ -4597,6 +4644,8 @@ do_more:
4597 mb_clear_bits(bitmap_bh->b_data, bit, count); 4644 mb_clear_bits(bitmap_bh->b_data, bit, count);
4598 mb_free_blocks(inode, &e4b, bit, count); 4645 mb_free_blocks(inode, &e4b, bit, count);
4599 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4647 if (test_opt(sb, DISCARD))
4648 ext4_issue_discard(sb, block_group, bit, count);
4600 } 4649 }
4601 4650
4602 ret = ext4_free_blks_count(sb, gdp) + count; 4651 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4610,7 +4659,7 @@ do_more:
4610 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4659 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4611 } 4660 }
4612 4661
4613 ext4_mb_release_desc(&e4b); 4662 ext4_mb_unload_buddy(&e4b);
4614 4663
4615 freed += count; 4664 freed += count;
4616 4665
@@ -4630,7 +4679,7 @@ do_more:
4630 put_bh(bitmap_bh); 4679 put_bh(bitmap_bh);
4631 goto do_more; 4680 goto do_more;
4632 } 4681 }
4633 sb->s_dirt = 1; 4682 ext4_mark_super_dirty(sb);
4634error_return: 4683error_return:
4635 if (freed) 4684 if (freed)
4636 dquot_free_block(inode, freed); 4685 dquot_free_block(inode, freed);
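
The ext4_mb_new_blocks() hunks above replace the out1/out2/out3 exit ladder with a single out: label (plus an errout: path for regular-allocator failures), relying on a NULL check before kmem_cache_free(). Below is a minimal userspace sketch of that single-exit cleanup idiom; the names (do_alloc, setup_ctx, struct ctx) are illustrative, not ext4's.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx { int initialized; };

static int setup_ctx(struct ctx *c)
{
	c->initialized = 1;
	return 0;
}

static int do_alloc(int *errp)
{
	struct ctx *c;

	c = malloc(sizeof(*c));
	if (!c) {
		*errp = -ENOMEM;
		goto out;	/* one exit label instead of out1/out2/out3 */
	}
	*errp = setup_ctx(c);
	if (*errp)
		goto out;

	/* ... the actual allocation work would run here ... */
out:
	/* free(NULL) is a no-op; the patch keeps an explicit "if (ac)"
	 * because kmem_cache_free() makes no such guarantee. */
	free(c);
	return *errp;
}

int main(void)
{
	int err = 0;

	printf("do_alloc -> %d (err=%d)\n", do_alloc(&err), err);
	return 0;
}

The payoff is that every early return funnels through one cleanup site, so adding a new failure case cannot leak the context or the quota reservation.
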
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..1765c2c50a9b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map built with the tmp inode. 376 * We have the extent map built with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ei->i_flags |= EXT4_EXTENTS_FL; 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
475 */ 475 */
476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
477 EXT4_FEATURE_INCOMPAT_EXTENTS) || 477 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
478 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 478 (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
479 return -EINVAL; 479 return -EINVAL;
480 480
481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
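
The migrate.c hunks belong to the tree-wide switch from open-coded i_flags masking to the ext4_{test,set,clear}_inode_flag() helpers, which use atomic bit operations so concurrent read-modify-write updates cannot lose flags. A rough userspace analogue using GCC/Clang atomic builtins follows; the helper names and bit numbers mirror ext4's flag layout, but this is a sketch, not the kernel's code.

#include <stdio.h>

enum { INODE_INDEX = 12, INODE_EXTENTS = 19 };	/* same bit positions as EXT4_INDEX_FL, EXT4_EXTENTS_FL */

struct inode { unsigned long flags; };

/* Atomic read-modify-write, like the kernel's set_bit()/clear_bit()/test_bit(). */
static void set_inode_flag(struct inode *i, int bit)
{
	__atomic_fetch_or(&i->flags, 1UL << bit, __ATOMIC_RELAXED);
}

static void clear_inode_flag(struct inode *i, int bit)
{
	__atomic_fetch_and(&i->flags, ~(1UL << bit), __ATOMIC_RELAXED);
}

static int test_inode_flag(const struct inode *i, int bit)
{
	return (__atomic_load_n(&i->flags, __ATOMIC_RELAXED) >> bit) & 1;
}

int main(void)
{
	struct inode ino = { 0 };

	/* The non-atomic form, ino.flags |= 1UL << bit, can lose a concurrent
	 * update between its load and its store; the atomic form cannot. */
	set_inode_flag(&ino, INODE_EXTENTS);
	printf("extents=%d index=%d\n",
	       test_inode_flag(&ino, INODE_EXTENTS),
	       test_inode_flag(&ino, INODE_INDEX));
	clear_inode_flag(&ino, INODE_EXTENTS);
	return 0;
}
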
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..5f1ed9fc913c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
148 */ 148 */
149static int 149static int
150mext_check_null_inode(struct inode *inode1, struct inode *inode2, 150mext_check_null_inode(struct inode *inode1, struct inode *inode2,
151 const char *function) 151 const char *function, unsigned int line)
152{ 152{
153 int ret = 0; 153 int ret = 0;
154 154
155 if (inode1 == NULL) { 155 if (inode1 == NULL) {
156 __ext4_error(inode2->i_sb, function, 156 __ext4_error(inode2->i_sb, function, line,
157 "Both inodes should not be NULL: " 157 "Both inodes should not be NULL: "
158 "inode1 NULL inode2 %lu", inode2->i_ino); 158 "inode1 NULL inode2 %lu", inode2->i_ino);
159 ret = -EIO; 159 ret = -EIO;
160 } else if (inode2 == NULL) { 160 } else if (inode2 == NULL) {
161 __ext4_error(inode1->i_sb, function, 161 __ext4_error(inode1->i_sb, function, line,
162 "Both inodes should not be NULL: " 162 "Both inodes should not be NULL: "
163 "inode1 %lu inode2 NULL", inode1->i_ino); 163 "inode1 %lu inode2 NULL", inode1->i_ino);
164 ret = -EIO; 164 ret = -EIO;
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
482 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
483 int ret; 483 int ret;
484 484
485 start_ext.ee_block = end_ext.ee_block = 0;
485 o_start = o_end = oext = orig_path[depth].p_ext; 486 o_start = o_end = oext = orig_path[depth].p_ext;
486 oext_alen = ext4_ext_get_actual_len(oext); 487 oext_alen = ext4_ext_get_actual_len(oext);
487 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
529 * new_ext |-------| 530 * new_ext |-------|
530 */ 531 */
531 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 532 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
532 ext4_error(orig_inode->i_sb, 533 EXT4_ERROR_INODE(orig_inode,
533 "new_ext_end(%u) should be less than or equal to " 534 "new_ext_end(%u) should be less than or equal to "
534 "oext->ee_block(%u) + oext_alen(%d) - 1", 535 "oext->ee_block(%u) + oext_alen(%d) - 1",
535 new_ext_end, le32_to_cpu(oext->ee_block), 536 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 while (1) { 693 while (1) {
693 /* The extent for donor must be found. */ 694 /* The extent for donor must be found. */
694 if (!dext) { 695 if (!dext) {
695 ext4_error(donor_inode->i_sb, 696 EXT4_ERROR_INODE(donor_inode,
696 "The extent for donor must be found"); 697 "The extent for donor must be found");
697 *err = -EIO; 698 *err = -EIO;
698 goto out; 699 goto out;
699 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 700 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
700 ext4_error(donor_inode->i_sb, 701 EXT4_ERROR_INODE(donor_inode,
701 "Donor offset(%u) and the first block of donor " 702 "Donor offset(%u) and the first block of donor "
702 "extent(%u) should be equal", 703 "extent(%u) should be equal",
703 donor_off, 704 donor_off,
@@ -959,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
959 return -EINVAL; 960 return -EINVAL;
960 } 961 }
961 962
963 if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
964 return -EPERM;
965
962 /* Ext4 move extent does not support swapfile */ 966 /* Ext4 move extent does not support swapfile */
963 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { 967 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
964 ext4_debug("ext4 move extent: The argument files should " 968 ext4_debug("ext4 move extent: The argument files should "
@@ -976,11 +980,11 @@ mext_check_arguments(struct inode *orig_inode,
976 } 980 }
977 981
978 /* Ext4 move extent supports only extent based file */ 982 /* Ext4 move extent supports only extent based file */
979 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 983 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
980 ext4_debug("ext4 move extent: orig file is not extents " 984 ext4_debug("ext4 move extent: orig file is not extents "
981 "based file [ino:orig %lu]\n", orig_inode->i_ino); 985 "based file [ino:orig %lu]\n", orig_inode->i_ino);
982 return -EOPNOTSUPP; 986 return -EOPNOTSUPP;
983 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { 987 } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
984 ext4_debug("ext4 move extent: donor file is not extents " 988 ext4_debug("ext4 move extent: donor file is not extents "
985 "based file [ino:donor %lu]\n", donor_inode->i_ino); 989 "based file [ino:donor %lu]\n", donor_inode->i_ino);
986 return -EOPNOTSUPP; 990 return -EOPNOTSUPP;
@@ -1080,7 +1084,7 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1080 1084
1081 BUG_ON(inode1 == NULL && inode2 == NULL); 1085 BUG_ON(inode1 == NULL && inode2 == NULL);
1082 1086
1083 ret = mext_check_null_inode(inode1, inode2, __func__); 1087 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1084 if (ret < 0) 1088 if (ret < 0)
1085 goto out; 1089 goto out;
1086 1090
@@ -1117,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1117 1121
1118 BUG_ON(inode1 == NULL && inode2 == NULL); 1122 BUG_ON(inode1 == NULL && inode2 == NULL);
1119 1123
1120 ret = mext_check_null_inode(inode1, inode2, __func__); 1124 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1121 if (ret < 0) 1125 if (ret < 0)
1122 goto out; 1126 goto out;
1123 1127
@@ -1354,7 +1358,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1354 if (ret1 < 0) 1358 if (ret1 < 0)
1355 break; 1359 break;
1356 if (*moved_len > len) { 1360 if (*moved_len > len) {
1357 ext4_error(orig_inode->i_sb, 1361 EXT4_ERROR_INODE(orig_inode,
1358 "We replaced blocks too much! " 1362 "We replaced blocks too much! "
1359 "sum of replaced: %llu requested: %llu", 1363 "sum of replaced: %llu requested: %llu",
1360 *moved_len, len); 1364 *moved_len, len);
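
The recurring pattern in the move_extent.c hunks is threading __func__ and __LINE__ into the error helpers so each report names its exact call site; EXT4_ERROR_INODE and mext_check_null_inode() now both carry a line argument. A compilable sketch of the macro technique, with made-up names (report_error, report_error_at):

#include <stdarg.h>
#include <stdio.h>

static void report_error_at(const char *function, unsigned int line,
			    const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "fs error: %s:%u: ", function, line);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fputc('\n', stderr);
}

/* The call site is captured automatically at expansion time,
 * as EXT4_ERROR_INODE does with __func__ and __LINE__. */
#define report_error(...) report_error_at(__func__, __LINE__, __VA_ARGS__)

int main(void)
{
	report_error("donor offset %u and extent block %u should be equal", 7, 9);
	return 0;
}

Because the macro expands at the caller, the function and line in the message identify the failing check rather than the shared helper.
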
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..314c0d3b3fa9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
179static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 179static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
180 struct inode *inode); 180 struct inode *inode);
181 181
182unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
183{
184 unsigned len = le16_to_cpu(dlen);
185
186 if (len == EXT4_MAX_REC_LEN || len == 0)
187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16);
189}
190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
194 BUG();
195 if (len < 65536)
196 return cpu_to_le16(len);
197 if (len == blocksize) {
198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else
201 return cpu_to_le16(0);
202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
204}
205
206/* 182/*
207 * p is at least 6 bytes before the end of page 183 * p is at least 6 bytes before the end of page
208 */ 184 */
@@ -349,7 +325,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
349 brelse(bh); 325 brelse(bh);
350 } 326 }
351 if (bcount) 327 if (bcount)
352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 328 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
353 levels ? "" : " ", names, space/bcount, 329 levels ? "" : " ", names, space/bcount,
354 (space/bcount)*100/blocksize); 330 (space/bcount)*100/blocksize);
355 return (struct stats) { names, space, bcount}; 331 return (struct stats) { names, space, bcount};
@@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
605 dir->i_sb->s_blocksize - 581 dir->i_sb->s_blocksize -
606 EXT4_DIR_REC_LEN(0)); 582 EXT4_DIR_REC_LEN(0));
607 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
608 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 584 if (!ext4_check_dir_entry(dir, de, bh,
609 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
610 +((char *)de - bh->b_data))) { 586 +((char *)de - bh->b_data))) {
611 /* On error, skip the f_pos to the next block. */ 587 /* On error, skip the f_pos to the next block. */
@@ -653,10 +629,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
653 int ret, err; 629 int ret, err;
654 __u32 hashval; 630 __u32 hashval;
655 631
656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 632 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
657 start_hash, start_minor_hash)); 633 start_hash, start_minor_hash));
658 dir = dir_file->f_path.dentry->d_inode; 634 dir = dir_file->f_path.dentry->d_inode;
659 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 635 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 636 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
661 if (hinfo.hash_version <= DX_HASH_TEA) 637 if (hinfo.hash_version <= DX_HASH_TEA)
662 hinfo.hash_version += 638 hinfo.hash_version +=
@@ -801,7 +777,7 @@ static void ext4_update_dx_flag(struct inode *inode)
801{ 777{
802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 778 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
803 EXT4_FEATURE_COMPAT_DIR_INDEX)) 779 EXT4_FEATURE_COMPAT_DIR_INDEX))
804 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; 780 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
805} 781}
806 782
807/* 783/*
@@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
844 if ((char *) de + namelen <= dlimit && 820 if ((char *) de + namelen <= dlimit &&
845 ext4_match (namelen, name, de)) { 821 ext4_match (namelen, name, de)) {
846 /* found a match - just to be sure, do a full check */ 822 /* found a match - just to be sure, do a full check */
847 if (!ext4_check_dir_entry("ext4_find_entry", 823 if (!ext4_check_dir_entry(dir, de, bh, offset))
848 dir, de, bh, offset))
849 return -1; 824 return -1;
850 *res_dir = de; 825 *res_dir = de;
851 return 1; 826 return 1;
@@ -943,8 +918,8 @@ restart:
943 wait_on_buffer(bh); 918 wait_on_buffer(bh);
944 if (!buffer_uptodate(bh)) { 919 if (!buffer_uptodate(bh)) {
945 /* read error, skip block & hope for the best */ 920 /* read error, skip block & hope for the best */
946 ext4_error(sb, "reading directory #%lu offset %lu", 921 EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
947 dir->i_ino, (unsigned long)block); 922 (unsigned long) block);
948 brelse(bh); 923 brelse(bh);
949 goto next; 924 goto next;
950 } 925 }
@@ -1019,7 +994,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1019 int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) 994 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1020 + ((char *) de - bh->b_data); 995 + ((char *) de - bh->b_data);
1021 996
1022 if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { 997 if (!ext4_check_dir_entry(dir, de, bh, off)) {
1023 brelse(bh); 998 brelse(bh);
1024 *err = ERR_BAD_DX_DIR; 999 *err = ERR_BAD_DX_DIR;
1025 goto errout; 1000 goto errout;
@@ -1066,15 +1041,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1066 __u32 ino = le32_to_cpu(de->inode); 1041 __u32 ino = le32_to_cpu(de->inode);
1067 brelse(bh); 1042 brelse(bh);
1068 if (!ext4_valid_inum(dir->i_sb, ino)) { 1043 if (!ext4_valid_inum(dir->i_sb, ino)) {
1069 ext4_error(dir->i_sb, "bad inode number: %u", ino); 1044 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1070 return ERR_PTR(-EIO); 1045 return ERR_PTR(-EIO);
1071 } 1046 }
1072 inode = ext4_iget(dir->i_sb, ino); 1047 inode = ext4_iget(dir->i_sb, ino);
1073 if (unlikely(IS_ERR(inode))) { 1048 if (unlikely(IS_ERR(inode))) {
1074 if (PTR_ERR(inode) == -ESTALE) { 1049 if (PTR_ERR(inode) == -ESTALE) {
1075 ext4_error(dir->i_sb, 1050 EXT4_ERROR_INODE(dir,
1076 "deleted inode referenced: %u", 1051 "deleted inode referenced: %u",
1077 ino); 1052 ino);
1078 return ERR_PTR(-EIO); 1053 return ERR_PTR(-EIO);
1079 } else { 1054 } else {
1080 return ERR_CAST(inode); 1055 return ERR_CAST(inode);
@@ -1088,7 +1063,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1088struct dentry *ext4_get_parent(struct dentry *child) 1063struct dentry *ext4_get_parent(struct dentry *child)
1089{ 1064{
1090 __u32 ino; 1065 __u32 ino;
1091 struct inode *inode;
1092 static const struct qstr dotdot = { 1066 static const struct qstr dotdot = {
1093 .name = "..", 1067 .name = "..",
1094 .len = 2, 1068 .len = 2,
@@ -1097,15 +1071,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
1097 struct buffer_head *bh; 1071 struct buffer_head *bh;
1098 1072
1099 bh = ext4_find_entry(child->d_inode, &dotdot, &de); 1073 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1100 inode = NULL;
1101 if (!bh) 1074 if (!bh)
1102 return ERR_PTR(-ENOENT); 1075 return ERR_PTR(-ENOENT);
1103 ino = le32_to_cpu(de->inode); 1076 ino = le32_to_cpu(de->inode);
1104 brelse(bh); 1077 brelse(bh);
1105 1078
1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1079 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1107 ext4_error(child->d_inode->i_sb, 1080 EXT4_ERROR_INODE(child->d_inode,
1108 "bad inode number: %u", ino); 1081 "bad parent inode number: %u", ino);
1109 return ERR_PTR(-EIO); 1082 return ERR_PTR(-EIO);
1110 } 1083 }
1111 1084
@@ -1141,7 +1114,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1141 unsigned rec_len = 0; 1114 unsigned rec_len = 0;
1142 1115
1143 while (count--) { 1116 while (count--) {
1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1117 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1145 (from + (map->offs<<2)); 1118 (from + (map->offs<<2));
1146 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1119 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1147 memcpy (to, de, rec_len); 1120 memcpy (to, de, rec_len);
@@ -1305,8 +1278,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1305 de = (struct ext4_dir_entry_2 *)bh->b_data; 1278 de = (struct ext4_dir_entry_2 *)bh->b_data;
1306 top = bh->b_data + blocksize - reclen; 1279 top = bh->b_data + blocksize - reclen;
1307 while ((char *) de <= top) { 1280 while ((char *) de <= top) {
1308 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1281 if (!ext4_check_dir_entry(dir, de, bh, offset))
1309 bh, offset))
1310 return -EIO; 1282 return -EIO;
1311 if (ext4_match(namelen, name, de)) 1283 if (ext4_match(namelen, name, de))
1312 return -EEXIST; 1284 return -EEXIST;
@@ -1404,9 +1376,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1404 de = (struct ext4_dir_entry_2 *)((char *)fde + 1376 de = (struct ext4_dir_entry_2 *)((char *)fde +
1405 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1377 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1406 if ((char *) de >= (((char *) root) + blocksize)) { 1378 if ((char *) de >= (((char *) root) + blocksize)) {
1407 ext4_error(dir->i_sb, 1379 EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
1408 "invalid rec_len for '..' in inode %lu",
1409 dir->i_ino);
1410 brelse(bh); 1380 brelse(bh);
1411 return -EIO; 1381 return -EIO;
1412 } 1382 }
@@ -1418,7 +1388,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1418 brelse(bh); 1388 brelse(bh);
1419 return retval; 1389 return retval;
1420 } 1390 }
1421 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1391 ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
1422 data1 = bh2->b_data; 1392 data1 = bh2->b_data;
1423 1393
1424 memcpy (data1, de, len); 1394 memcpy (data1, de, len);
@@ -1491,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1491 retval = ext4_dx_add_entry(handle, dentry, inode); 1461 retval = ext4_dx_add_entry(handle, dentry, inode);
1492 if (!retval || (retval != ERR_BAD_DX_DIR)) 1462 if (!retval || (retval != ERR_BAD_DX_DIR))
1493 return retval; 1463 return retval;
1494 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; 1464 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1495 dx_fallback++; 1465 dx_fallback++;
1496 ext4_mark_inode_dirty(handle, dir); 1466 ext4_mark_inode_dirty(handle, dir);
1497 } 1467 }
@@ -1519,6 +1489,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1489 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1490 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh); 1491 brelse(bh);
1492 if (retval == 0)
1493 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1522 return retval; 1494 return retval;
1523} 1495}
1524 1496
@@ -1673,7 +1645,7 @@ static int ext4_delete_entry(handle_t *handle,
1673 pde = NULL; 1645 pde = NULL;
1674 de = (struct ext4_dir_entry_2 *) bh->b_data; 1646 de = (struct ext4_dir_entry_2 *) bh->b_data;
1675 while (i < bh->b_size) { 1647 while (i < bh->b_size) {
1676 if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) 1648 if (!ext4_check_dir_entry(dir, de, bh, i))
1677 return -EIO; 1649 return -EIO;
1678 if (de == de_del) { 1650 if (de == de_del) {
1679 BUFFER_TRACE(bh, "get_write_access"); 1651 BUFFER_TRACE(bh, "get_write_access");
@@ -1915,9 +1887,8 @@ static int empty_dir(struct inode *inode)
1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1887 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1888 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1917 if (err) 1889 if (err)
1918 ext4_error(inode->i_sb, 1890 EXT4_ERROR_INODE(inode,
1919 "error %d reading directory #%lu offset 0", 1891 "error %d reading directory lblock 0", err);
1920 err, inode->i_ino);
1921 else 1892 else
1922 ext4_warning(inode->i_sb, 1893 ext4_warning(inode->i_sb,
1923 "bad directory (dir #%lu) - no data block", 1894 "bad directory (dir #%lu) - no data block",
@@ -1941,23 +1912,23 @@ static int empty_dir(struct inode *inode)
1941 de = ext4_next_entry(de1, sb->s_blocksize); 1912 de = ext4_next_entry(de1, sb->s_blocksize);
1942 while (offset < inode->i_size) { 1913 while (offset < inode->i_size) {
1943 if (!bh || 1914 if (!bh ||
1944 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1915 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1916 unsigned int lblock;
1945 err = 0; 1917 err = 0;
1946 brelse(bh); 1918 brelse(bh);
1947 bh = ext4_bread(NULL, inode, 1919 lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1920 bh = ext4_bread(NULL, inode, lblock, 0, &err);
1949 if (!bh) { 1921 if (!bh) {
1950 if (err) 1922 if (err)
1951 ext4_error(sb, 1923 EXT4_ERROR_INODE(inode,
1952 "error %d reading directory" 1924 "error %d reading directory "
1953 " #%lu offset %u", 1925 "lblock %u", err, lblock);
1954 err, inode->i_ino, offset);
1955 offset += sb->s_blocksize; 1926 offset += sb->s_blocksize;
1956 continue; 1927 continue;
1957 } 1928 }
1958 de = (struct ext4_dir_entry_2 *) bh->b_data; 1929 de = (struct ext4_dir_entry_2 *) bh->b_data;
1959 } 1930 }
1960 if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { 1931 if (!ext4_check_dir_entry(inode, de, bh, offset)) {
1961 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1932 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1962 sb->s_blocksize); 1933 sb->s_blocksize);
1963 offset = (offset | (sb->s_blocksize - 1)) + 1; 1934 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2297,7 +2268,7 @@ retry:
2297 } 2268 }
2298 } else { 2269 } else {
2299 /* clear the extent format for fast symlink */ 2270 /* clear the extent format for fast symlink */
2300 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2271 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2301 inode->i_op = &ext4_fast_symlink_inode_operations; 2272 inode->i_op = &ext4_fast_symlink_inode_operations;
2302 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 2273 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2303 inode->i_size = l-1; 2274 inode->i_size = l-1;
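
The block deleted at the top of namei.c is the directory rec_len codec; the deletion only relocates it (the merged tree keeps these helpers as inlines in a header). The trick: directory entries are 4-byte aligned, so the low two bits of a record length are always zero on disk and can hold bits 16-17 instead, letting a 16-bit field describe blocks up to 256 KiB. A standalone version of the same arithmetic, where MAX_REC_LEN stands in for EXT4_MAX_REC_LEN and the little-endian conversion is omitted:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* 65535, the on-disk "whole block" marker */

/* Same arithmetic as the removed ext4_rec_len_to_disk(): rec_len is a
 * multiple of 4, so bits 0-1 are free to carry bits 16-17 of large lengths. */
static uint16_t rec_len_to_disk(unsigned len, unsigned blocksize)
{
	assert(len <= blocksize && blocksize <= (1 << 18) && !(len & 3));
	if (len < 65536)
		return (uint16_t)len;
	if (len == blocksize)
		return blocksize == 65536 ? MAX_REC_LEN : 0;
	return (uint16_t)((len & 65532) | ((len >> 16) & 3));
}

static unsigned rec_len_from_disk(uint16_t dlen, unsigned blocksize)
{
	unsigned len = dlen;

	if (len == MAX_REC_LEN || len == 0)
		return blocksize;
	return (len & 65532) | ((len & 3) << 16);
}

int main(void)
{
	unsigned blocksize = 1 << 17;	/* 128 KiB block, needs the folded bits */
	unsigned len = 70000;		/* already 4-byte aligned */

	printf("%u -> 0x%04x -> %u\n", len,
	       rec_len_to_disk(len, blocksize),
	       rec_len_from_disk(rec_len_to_disk(len, blocksize), blocksize));
	return 0;
}

Running this prints "70000 -> 0x1171 -> 70000": the length survives the round trip even though it does not fit in 16 bits directly.
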
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..ca5c8aa00a2f 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
911 percpu_counter_add(&sbi->s_freeinodes_counter, 911 percpu_counter_add(&sbi->s_freeinodes_counter,
912 EXT4_INODES_PER_GROUP(sb)); 912 EXT4_INODES_PER_GROUP(sb));
913 913
914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
915 sbi->s_log_groups_per_flex) {
915 ext4_group_t flex_group; 916 ext4_group_t flex_group;
916 flex_group = ext4_flex_group(sbi, input->group); 917 flex_group = ext4_flex_group(sbi, input->group);
917 atomic_add(input->free_blocks_count, 918 atomic_add(input->free_blocks_count,
@@ -920,8 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
920 &sbi->s_flex_groups[flex_group].free_inodes); 921 &sbi->s_flex_groups[flex_group].free_inodes);
921 } 922 }
922 923
923 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 924 ext4_handle_dirty_super(handle, sb);
924 sb->s_dirt = 1;
925 925
926exit_journal: 926exit_journal:
927 mutex_unlock(&sbi->s_resize_lock); 927 mutex_unlock(&sbi->s_resize_lock);
@@ -952,7 +952,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
952 ext4_fsblk_t n_blocks_count) 952 ext4_fsblk_t n_blocks_count)
953{ 953{
954 ext4_fsblk_t o_blocks_count; 954 ext4_fsblk_t o_blocks_count;
955 ext4_group_t o_groups_count;
956 ext4_grpblk_t last; 955 ext4_grpblk_t last;
957 ext4_grpblk_t add; 956 ext4_grpblk_t add;
958 struct buffer_head *bh; 957 struct buffer_head *bh;
@@ -964,7 +963,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
964 * yet: we're going to revalidate es->s_blocks_count after 963 * yet: we're going to revalidate es->s_blocks_count after
965 * taking the s_resize_lock below. */ 964 * taking the s_resize_lock below. */
966 o_blocks_count = ext4_blocks_count(es); 965 o_blocks_count = ext4_blocks_count(es);
967 o_groups_count = EXT4_SB(sb)->s_groups_count;
968 966
969 if (test_opt(sb, DEBUG)) 967 if (test_opt(sb, DEBUG))
970 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", 968 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
@@ -1044,13 +1042,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1044 goto exit_put; 1042 goto exit_put;
1045 } 1043 }
1046 ext4_blocks_count_set(es, o_blocks_count + add); 1044 ext4_blocks_count_set(es, o_blocks_count + add);
1047 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1048 sb->s_dirt = 1;
1049 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1045 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1050 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1046 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1051 o_blocks_count + add); 1047 o_blocks_count + add);
1052 /* We add the blocks to the bitmap and set the group need init bit */ 1048 /* We add the blocks to the bitmap and set the group need init bit */
1053 ext4_add_groupblocks(handle, sb, o_blocks_count, add); 1049 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1050 ext4_handle_dirty_super(handle, sb);
1054 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1051 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1055 o_blocks_count + add); 1052 o_blocks_count + add);
1056 if ((err = ext4_journal_stop(handle))) 1053 if ((err = ext4_journal_stop(handle)))
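
The ext4_group_add() hunk above adds an s_log_groups_per_flex guard: a filesystem can advertise the FLEX_BG feature while grouping is effectively disabled (a log of zero), in which case the per-flex counters were never set up and must not be touched. ext4_flex_group() itself reduces to a shift by that log. A small illustration of the mapping, with made-up values:

#include <stdio.h>

/* A flex group is 2^log consecutive block groups whose metadata is
 * clustered together; mapping a group to its flex group is one shift. */
static unsigned flex_group(unsigned group, unsigned log_groups_per_flex)
{
	return group >> log_groups_per_flex;
}

int main(void)
{
	unsigned log = 4;	/* 16 block groups per flex group, illustrative */

	for (unsigned group = 0; group < 48; group += 16)
		printf("block group %2u -> flex group %u\n",
		       group, flex_group(group, log));

	/* With log == 0 every group would be its own flex group; the hunk
	 * above skips the per-flex accounting entirely in that case. */
	return 0;
}
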
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..26147746c272 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,13 +241,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
241 if (sb->s_flags & MS_RDONLY) 241 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 242 return ERR_PTR(-EROFS);
243 243
244 vfs_check_frozen(sb, SB_FREEZE_TRANS);
244 /* Special case here: if the journal has aborted behind our 245 /* Special case here: if the journal has aborted behind our
245 * backs (eg. EIO in the commit thread), then we still need to 246 * backs (eg. EIO in the commit thread), then we still need to
246 * take the FS itself readonly cleanly. */ 247 * take the FS itself readonly cleanly. */
247 journal = EXT4_SB(sb)->s_journal; 248 journal = EXT4_SB(sb)->s_journal;
248 if (journal) { 249 if (journal) {
249 if (is_journal_aborted(journal)) { 250 if (is_journal_aborted(journal)) {
250 ext4_abort(sb, __func__, "Detected aborted journal"); 251 ext4_abort(sb, "Detected aborted journal");
251 return ERR_PTR(-EROFS); 252 return ERR_PTR(-EROFS);
252 } 253 }
253 return jbd2_journal_start(journal, nblocks); 254 return jbd2_journal_start(journal, nblocks);
@@ -261,7 +262,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
261 * that sync() will call the filesystem's write_super callback if 262 * that sync() will call the filesystem's write_super callback if
262 * appropriate. 263 * appropriate.
263 */ 264 */
264int __ext4_journal_stop(const char *where, handle_t *handle) 265int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
265{ 266{
266 struct super_block *sb; 267 struct super_block *sb;
267 int err; 268 int err;
@@ -278,12 +279,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
278 if (!err) 279 if (!err)
279 err = rc; 280 err = rc;
280 if (err) 281 if (err)
281 __ext4_std_error(sb, where, err); 282 __ext4_std_error(sb, where, line, err);
282 return err; 283 return err;
283} 284}
284 285
285void ext4_journal_abort_handle(const char *caller, const char *err_fn, 286void ext4_journal_abort_handle(const char *caller, unsigned int line,
286 struct buffer_head *bh, handle_t *handle, int err) 287 const char *err_fn, struct buffer_head *bh,
288 handle_t *handle, int err)
287{ 289{
288 char nbuf[16]; 290 char nbuf[16];
289 const char *errstr = ext4_decode_error(NULL, err, nbuf); 291 const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -299,12 +301,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
299 if (is_handle_aborted(handle)) 301 if (is_handle_aborted(handle))
300 return; 302 return;
301 303
302 printk(KERN_ERR "%s: aborting transaction: %s in %s\n", 304 printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
303 caller, errstr, err_fn); 305 caller, line, errstr, err_fn);
304 306
305 jbd2_journal_abort_handle(handle); 307 jbd2_journal_abort_handle(handle);
306} 308}
307 309
310static void __save_error_info(struct super_block *sb, const char *func,
311 unsigned int line)
312{
313 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
314
315 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
316 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
317 es->s_last_error_time = cpu_to_le32(get_seconds());
318 strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
319 es->s_last_error_line = cpu_to_le32(line);
320 if (!es->s_first_error_time) {
321 es->s_first_error_time = es->s_last_error_time;
322 strncpy(es->s_first_error_func, func,
323 sizeof(es->s_first_error_func));
324 es->s_first_error_line = cpu_to_le32(line);
325 es->s_first_error_ino = es->s_last_error_ino;
326 es->s_first_error_block = es->s_last_error_block;
327 }
328 /*
329 * Start the daily error reporting function if it hasn't been
330 * started already
331 */
332 if (!es->s_error_count)
333 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
334 es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
335}
336
337static void save_error_info(struct super_block *sb, const char *func,
338 unsigned int line)
339{
340 __save_error_info(sb, func, line);
341 ext4_commit_super(sb, 1);
342}
343
344
308/* Deal with the reporting of failure conditions on a filesystem such as 345/* Deal with the reporting of failure conditions on a filesystem such as
309 * inconsistencies detected or read IO failures. 346 * inconsistencies detected or read IO failures.
310 * 347 *
@@ -322,11 +359,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
322 359
323static void ext4_handle_error(struct super_block *sb) 360static void ext4_handle_error(struct super_block *sb)
324{ 361{
325 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
326
327 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
328 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
329
330 if (sb->s_flags & MS_RDONLY) 362 if (sb->s_flags & MS_RDONLY)
331 return; 363 return;
332 364
@@ -341,19 +373,19 @@ static void ext4_handle_error(struct super_block *sb)
341 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 373 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
342 sb->s_flags |= MS_RDONLY; 374 sb->s_flags |= MS_RDONLY;
343 } 375 }
344 ext4_commit_super(sb, 1);
345 if (test_opt(sb, ERRORS_PANIC)) 376 if (test_opt(sb, ERRORS_PANIC))
346 panic("EXT4-fs (device %s): panic forced after error\n", 377 panic("EXT4-fs (device %s): panic forced after error\n",
347 sb->s_id); 378 sb->s_id);
348} 379}
349 380
350void __ext4_error(struct super_block *sb, const char *function, 381void __ext4_error(struct super_block *sb, const char *function,
351 const char *fmt, ...) 382 unsigned int line, const char *fmt, ...)
352{ 383{
353 va_list args; 384 va_list args;
354 385
355 va_start(args, fmt); 386 va_start(args, fmt);
356 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 387 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
388 sb->s_id, function, line, current->comm);
357 vprintk(fmt, args); 389 vprintk(fmt, args);
358 printk("\n"); 390 printk("\n");
359 va_end(args); 391 va_end(args);
@@ -361,14 +393,22 @@ void __ext4_error(struct super_block *sb, const char *function,
361 ext4_handle_error(sb); 393 ext4_handle_error(sb);
362} 394}
363 395
364void ext4_error_inode(const char *function, struct inode *inode, 396void ext4_error_inode(struct inode *inode, const char *function,
397 unsigned int line, ext4_fsblk_t block,
365 const char *fmt, ...) 398 const char *fmt, ...)
366{ 399{
367 va_list args; 400 va_list args;
401 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
368 402
403 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
404 es->s_last_error_block = cpu_to_le64(block);
405 save_error_info(inode->i_sb, function, line);
369 va_start(args, fmt); 406 va_start(args, fmt);
370 printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", 407 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
371 inode->i_sb->s_id, function, inode->i_ino, current->comm); 408 inode->i_sb->s_id, function, line, inode->i_ino);
409 if (block)
410 printk("block %llu: ", block);
411 printk("comm %s: ", current->comm);
372 vprintk(fmt, args); 412 vprintk(fmt, args);
373 printk("\n"); 413 printk("\n");
374 va_end(args); 414 va_end(args);
@@ -376,20 +416,26 @@ void ext4_error_inode(const char *function, struct inode *inode,
376 ext4_handle_error(inode->i_sb); 416 ext4_handle_error(inode->i_sb);
377} 417}
378 418
379void ext4_error_file(const char *function, struct file *file, 419void ext4_error_file(struct file *file, const char *function,
380 const char *fmt, ...) 420 unsigned int line, const char *fmt, ...)
381{ 421{
382 va_list args; 422 va_list args;
423 struct ext4_super_block *es;
383 struct inode *inode = file->f_dentry->d_inode; 424 struct inode *inode = file->f_dentry->d_inode;
384 char pathname[80], *path; 425 char pathname[80], *path;
385 426
427 es = EXT4_SB(inode->i_sb)->s_es;
428 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
429 save_error_info(inode->i_sb, function, line);
386 va_start(args, fmt); 430 va_start(args, fmt);
387 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 431 path = d_path(&(file->f_path), pathname, sizeof(pathname));
388 if (!path) 432 if (!path)
389 path = "(unknown)"; 433 path = "(unknown)";
390 printk(KERN_CRIT 434 printk(KERN_CRIT
391 "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", 435 "EXT4-fs error (device %s): %s:%d: inode #%lu "
392 inode->i_sb->s_id, function, inode->i_ino, current->comm, path); 436 "(comm %s path %s): ",
437 inode->i_sb->s_id, function, line, inode->i_ino,
438 current->comm, path);
393 vprintk(fmt, args); 439 vprintk(fmt, args);
394 printk("\n"); 440 printk("\n");
395 va_end(args); 441 va_end(args);
@@ -434,7 +480,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
434/* __ext4_std_error decodes expected errors from journaling functions 480/* __ext4_std_error decodes expected errors from journaling functions
435 * automatically and invokes the appropriate error response. */ 481 * automatically and invokes the appropriate error response. */
436 482
437void __ext4_std_error(struct super_block *sb, const char *function, int errno) 483void __ext4_std_error(struct super_block *sb, const char *function,
484 unsigned int line, int errno)
438{ 485{
439 char nbuf[16]; 486 char nbuf[16];
440 const char *errstr; 487 const char *errstr;
@@ -447,8 +494,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
447 return; 494 return;
448 495
449 errstr = ext4_decode_error(sb, errno, nbuf); 496 errstr = ext4_decode_error(sb, errno, nbuf);
450 printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", 497 printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
451 sb->s_id, function, errstr); 498 sb->s_id, function, line, errstr);
499 save_error_info(sb, function, line);
452 500
453 ext4_handle_error(sb); 501 ext4_handle_error(sb);
454} 502}
@@ -463,29 +511,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
463 * case we take the easy way out and panic immediately. 511 * case we take the easy way out and panic immediately.
464 */ 512 */
465 513
466void ext4_abort(struct super_block *sb, const char *function, 514void __ext4_abort(struct super_block *sb, const char *function,
467 const char *fmt, ...) 515 unsigned int line, const char *fmt, ...)
468{ 516{
469 va_list args; 517 va_list args;
470 518
519 save_error_info(sb, function, line);
471 va_start(args, fmt); 520 va_start(args, fmt);
472 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 521 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
522 function, line);
473 vprintk(fmt, args); 523 vprintk(fmt, args);
474 printk("\n"); 524 printk("\n");
475 va_end(args); 525 va_end(args);
476 526
527 if ((sb->s_flags & MS_RDONLY) == 0) {
528 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
529 sb->s_flags |= MS_RDONLY;
530 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
531 if (EXT4_SB(sb)->s_journal)
532 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
533 save_error_info(sb, function, line);
534 }
477 if (test_opt(sb, ERRORS_PANIC)) 535 if (test_opt(sb, ERRORS_PANIC))
478 panic("EXT4-fs panic from previous error\n"); 536 panic("EXT4-fs panic from previous error\n");
479
480 if (sb->s_flags & MS_RDONLY)
481 return;
482
483 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
484 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
485 sb->s_flags |= MS_RDONLY;
486 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
487 if (EXT4_SB(sb)->s_journal)
488 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
489} 537}
490 538
491void ext4_msg (struct super_block * sb, const char *prefix, 539void ext4_msg (struct super_block * sb, const char *prefix,
@@ -501,38 +549,47 @@ void ext4_msg (struct super_block * sb, const char *prefix,
501} 549}
502 550
503void __ext4_warning(struct super_block *sb, const char *function, 551void __ext4_warning(struct super_block *sb, const char *function,
504 const char *fmt, ...) 552 unsigned int line, const char *fmt, ...)
505{ 553{
506 va_list args; 554 va_list args;
507 555
508 va_start(args, fmt); 556 va_start(args, fmt);
509 printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ", 557 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
510 sb->s_id, function); 558 sb->s_id, function, line);
511 vprintk(fmt, args); 559 vprintk(fmt, args);
512 printk("\n"); 560 printk("\n");
513 va_end(args); 561 va_end(args);
514} 562}
515 563
516void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, 564void __ext4_grp_locked_error(const char *function, unsigned int line,
517 const char *function, const char *fmt, ...) 565 struct super_block *sb, ext4_group_t grp,
566 unsigned long ino, ext4_fsblk_t block,
567 const char *fmt, ...)
518__releases(bitlock) 568__releases(bitlock)
519__acquires(bitlock) 569__acquires(bitlock)
520{ 570{
521 va_list args; 571 va_list args;
522 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 572 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
523 573
574 es->s_last_error_ino = cpu_to_le32(ino);
575 es->s_last_error_block = cpu_to_le64(block);
576 __save_error_info(sb, function, line);
524 va_start(args, fmt); 577 va_start(args, fmt);
525 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 578 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
579 sb->s_id, function, line, grp);
580 if (ino)
581 printk("inode %lu: ", ino);
582 if (block)
583 printk("block %llu:", (unsigned long long) block);
526 vprintk(fmt, args); 584 vprintk(fmt, args);
527 printk("\n"); 585 printk("\n");
528 va_end(args); 586 va_end(args);
529 587
530 if (test_opt(sb, ERRORS_CONT)) { 588 if (test_opt(sb, ERRORS_CONT)) {
531 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
532 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
533 ext4_commit_super(sb, 0); 589 ext4_commit_super(sb, 0);
534 return; 590 return;
535 } 591 }
592
536 ext4_unlock_group(sb, grp); 593 ext4_unlock_group(sb, grp);
537 ext4_handle_error(sb); 594 ext4_handle_error(sb);
538 /* 595 /*
@@ -645,6 +702,8 @@ static void ext4_put_super(struct super_block *sb)
645 struct ext4_super_block *es = sbi->s_es; 702 struct ext4_super_block *es = sbi->s_es;
646 int i, err; 703 int i, err;
647 704
705 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
706
648 flush_workqueue(sbi->dio_unwritten_wq); 707 flush_workqueue(sbi->dio_unwritten_wq);
649 destroy_workqueue(sbi->dio_unwritten_wq); 708 destroy_workqueue(sbi->dio_unwritten_wq);
650 709
@@ -657,8 +716,7 @@ static void ext4_put_super(struct super_block *sb)
657 err = jbd2_journal_destroy(sbi->s_journal); 716 err = jbd2_journal_destroy(sbi->s_journal);
658 sbi->s_journal = NULL; 717 sbi->s_journal = NULL;
659 if (err < 0) 718 if (err < 0)
660 ext4_abort(sb, __func__, 719 ext4_abort(sb, "Couldn't clean up the journal");
661 "Couldn't clean up the journal");
662 } 720 }
663 721
664 ext4_release_system_zone(sb); 722 ext4_release_system_zone(sb);
@@ -810,8 +868,10 @@ static void destroy_inodecache(void)
810 kmem_cache_destroy(ext4_inode_cachep); 868 kmem_cache_destroy(ext4_inode_cachep);
811} 869}
812 870
813static void ext4_clear_inode(struct inode *inode) 871void ext4_clear_inode(struct inode *inode)
814{ 872{
873 invalidate_inode_buffers(inode);
874 end_writeback(inode);
815 dquot_drop(inode); 875 dquot_drop(inode);
816 ext4_discard_preallocations(inode); 876 ext4_discard_preallocations(inode);
817 if (EXT4_JOURNAL(inode)) 877 if (EXT4_JOURNAL(inode))
@@ -941,14 +1001,14 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
941 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 1001 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
942 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 1002 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
943 seq_puts(seq, ",journal_async_commit"); 1003 seq_puts(seq, ",journal_async_commit");
944 if (test_opt(sb, NOBH)) 1004 else if (test_opt(sb, JOURNAL_CHECKSUM))
945 seq_puts(seq, ",nobh"); 1005 seq_puts(seq, ",journal_checksum");
946 if (test_opt(sb, I_VERSION)) 1006 if (test_opt(sb, I_VERSION))
947 seq_puts(seq, ",i_version"); 1007 seq_puts(seq, ",i_version");
948 if (!test_opt(sb, DELALLOC)) 1008 if (!test_opt(sb, DELALLOC) &&
1009 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
949 seq_puts(seq, ",nodelalloc"); 1010 seq_puts(seq, ",nodelalloc");
950 1011
951
952 if (sbi->s_stripe) 1012 if (sbi->s_stripe)
953 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1013 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
954 /* 1014 /*
@@ -972,7 +1032,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
972 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 1032 if (test_opt(sb, NO_AUTO_DA_ALLOC))
973 seq_puts(seq, ",noauto_da_alloc"); 1033 seq_puts(seq, ",noauto_da_alloc");
974 1034
975 if (test_opt(sb, DISCARD)) 1035 if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
976 seq_puts(seq, ",discard"); 1036 seq_puts(seq, ",discard");
977 1037
978 if (test_opt(sb, NOLOAD)) 1038 if (test_opt(sb, NOLOAD))
@@ -981,6 +1041,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
981 if (test_opt(sb, DIOREAD_NOLOCK)) 1041 if (test_opt(sb, DIOREAD_NOLOCK))
982 seq_puts(seq, ",dioread_nolock"); 1042 seq_puts(seq, ",dioread_nolock");
983 1043
1044 if (test_opt(sb, BLOCK_VALIDITY) &&
1045 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1046 seq_puts(seq, ",block_validity");
1047
984 ext4_show_quota_options(seq, sb); 1048 ext4_show_quota_options(seq, sb);
985 1049
986 return 0; 1050 return 0;
@@ -1059,7 +1123,8 @@ static int ext4_release_dquot(struct dquot *dquot);
1059static int ext4_mark_dquot_dirty(struct dquot *dquot); 1123static int ext4_mark_dquot_dirty(struct dquot *dquot);
1060static int ext4_write_info(struct super_block *sb, int type); 1124static int ext4_write_info(struct super_block *sb, int type);
1061static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1125static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1062 char *path, int remount); 1126 char *path);
1127static int ext4_quota_off(struct super_block *sb, int type);
1063static int ext4_quota_on_mount(struct super_block *sb, int type); 1128static int ext4_quota_on_mount(struct super_block *sb, int type);
1064static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1129static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1065 size_t len, loff_t off); 1130 size_t len, loff_t off);
@@ -1081,12 +1146,12 @@ static const struct dquot_operations ext4_quota_operations = {
1081 1146
1082static const struct quotactl_ops ext4_qctl_operations = { 1147static const struct quotactl_ops ext4_qctl_operations = {
1083 .quota_on = ext4_quota_on, 1148 .quota_on = ext4_quota_on,
1084 .quota_off = vfs_quota_off, 1149 .quota_off = ext4_quota_off,
1085 .quota_sync = vfs_quota_sync, 1150 .quota_sync = dquot_quota_sync,
1086 .get_info = vfs_get_dqinfo, 1151 .get_info = dquot_get_dqinfo,
1087 .set_info = vfs_set_dqinfo, 1152 .set_info = dquot_set_dqinfo,
1088 .get_dqblk = vfs_get_dqblk, 1153 .get_dqblk = dquot_get_dqblk,
1089 .set_dqblk = vfs_set_dqblk 1154 .set_dqblk = dquot_set_dqblk
1090}; 1155};
1091#endif 1156#endif
1092 1157
@@ -1095,14 +1160,13 @@ static const struct super_operations ext4_sops = {
1095 .destroy_inode = ext4_destroy_inode, 1160 .destroy_inode = ext4_destroy_inode,
1096 .write_inode = ext4_write_inode, 1161 .write_inode = ext4_write_inode,
1097 .dirty_inode = ext4_dirty_inode, 1162 .dirty_inode = ext4_dirty_inode,
1098 .delete_inode = ext4_delete_inode, 1163 .evict_inode = ext4_evict_inode,
1099 .put_super = ext4_put_super, 1164 .put_super = ext4_put_super,
1100 .sync_fs = ext4_sync_fs, 1165 .sync_fs = ext4_sync_fs,
1101 .freeze_fs = ext4_freeze, 1166 .freeze_fs = ext4_freeze,
1102 .unfreeze_fs = ext4_unfreeze, 1167 .unfreeze_fs = ext4_unfreeze,
1103 .statfs = ext4_statfs, 1168 .statfs = ext4_statfs,
1104 .remount_fs = ext4_remount, 1169 .remount_fs = ext4_remount,
1105 .clear_inode = ext4_clear_inode,
1106 .show_options = ext4_show_options, 1170 .show_options = ext4_show_options,
1107#ifdef CONFIG_QUOTA 1171#ifdef CONFIG_QUOTA
1108 .quota_read = ext4_quota_read, 1172 .quota_read = ext4_quota_read,
@@ -1116,12 +1180,11 @@ static const struct super_operations ext4_nojournal_sops = {
1116 .destroy_inode = ext4_destroy_inode, 1180 .destroy_inode = ext4_destroy_inode,
1117 .write_inode = ext4_write_inode, 1181 .write_inode = ext4_write_inode,
1118 .dirty_inode = ext4_dirty_inode, 1182 .dirty_inode = ext4_dirty_inode,
1119 .delete_inode = ext4_delete_inode, 1183 .evict_inode = ext4_evict_inode,
1120 .write_super = ext4_write_super, 1184 .write_super = ext4_write_super,
1121 .put_super = ext4_put_super, 1185 .put_super = ext4_put_super,
1122 .statfs = ext4_statfs, 1186 .statfs = ext4_statfs,
1123 .remount_fs = ext4_remount, 1187 .remount_fs = ext4_remount,
1124 .clear_inode = ext4_clear_inode,
1125 .show_options = ext4_show_options, 1188 .show_options = ext4_show_options,
1126#ifdef CONFIG_QUOTA 1189#ifdef CONFIG_QUOTA
1127 .quota_read = ext4_quota_read, 1190 .quota_read = ext4_quota_read,
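The two super_operations hunks above track the 2.6.36 VFS change that folds ->delete_inode and ->clear_inode into a single ->evict_inode, called for every inode leaving memory; i_nlink decides whether the old delete path (freeing data) or the old clear path (just dropping state) applies. A hedged skeleton of the new callback shape, not ext4's actual body:

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* Hedged sketch of a 2.6.36-era ->evict_inode: one callback now
     * covers both unlinked inodes and cached inodes being dropped. */
    static void example_evict_inode(struct inode *inode)
    {
        truncate_inode_pages(&inode->i_data, 0);  /* drop page cache */
        if (!inode->i_nlink && !is_bad_inode(inode)) {
            /* old delete_inode work: truncate and free on-disk blocks */
        }
        end_writeback(inode);  /* detach from the writeback lists */
        /* old clear_inode work: release fs-private state */
    }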
@@ -1619,10 +1682,12 @@ set_qf_format:
1619 *n_blocks_count = option; 1682 *n_blocks_count = option;
1620 break; 1683 break;
1621 case Opt_nobh: 1684 case Opt_nobh:
1622 set_opt(sbi->s_mount_opt, NOBH); 1685 ext4_msg(sb, KERN_WARNING,
1686 "Ignoring deprecated nobh option");
1623 break; 1687 break;
1624 case Opt_bh: 1688 case Opt_bh:
1625 clear_opt(sbi->s_mount_opt, NOBH); 1689 ext4_msg(sb, KERN_WARNING,
1690 "Ignoring deprecated bh option");
1626 break; 1691 break;
1627 case Opt_i_version: 1692 case Opt_i_version:
1628 set_opt(sbi->s_mount_opt, I_VERSION); 1693 set_opt(sbi->s_mount_opt, I_VERSION);
@@ -2051,7 +2116,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2051 /* Turn quotas off */ 2116 /* Turn quotas off */
2052 for (i = 0; i < MAXQUOTAS; i++) { 2117 for (i = 0; i < MAXQUOTAS; i++) {
2053 if (sb_dqopt(sb)->files[i]) 2118 if (sb_dqopt(sb)->files[i])
2054 vfs_quota_off(sb, i, 0); 2119 dquot_quota_off(sb, i);
2055 } 2120 }
2056#endif 2121#endif
2057 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 2122 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2213,7 +2278,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2213struct ext4_attr { 2278struct ext4_attr {
2214 struct attribute attr; 2279 struct attribute attr;
2215 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2280 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2216 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2281 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2217 const char *, size_t); 2282 const char *, size_t);
2218 int offset; 2283 int offset;
2219}; 2284};
@@ -2244,6 +2309,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2244{ 2309{
2245 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2310 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2246 2311
2312 if (!sb->s_bdev->bd_part)
2313 return snprintf(buf, PAGE_SIZE, "0\n");
2247 return snprintf(buf, PAGE_SIZE, "%lu\n", 2314 return snprintf(buf, PAGE_SIZE, "%lu\n",
2248 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2315 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2249 sbi->s_sectors_written_start) >> 1); 2316 sbi->s_sectors_written_start) >> 1);
@@ -2254,6 +2321,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2254{ 2321{
2255 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2322 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2256 2323
2324 if (!sb->s_bdev->bd_part)
2325 return snprintf(buf, PAGE_SIZE, "0\n");
2257 return snprintf(buf, PAGE_SIZE, "%llu\n", 2326 return snprintf(buf, PAGE_SIZE, "%llu\n",
2258 (unsigned long long)(sbi->s_kbytes_written + 2327 (unsigned long long)(sbi->s_kbytes_written +
2259 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2328 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -2426,10 +2495,58 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2426 return 1; 2495 return 1;
2427} 2496}
2428 2497
2498/*
2499 * This function is called once a day if we have errors logged
2500 * on the file system
2501 */
2502static void print_daily_error_info(unsigned long arg)
2503{
2504 struct super_block *sb = (struct super_block *) arg;
2505 struct ext4_sb_info *sbi;
2506 struct ext4_super_block *es;
2507
2508 sbi = EXT4_SB(sb);
2509 es = sbi->s_es;
2510
2511 if (es->s_error_count)
2512 ext4_msg(sb, KERN_NOTICE, "error count: %u",
2513 le32_to_cpu(es->s_error_count));
2514 if (es->s_first_error_time) {
2515 printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
2516 sb->s_id, le32_to_cpu(es->s_first_error_time),
2517 (int) sizeof(es->s_first_error_func),
2518 es->s_first_error_func,
2519 le32_to_cpu(es->s_first_error_line));
2520 if (es->s_first_error_ino)
2521 printk(": inode %u",
2522 le32_to_cpu(es->s_first_error_ino));
2523 if (es->s_first_error_block)
2524 printk(": block %llu", (unsigned long long)
2525 le64_to_cpu(es->s_first_error_block));
2526 printk("\n");
2527 }
2528 if (es->s_last_error_time) {
2529 printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
2530 sb->s_id, le32_to_cpu(es->s_last_error_time),
2531 (int) sizeof(es->s_last_error_func),
2532 es->s_last_error_func,
2533 le32_to_cpu(es->s_last_error_line));
2534 if (es->s_last_error_ino)
2535 printk(": inode %u",
2536 le32_to_cpu(es->s_last_error_ino));
2537 if (es->s_last_error_block)
2538 printk(": block %llu", (unsigned long long)
2539 le64_to_cpu(es->s_last_error_block));
2540 printk("\n");
2541 }
2542 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2543}
2544
2429static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2545static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2430 __releases(kernel_lock) 2546 __releases(kernel_lock)
2431 __acquires(kernel_lock) 2547 __acquires(kernel_lock)
2432{ 2548{
2549 char *orig_data = kstrdup(data, GFP_KERNEL);
2433 struct buffer_head *bh; 2550 struct buffer_head *bh;
2434 struct ext4_super_block *es = NULL; 2551 struct ext4_super_block *es = NULL;
2435 struct ext4_sb_info *sbi; 2552 struct ext4_sb_info *sbi;
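print_daily_error_info() above re-arms itself with mod_timer(), and ext4_fill_super() later primes the first shot five minutes after mount. The self-rearming idiom, using the old timer API of this era (the callback receives the cookie stashed in timer->data):

    #include <linux/fs.h>
    #include <linux/timer.h>
    #include <linux/jiffies.h>

    static struct timer_list err_report;

    static void report_fn(unsigned long data)
    {
        struct super_block *sb = (struct super_block *) data;

        /* ... log the accumulated error state for sb ... */
        mod_timer(&err_report, jiffies + 24*60*60*HZ);  /* re-arm, daily */
    }

    static void arm_error_report(struct super_block *sb)
    {
        init_timer(&err_report);
        err_report.function = report_fn;
        err_report.data = (unsigned long) sb;
        mod_timer(&err_report, jiffies + 300*HZ);  /* first run in 5 min */
    }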
@@ -2442,7 +2559,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2442 struct inode *root; 2559 struct inode *root;
2443 char *cp; 2560 char *cp;
2444 const char *descr; 2561 const char *descr;
2445 int ret = -EINVAL; 2562 int ret = -ENOMEM;
2446 int blocksize; 2563 int blocksize;
2447 unsigned int db_count; 2564 unsigned int db_count;
2448 unsigned int i; 2565 unsigned int i;
@@ -2453,13 +2570,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2453 2570
2454 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2571 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2455 if (!sbi) 2572 if (!sbi)
2456 return -ENOMEM; 2573 goto out_free_orig;
2457 2574
2458 sbi->s_blockgroup_lock = 2575 sbi->s_blockgroup_lock =
2459 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 2576 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
2460 if (!sbi->s_blockgroup_lock) { 2577 if (!sbi->s_blockgroup_lock) {
2461 kfree(sbi); 2578 kfree(sbi);
2462 return -ENOMEM; 2579 goto out_free_orig;
2463 } 2580 }
2464 sb->s_fs_info = sbi; 2581 sb->s_fs_info = sbi;
2465 sbi->s_mount_opt = 0; 2582 sbi->s_mount_opt = 0;
@@ -2467,8 +2584,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2467 sbi->s_resgid = EXT4_DEF_RESGID; 2584 sbi->s_resgid = EXT4_DEF_RESGID;
2468 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 2585 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
2469 sbi->s_sb_block = sb_block; 2586 sbi->s_sb_block = sb_block;
2470 sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, 2587 if (sb->s_bdev->bd_part)
2471 sectors[1]); 2588 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2472 2590
2473 unlock_kernel(); 2591 unlock_kernel();
2474 2592
@@ -2476,6 +2594,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2476 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2477 *cp = '!'; 2595 *cp = '!';
2478 2596
2597 ret = -EINVAL;
2479 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 2598 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
2480 if (!blocksize) { 2599 if (!blocksize) {
2481 ext4_msg(sb, KERN_ERR, "unable to set blocksize"); 2600 ext4_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -2540,6 +2659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2540 set_opt(sbi->s_mount_opt, ERRORS_CONT); 2659 set_opt(sbi->s_mount_opt, ERRORS_CONT);
2541 else 2660 else
2542 set_opt(sbi->s_mount_opt, ERRORS_RO); 2661 set_opt(sbi->s_mount_opt, ERRORS_RO);
2662 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
2663 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
2664 if (def_mount_opts & EXT4_DEFM_DISCARD)
2665 set_opt(sbi->s_mount_opt, DISCARD);
2543 2666
2544 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2667 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
2545 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2668 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2547,15 +2670,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2547 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2670 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2548 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2671 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2549 2672
2550 set_opt(sbi->s_mount_opt, BARRIER); 2673 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
2674 set_opt(sbi->s_mount_opt, BARRIER);
2551 2675
2552 /* 2676 /*
2553 * enable delayed allocation by default 2677 * enable delayed allocation by default
2554 * Use -o nodelalloc to turn it off 2678 * Use -o nodelalloc to turn it off
2555 */ 2679 */
2556 if (!IS_EXT3_SB(sb)) 2680 if (!IS_EXT3_SB(sb) &&
2681 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
2557 set_opt(sbi->s_mount_opt, DELALLOC); 2682 set_opt(sbi->s_mount_opt, DELALLOC);
2558 2683
2684 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
2685 &journal_devnum, &journal_ioprio, NULL, 0)) {
2686 ext4_msg(sb, KERN_WARNING,
2687 "failed to parse options in superblock: %s",
2688 sbi->s_es->s_mount_opts);
2689 }
2559 if (!parse_options((char *) data, sb, &journal_devnum, 2690 if (!parse_options((char *) data, sb, &journal_devnum,
2560 &journal_ioprio, NULL, 0)) 2691 &journal_ioprio, NULL, 0))
2561 goto failed_mount; 2692 goto failed_mount;
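The hunk above parses the option string stored in the superblock (s_mount_opts) before the user's mount(2) data, so the administrator's options are applied last and win on any conflict; a bad superblock string only warns, while a bad user string still fails the mount. A self-contained sketch of that precedence, with parse() standing in for ext4's parse_options():

    #include <stdio.h>

    static int parse(const char *opts, unsigned int *flags)
    {
        /* real code tokenizes opts and sets bits in *flags */
        return opts != NULL;  /* stand-in for "parsed OK" */
    }

    static int apply_options(const char *sb_opts, const char *user_opts,
                             unsigned int *flags)
    {
        if (!parse(sb_opts, flags))
            fprintf(stderr, "warning: bad options in superblock\n");
        if (!parse(user_opts, flags))
            return -1;  /* bad user options still fail the mount */
        return 0;
    }

    int main(void)
    {
        unsigned int flags = 0;

        return apply_options("nodelalloc", "delalloc", &flags) ? 1 : 0;
    }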
@@ -2793,24 +2924,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2793 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2924 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2794 spin_lock_init(&sbi->s_next_gen_lock); 2925 spin_lock_init(&sbi->s_next_gen_lock);
2795 2926
2796 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2797 ext4_count_free_blocks(sb));
2798 if (!err) {
2799 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2800 ext4_count_free_inodes(sb));
2801 }
2802 if (!err) {
2803 err = percpu_counter_init(&sbi->s_dirs_counter,
2804 ext4_count_dirs(sb));
2805 }
2806 if (!err) {
2807 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2808 }
2809 if (err) {
2810 ext4_msg(sb, KERN_ERR, "insufficient memory");
2811 goto failed_mount3;
2812 }
2813
2814 sbi->s_stripe = ext4_get_stripe_size(sbi); 2927 sbi->s_stripe = ext4_get_stripe_size(sbi);
2815 sbi->s_max_writeback_mb_bump = 128; 2928 sbi->s_max_writeback_mb_bump = 128;
2816 2929
@@ -2910,18 +3023,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2910 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3023 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2911 3024
2912no_journal: 3025no_journal:
2913 if (test_opt(sb, NOBH)) { 3026 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 3027 ext4_count_free_blocks(sb));
2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 3028 if (!err)
2916 "its supported only with writeback mode"); 3029 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2917 clear_opt(sbi->s_mount_opt, NOBH); 3030 ext4_count_free_inodes(sb));
2918 } 3031 if (!err)
2919 if (test_opt(sb, DIOREAD_NOLOCK)) { 3032 err = percpu_counter_init(&sbi->s_dirs_counter,
2920 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is " 3033 ext4_count_dirs(sb));
2921 "not supported with nobh mode"); 3034 if (!err)
2922 goto failed_mount_wq; 3035 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2923 } 3036 if (err) {
3037 ext4_msg(sb, KERN_ERR, "insufficient memory");
3038 goto failed_mount_wq;
2924 } 3039 }
3040
2925 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3041 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2926 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3042 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2927 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3043 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
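The hunk above moves the percpu counter setup to after the no_journal: label, i.e. after journal replay, so the free-block, free-inode and directory counts are sampled from recovered metadata rather than the pre-recovery bitmaps; the matching destroy calls move from failed_mount3 up to failed_mount_wq. The chained-init idiom in miniature:

    #include <linux/percpu_counter.h>

    /* Each percpu_counter_init() runs only while err is still zero, so
     * one check covers the whole group.  In this era
     * percpu_counter_destroy() on a never-initialized (zeroed) counter
     * is a no-op, which is what lets the failure path destroy all of
     * them unconditionally, as failed_mount_wq now does. */
    static int init_counter_group(struct percpu_counter *a,
                                  struct percpu_counter *b, s64 seed)
    {
        int err = percpu_counter_init(a, seed);

        if (!err)
            err = percpu_counter_init(b, 0);
        if (err) {
            percpu_counter_destroy(a);
            percpu_counter_destroy(b);
        }
        return err;
    }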
@@ -3001,14 +3117,14 @@ no_journal:
3001 err = ext4_setup_system_zone(sb); 3117 err = ext4_setup_system_zone(sb);
3002 if (err) { 3118 if (err) {
3003 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3119 ext4_msg(sb, KERN_ERR, "failed to initialize system "
3004 "zone (%d)\n", err); 3120 "zone (%d)", err);
3005 goto failed_mount4; 3121 goto failed_mount4;
3006 } 3122 }
3007 3123
3008 ext4_ext_init(sb); 3124 ext4_ext_init(sb);
3009 err = ext4_mb_init(sb, needs_recovery); 3125 err = ext4_mb_init(sb, needs_recovery);
3010 if (err) { 3126 if (err) {
3011 ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", 3127 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3012 err); 3128 err);
3013 goto failed_mount4; 3129 goto failed_mount4;
3014 } 3130 }
@@ -3040,9 +3156,18 @@ no_journal:
3040 } else 3156 } else
3041 descr = "out journal"; 3157 descr = "out journal";
3042 3158
3043 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); 3159 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3160 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3161 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3162
3163 init_timer(&sbi->s_err_report);
3164 sbi->s_err_report.function = print_daily_error_info;
3165 sbi->s_err_report.data = (unsigned long) sb;
3166 if (es->s_error_count)
3167 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3044 3168
3045 lock_kernel(); 3169 lock_kernel();
3170 kfree(orig_data);
3046 return 0; 3171 return 0;
3047 3172
3048cantfind_ext4: 3173cantfind_ext4:
@@ -3059,6 +3184,10 @@ failed_mount_wq:
3059 jbd2_journal_destroy(sbi->s_journal); 3184 jbd2_journal_destroy(sbi->s_journal);
3060 sbi->s_journal = NULL; 3185 sbi->s_journal = NULL;
3061 } 3186 }
3187 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3188 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3189 percpu_counter_destroy(&sbi->s_dirs_counter);
3190 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3062failed_mount3: 3191failed_mount3:
3063 if (sbi->s_flex_groups) { 3192 if (sbi->s_flex_groups) {
3064 if (is_vmalloc_addr(sbi->s_flex_groups)) 3193 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3195,6 @@ failed_mount3:
3066 else 3195 else
3067 kfree(sbi->s_flex_groups); 3196 kfree(sbi->s_flex_groups);
3068 } 3197 }
3069 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3070 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3071 percpu_counter_destroy(&sbi->s_dirs_counter);
3072 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3073failed_mount2: 3198failed_mount2:
3074 for (i = 0; i < db_count; i++) 3199 for (i = 0; i < db_count; i++)
3075 brelse(sbi->s_group_desc[i]); 3200 brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3214,8 @@ out_fail:
3089 kfree(sbi->s_blockgroup_lock); 3214 kfree(sbi->s_blockgroup_lock);
3090 kfree(sbi); 3215 kfree(sbi);
3091 lock_kernel(); 3216 lock_kernel();
3217out_free_orig:
3218 kfree(orig_data);
3092 return ret; 3219 return ret;
3093} 3220}
3094 3221
@@ -3105,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
3105 journal->j_min_batch_time = sbi->s_min_batch_time; 3232 journal->j_min_batch_time = sbi->s_min_batch_time;
3106 journal->j_max_batch_time = sbi->s_max_batch_time; 3233 journal->j_max_batch_time = sbi->s_max_batch_time;
3107 3234
3108 spin_lock(&journal->j_state_lock); 3235 write_lock(&journal->j_state_lock);
3109 if (test_opt(sb, BARRIER)) 3236 if (test_opt(sb, BARRIER))
3110 journal->j_flags |= JBD2_BARRIER; 3237 journal->j_flags |= JBD2_BARRIER;
3111 else 3238 else
@@ -3114,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
3114 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; 3241 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
3115 else 3242 else
3116 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; 3243 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
3117 spin_unlock(&journal->j_state_lock); 3244 write_unlock(&journal->j_state_lock);
3118} 3245}
3119 3246
3120static journal_t *ext4_get_journal(struct super_block *sb, 3247static journal_t *ext4_get_journal(struct super_block *sb,
@@ -3322,8 +3449,17 @@ static int ext4_load_journal(struct super_block *sb,
3322 3449
3323 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) 3450 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
3324 err = jbd2_journal_wipe(journal, !really_read_only); 3451 err = jbd2_journal_wipe(journal, !really_read_only);
3325 if (!err) 3452 if (!err) {
3453 char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
3454 if (save)
3455 memcpy(save, ((char *) es) +
3456 EXT4_S_ERR_START, EXT4_S_ERR_LEN);
3326 err = jbd2_journal_load(journal); 3457 err = jbd2_journal_load(journal);
3458 if (save)
3459 memcpy(((char *) es) + EXT4_S_ERR_START,
3460 save, EXT4_S_ERR_LEN);
3461 kfree(save);
3462 }
3327 3463
3328 if (err) { 3464 if (err) {
3329 ext4_msg(sb, KERN_ERR, "error loading journal"); 3465 ext4_msg(sb, KERN_ERR, "error loading journal");
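Journal replay in jbd2_journal_load() can write an older copy of the superblock over the current one, which would roll back error information recorded since that copy was journaled. The hunk above therefore snapshots the error region around the load; if kmalloc() fails the region is simply not preserved, which is safe. The save/restore idiom in isolation, with do_load() as a hypothetical stand-in for jbd2_journal_load():

    #include <linux/slab.h>
    #include <linux/string.h>

    static int load_preserving_errors(char *es_bytes, int (*do_load)(void))
    {
        char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
        int err;

        if (save)
            memcpy(save, es_bytes + EXT4_S_ERR_START, EXT4_S_ERR_LEN);
        err = do_load();  /* may replay an older superblock copy */
        if (save)
            memcpy(es_bytes + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN);
        kfree(save);      /* kfree(NULL) is a no-op */
        return err;
    }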
@@ -3379,10 +3515,14 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3379 */ 3515 */
3380 if (!(sb->s_flags & MS_RDONLY)) 3516 if (!(sb->s_flags & MS_RDONLY))
3381 es->s_wtime = cpu_to_le32(get_seconds()); 3517 es->s_wtime = cpu_to_le32(get_seconds());
3382 es->s_kbytes_written = 3518 if (sb->s_bdev->bd_part)
3383 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3519 es->s_kbytes_written =
3520 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3384 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3521 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3385 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 3522 EXT4_SB(sb)->s_sectors_written_start) >> 1));
3523 else
3524 es->s_kbytes_written =
3525 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3386 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3526 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3387 &EXT4_SB(sb)->s_freeblocks_counter)); 3527 &EXT4_SB(sb)->s_freeblocks_counter));
3388 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3528 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
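The lifetime-write figure above is the KiB count carried in memory plus half the sector delta since mount, since a 512-byte sector is half a KiB; the bd_part check covers block devices that carry no partition statistics, where only the carried count can be written back. A worked example of the arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t kbytes_written = 12345;  /* lifetime KiB carried in sbi */
        uint64_t start = 1000000;         /* sectors[1] sampled at mount */
        uint64_t now   = 1004096;         /* sectors[1] sampled at commit */

        /* 4096 sectors written = 2048 KiB, so the total is 14393 */
        printf("%llu\n", (unsigned long long)
               (kbytes_written + ((now - start) >> 1)));
        return 0;
    }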
@@ -3485,8 +3625,10 @@ int ext4_force_commit(struct super_block *sb)
3485 return 0; 3625 return 0;
3486 3626
3487 journal = EXT4_SB(sb)->s_journal; 3627 journal = EXT4_SB(sb)->s_journal;
3488 if (journal) 3628 if (journal) {
3629 vfs_check_frozen(sb, SB_FREEZE_TRANS);
3489 ret = ext4_journal_force_commit(journal); 3630 ret = ext4_journal_force_commit(journal);
3631 }
3490 3632
3491 return ret; 3633 return ret;
3492} 3634}
@@ -3535,18 +3677,16 @@ static int ext4_freeze(struct super_block *sb)
3535 * the journal. 3677 * the journal.
3536 */ 3678 */
3537 error = jbd2_journal_flush(journal); 3679 error = jbd2_journal_flush(journal);
3538 if (error < 0) { 3680 if (error < 0)
3539 out: 3681 goto out;
3540 jbd2_journal_unlock_updates(journal);
3541 return error;
3542 }
3543 3682
3544 /* Journal blocked and flushed, clear needs_recovery flag. */ 3683 /* Journal blocked and flushed, clear needs_recovery flag. */
3545 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3684 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3546 error = ext4_commit_super(sb, 1); 3685 error = ext4_commit_super(sb, 1);
3547 if (error) 3686out:
3548 goto out; 3687 /* we rely on s_frozen to stop further updates */
3549 return 0; 3688 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3689 return error;
3550} 3690}
3551 3691
3552/* 3692/*
@@ -3563,7 +3703,6 @@ static int ext4_unfreeze(struct super_block *sb)
3563 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3703 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3564 ext4_commit_super(sb, 1); 3704 ext4_commit_super(sb, 1);
3565 unlock_super(sb); 3705 unlock_super(sb);
3566 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3567 return 0; 3706 return 0;
3568} 3707}
3569 3708
@@ -3574,12 +3713,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3574 ext4_fsblk_t n_blocks_count = 0; 3713 ext4_fsblk_t n_blocks_count = 0;
3575 unsigned long old_sb_flags; 3714 unsigned long old_sb_flags;
3576 struct ext4_mount_options old_opts; 3715 struct ext4_mount_options old_opts;
3716 int enable_quota = 0;
3577 ext4_group_t g; 3717 ext4_group_t g;
3578 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3718 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3579 int err; 3719 int err;
3580#ifdef CONFIG_QUOTA 3720#ifdef CONFIG_QUOTA
3581 int i; 3721 int i;
3582#endif 3722#endif
3723 char *orig_data = kstrdup(data, GFP_KERNEL);
3583 3724
3584 lock_kernel(); 3725 lock_kernel();
3585 3726
@@ -3610,7 +3751,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3610 } 3751 }
3611 3752
3612 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 3753 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
3613 ext4_abort(sb, __func__, "Abort forced by user"); 3754 ext4_abort(sb, "Abort forced by user");
3614 3755
3615 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3756 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3616 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); 3757 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3630,6 +3771,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3630 } 3771 }
3631 3772
3632 if (*flags & MS_RDONLY) { 3773 if (*flags & MS_RDONLY) {
3774 err = dquot_suspend(sb, -1);
3775 if (err < 0)
3776 goto restore_opts;
3777
3633 /* 3778 /*
3634 * First of all, the unconditional stuff we have to do 3779 * First of all, the unconditional stuff we have to do
3635 * to disable replay of the journal when we next remount 3780 * to disable replay of the journal when we next remount
@@ -3698,6 +3843,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3698 goto restore_opts; 3843 goto restore_opts;
3699 if (!ext4_setup_super(sb, es, 0)) 3844 if (!ext4_setup_super(sb, es, 0))
3700 sb->s_flags &= ~MS_RDONLY; 3845 sb->s_flags &= ~MS_RDONLY;
3846 enable_quota = 1;
3701 } 3847 }
3702 } 3848 }
3703 ext4_setup_system_zone(sb); 3849 ext4_setup_system_zone(sb);
@@ -3713,6 +3859,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3713#endif 3859#endif
3714 unlock_super(sb); 3860 unlock_super(sb);
3715 unlock_kernel(); 3861 unlock_kernel();
3862 if (enable_quota)
3863 dquot_resume(sb, -1);
3864
3865 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
3866 kfree(orig_data);
3716 return 0; 3867 return 0;
3717 3868
3718restore_opts: 3869restore_opts:
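Across the three remount hunks above, quotas are suspended before the filesystem goes read-only (a failure aborts the remount) and resumed only after a successful read-write transition, once the locks are dropped. A distilled sketch of that shape, where the elided transitions stand in for the real remount work:

    #include <linux/fs.h>
    #include <linux/quotaops.h>

    static int remount_quota(struct super_block *sb, int *flags)
    {
        int enable_quota = 0;
        int err;

        if (*flags & MS_RDONLY) {
            err = dquot_suspend(sb, -1);  /* -1 = every quota type */
            if (err < 0)
                return err;               /* abort before going ro */
            /* ... transition to read-only ... */
        } else {
            /* ... transition to read-write ... */
            enable_quota = 1;
        }
        if (enable_quota)
            dquot_resume(sb, -1);         /* after the locks are dropped */
        return 0;
    }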
@@ -3734,6 +3885,7 @@ restore_opts:
3734#endif 3885#endif
3735 unlock_super(sb); 3886 unlock_super(sb);
3736 unlock_kernel(); 3887 unlock_kernel();
3888 kfree(orig_data);
3737 return err; 3889 return err;
3738} 3890}
3739 3891
@@ -3906,24 +4058,21 @@ static int ext4_write_info(struct super_block *sb, int type)
3906 */ 4058 */
3907static int ext4_quota_on_mount(struct super_block *sb, int type) 4059static int ext4_quota_on_mount(struct super_block *sb, int type)
3908{ 4060{
3909 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 4061 return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3910 EXT4_SB(sb)->s_jquota_fmt, type); 4062 EXT4_SB(sb)->s_jquota_fmt, type);
3911} 4063}
3912 4064
3913/* 4065/*
3914 * Standard function to be called on quota_on 4066 * Standard function to be called on quota_on
3915 */ 4067 */
3916static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4068static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3917 char *name, int remount) 4069 char *name)
3918{ 4070{
3919 int err; 4071 int err;
3920 struct path path; 4072 struct path path;
3921 4073
3922 if (!test_opt(sb, QUOTA)) 4074 if (!test_opt(sb, QUOTA))
3923 return -EINVAL; 4075 return -EINVAL;
3924 /* When remounting, no checks are needed and in fact, name is NULL */
3925 if (remount)
3926 return vfs_quota_on(sb, type, format_id, name, remount);
3927 4076
3928 err = kern_path(name, LOOKUP_FOLLOW, &path); 4077 err = kern_path(name, LOOKUP_FOLLOW, &path);
3929 if (err) 4078 if (err)
@@ -3962,11 +4111,23 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3962 } 4111 }
3963 } 4112 }
3964 4113
3965 err = vfs_quota_on_path(sb, type, format_id, &path); 4114 err = dquot_quota_on_path(sb, type, format_id, &path);
3966 path_put(&path); 4115 path_put(&path);
3967 return err; 4116 return err;
3968} 4117}
3969 4118
4119static int ext4_quota_off(struct super_block *sb, int type)
4120{
4121 /* Force all delayed allocation blocks to be allocated */
4122 if (test_opt(sb, DELALLOC)) {
4123 down_read(&sb->s_umount);
4124 sync_filesystem(sb);
4125 up_read(&sb->s_umount);
4126 }
4127
4128 return dquot_quota_off(sb, type);
4129}
4130
3970/* Read data from quotafile - avoid pagecache and such because we cannot afford 4131/* Read data from quotafile - avoid pagecache and such because we cannot afford
3971 * acquiring the locks... As quota files are never truncated and quota code 4132 * acquiring the locks... As quota files are never truncated and quota code
 3972 * itself serializes the operations (and no one else should touch the files) 4133 * itself serializes the operations (and no one else should touch the files)
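The new ext4_quota_off() above forces delayed-allocation writeback before handing off to the generic code, so blocks that are still only reserved get allocated and charged while quota accounting is active; s_umount is taken shared around sync_filesystem(). The ordering in isolation, with delalloc_enabled() as a hypothetical stand-in for ext4's test_opt(sb, DELALLOC):

    #include <linux/fs.h>
    #include <linux/quotaops.h>

    static int delalloc_enabled(struct super_block *sb) { return 1; }

    static int example_quota_off(struct super_block *sb, int type)
    {
        if (delalloc_enabled(sb)) {
            down_read(&sb->s_umount);  /* sync_filesystem() expects it */
            sync_filesystem(sb);       /* allocate all delayed blocks */
            up_read(&sb->s_umount);
        }
        return dquot_quota_off(sb, type);
    }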
@@ -4016,7 +4177,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
4016 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4177 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
4017 int err = 0; 4178 int err = 0;
4018 int offset = off & (sb->s_blocksize - 1); 4179 int offset = off & (sb->s_blocksize - 1);
4019 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
4020 struct buffer_head *bh; 4180 struct buffer_head *bh;
4021 handle_t *handle = journal_current_handle(); 4181 handle_t *handle = journal_current_handle();
4022 4182
@@ -4041,24 +4201,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
4041 bh = ext4_bread(handle, inode, blk, 1, &err); 4201 bh = ext4_bread(handle, inode, blk, 1, &err);
4042 if (!bh) 4202 if (!bh)
4043 goto out; 4203 goto out;
4044 if (journal_quota) { 4204 err = ext4_journal_get_write_access(handle, bh);
4045 err = ext4_journal_get_write_access(handle, bh); 4205 if (err) {
4046 if (err) { 4206 brelse(bh);
4047 brelse(bh); 4207 goto out;
4048 goto out;
4049 }
4050 } 4208 }
4051 lock_buffer(bh); 4209 lock_buffer(bh);
4052 memcpy(bh->b_data+offset, data, len); 4210 memcpy(bh->b_data+offset, data, len);
4053 flush_dcache_page(bh->b_page); 4211 flush_dcache_page(bh->b_page);
4054 unlock_buffer(bh); 4212 unlock_buffer(bh);
4055 if (journal_quota) 4213 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4056 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4057 else {
4058 /* Always do at least ordered writes for quotas */
4059 err = ext4_jbd2_file_inode(handle, inode);
4060 mark_buffer_dirty(bh);
4061 }
4062 brelse(bh); 4214 brelse(bh);
4063out: 4215out:
4064 if (err) { 4216 if (err) {
@@ -4141,6 +4293,7 @@ static int __init init_ext4_fs(void)
4141{ 4293{
4142 int err; 4294 int err;
4143 4295
4296 ext4_check_flag_values();
4144 err = init_ext4_system_zone(); 4297 err = init_ext4_system_zone();
4145 if (err) 4298 if (err)
4146 return err; 4299 return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr,
37#ifdef CONFIG_EXT4_FS_XATTR 38#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 46const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 48 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr,
48#ifdef CONFIG_EXT4_FS_XATTR 50#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d8..3a8cd8dff1ad 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
97 97
98static struct mb_cache *ext4_xattr_cache; 98static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static const struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
109#endif 109#endif
110}; 110};
111 111
112struct xattr_handler *ext4_xattr_handlers[] = { 112const struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
122 NULL 122 NULL
123}; 123};
124 124
125static inline struct xattr_handler * 125static inline const struct xattr_handler *
126ext4_xattr_handler(int name_index) 126ext4_xattr_handler(int name_index)
127{ 127{
128 struct xattr_handler *handler = NULL; 128 const struct xattr_handler *handler = NULL;
129 129
130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) 130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
131 handler = ext4_xattr_handler_map[name_index]; 131 handler = ext4_xattr_handler_map[name_index];
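The constification above leaves the dispatch logic untouched: a const table indexed by the small on-disk name index, with a bounds check before the lookup (a hole in the table still yields NULL). The idiom, self-contained, with hypothetical index values:

    #include <stddef.h>

    struct handler { const char *prefix; };

    static const struct handler user_handler = { "user." };
    static const struct handler trusted_handler = { "trusted." };

    static const struct handler *handler_map[] = {
        [1] = &user_handler,     /* hypothetical index values */
        [2] = &trusted_handler,
    };

    static const struct handler *lookup(int idx)
    {
        if (idx > 0 &&
            idx < (int)(sizeof(handler_map) / sizeof(handler_map[0])))
            return handler_map[idx];  /* may still be NULL */
        return NULL;
    }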
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: 230bad_block:
231 ext4_error(inode->i_sb, 231 EXT4_ERROR_INODE(inode, "bad block %llu",
232 "inode %lu: bad block %llu", inode->i_ino, 232 EXT4_I(inode)->i_file_acl);
233 EXT4_I(inode)->i_file_acl);
234 error = -EIO; 233 error = -EIO;
235 goto cleanup; 234 goto cleanup;
236 } 235 }
@@ -332,7 +331,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
332 size_t rest = buffer_size; 331 size_t rest = buffer_size;
333 332
334 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { 333 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
335 struct xattr_handler *handler = 334 const struct xattr_handler *handler =
336 ext4_xattr_handler(entry->e_name_index); 335 ext4_xattr_handler(entry->e_name_index);
337 336
338 if (handler) { 337 if (handler) {
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
372 ea_bdebug(bh, "b_count=%d, refcount=%d", 371 ea_bdebug(bh, "b_count=%d, refcount=%d",
373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
374 if (ext4_xattr_check_block(bh)) { 373 if (ext4_xattr_check_block(bh)) {
375 ext4_error(inode->i_sb, 374 EXT4_ERROR_INODE(inode, "bad block %llu",
376 "inode %lu: bad block %llu", inode->i_ino, 375 EXT4_I(inode)->i_file_acl);
377 EXT4_I(inode)->i_file_acl);
378 error = -EIO; 376 error = -EIO;
379 goto cleanup; 377 goto cleanup;
380 } 378 }
@@ -460,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
460 458
461 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 459 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
462 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 460 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
463 sb->s_dirt = 1; 461 ext4_handle_dirty_super(handle, sb);
464 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
465 } 462 }
466} 463}
467 464
@@ -666,8 +663,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
666 atomic_read(&(bs->bh->b_count)), 663 atomic_read(&(bs->bh->b_count)),
667 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 664 le32_to_cpu(BHDR(bs->bh)->h_refcount));
668 if (ext4_xattr_check_block(bs->bh)) { 665 if (ext4_xattr_check_block(bs->bh)) {
669 ext4_error(sb, "inode %lu: bad block %llu", 666 EXT4_ERROR_INODE(inode, "bad block %llu",
670 inode->i_ino, EXT4_I(inode)->i_file_acl); 667 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 668 error = -EIO;
672 goto cleanup; 669 goto cleanup;
673 } 670 }
@@ -820,7 +817,7 @@ inserted:
820 EXT4_I(inode)->i_block_group); 817 EXT4_I(inode)->i_block_group);
821 818
822 /* non-extent files can't have physical blocks past 2^32 */ 819 /* non-extent files can't have physical blocks past 2^32 */
823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
824 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
825 822
826 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +825,7 @@ inserted:
828 if (error) 825 if (error)
829 goto cleanup; 826 goto cleanup;
830 827
831 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 828 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
832 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 829 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
833 830
834 ea_idebug(inode, "creating block %d", block); 831 ea_idebug(inode, "creating block %d", block);
@@ -880,8 +877,8 @@ cleanup_dquot:
880 goto cleanup; 877 goto cleanup;
881 878
882bad_block: 879bad_block:
883 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 880 EXT4_ERROR_INODE(inode, "bad block %llu",
884 inode->i_ino, EXT4_I(inode)->i_file_acl); 881 EXT4_I(inode)->i_file_acl);
885 goto cleanup; 882 goto cleanup;
886 883
887#undef header 884#undef header
@@ -1194,8 +1191,8 @@ retry:
1194 if (!bh) 1191 if (!bh)
1195 goto cleanup; 1192 goto cleanup;
1196 if (ext4_xattr_check_block(bh)) { 1193 if (ext4_xattr_check_block(bh)) {
1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1194 EXT4_ERROR_INODE(inode, "bad block %llu",
1198 inode->i_ino, EXT4_I(inode)->i_file_acl); 1195 EXT4_I(inode)->i_file_acl);
1199 error = -EIO; 1196 error = -EIO;
1200 goto cleanup; 1197 goto cleanup;
1201 } 1198 }
@@ -1372,14 +1369,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1369 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1370 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1371 if (!bh) {
1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error", 1372 EXT4_ERROR_INODE(inode, "block %llu read error",
1376 inode->i_ino, EXT4_I(inode)->i_file_acl); 1373 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1374 goto cleanup;
1378 } 1375 }
1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1376 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1377 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1378 EXT4_ERROR_INODE(inode, "bad block %llu",
1382 inode->i_ino, EXT4_I(inode)->i_file_acl); 1379 EXT4_I(inode)->i_file_acl);
1383 goto cleanup; 1380 goto cleanup;
1384 } 1381 }
1385 ext4_xattr_release_block(handle, inode, bh); 1382 ext4_xattr_release_block(handle, inode, bh);
@@ -1420,7 +1417,7 @@ ext4_xattr_cache_insert(struct buffer_head *bh)
1420 ea_bdebug(bh, "out of memory"); 1417 ea_bdebug(bh, "out of memory");
1421 return; 1418 return;
1422 } 1419 }
1423 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); 1420 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1424 if (error) { 1421 if (error) {
1425 mb_cache_entry_free(ce); 1422 mb_cache_entry_free(ce);
1426 if (error == -EBUSY) { 1423 if (error == -EBUSY) {
@@ -1492,8 +1489,8 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1492 return NULL; /* never share */ 1489 return NULL; /* never share */
1493 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1490 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1494again: 1491again:
1495 ce = mb_cache_entry_find_first(ext4_xattr_cache, 0, 1492 ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
1496 inode->i_sb->s_bdev, hash); 1493 hash);
1497 while (ce) { 1494 while (ce) {
1498 struct buffer_head *bh; 1495 struct buffer_head *bh;
1499 1496
@@ -1504,9 +1501,8 @@ again:
1504 } 1501 }
1505 bh = sb_bread(inode->i_sb, ce->e_block); 1502 bh = sb_bread(inode->i_sb, ce->e_block);
1506 if (!bh) { 1503 if (!bh) {
1507 ext4_error(inode->i_sb, 1504 EXT4_ERROR_INODE(inode, "block %lu read error",
1508 "inode %lu: block %lu read error", 1505 (unsigned long) ce->e_block);
1509 inode->i_ino, (unsigned long) ce->e_block);
1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1506 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1511 EXT4_XATTR_REFCOUNT_MAX) { 1507 EXT4_XATTR_REFCOUNT_MAX) {
1512 ea_idebug(inode, "block %lu refcount %d>=%d", 1508 ea_idebug(inode, "block %lu refcount %d>=%d",
@@ -1518,7 +1514,7 @@ again:
1518 return bh; 1514 return bh;
1519 } 1515 }
1520 brelse(bh); 1516 brelse(bh);
1521 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); 1517 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
1522 } 1518 }
1523 return NULL; 1519 return NULL;
1524} 1520}
@@ -1594,9 +1590,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1594int __init 1590int __init
1595init_ext4_xattr(void) 1591init_ext4_xattr(void)
1596{ 1592{
1597 ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL, 1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
1598 sizeof(struct mb_cache_entry) +
1599 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1600 if (!ext4_xattr_cache) 1594 if (!ext4_xattr_cache)
1601 return -ENOMEM; 1595 return -ENOMEM;
1602 return 0; 1596 return 0;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
65 65
66# ifdef CONFIG_EXT4_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern const struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern const struct xattr_handler ext4_xattr_trusted_handler;
70extern struct xattr_handler ext4_xattr_acl_access_handler; 70extern const struct xattr_handler ext4_xattr_acl_access_handler;
71extern struct xattr_handler ext4_xattr_acl_default_handler; 71extern const struct xattr_handler ext4_xattr_acl_default_handler;
72extern struct xattr_handler ext4_xattr_security_handler; 72extern const struct xattr_handler ext4_xattr_security_handler;
73 73
74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); 74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
75 75
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
86extern int init_ext4_xattr(void); 86extern int init_ext4_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 8b145e98df07..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
69 return err; 69 return err;
70} 70}
71 71
72struct xattr_handler ext4_xattr_security_handler = { 72const struct xattr_handler ext4_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext4_xattr_security_list, 74 .list = ext4_xattr_security_list,
75 .get = ext4_xattr_security_get, 75 .get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
51 name, value, size, flags); 51 name, value, size, flags);
52} 52}
53 53
54struct xattr_handler ext4_xattr_trusted_handler = { 54const struct xattr_handler ext4_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext4_xattr_trusted_list, 56 .list = ext4_xattr_trusted_list,
57 .get = ext4_xattr_trusted_get, 57 .get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext4_xattr_user_handler = { 57const struct xattr_handler ext4_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext4_xattr_user_list, 59 .list = ext4_xattr_user_list,
60 .get = ext4_xattr_user_get, 60 .get = ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
242 while (*fclus < cluster) { 242 while (*fclus < cluster) {
243 /* prevent the infinite loop of cluster chain */ 243 /* prevent the infinite loop of cluster chain */
244 if (*fclus > limit) { 244 if (*fclus > limit) {
245 fat_fs_error(sb, "%s: detected the cluster chain loop" 245 fat_fs_error_ratelimit(sb,
246 " (i_pos %lld)", __func__, 246 "%s: detected the cluster chain loop"
247 MSDOS_I(inode)->i_pos); 247 " (i_pos %lld)", __func__,
248 MSDOS_I(inode)->i_pos);
248 nr = -EIO; 249 nr = -EIO;
249 goto out; 250 goto out;
250 } 251 }
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
253 if (nr < 0) 254 if (nr < 0)
254 goto out; 255 goto out;
255 else if (nr == FAT_ENT_FREE) { 256 else if (nr == FAT_ENT_FREE) {
256 fat_fs_error(sb, "%s: invalid cluster chain" 257 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
257 " (i_pos %lld)", __func__, 258 " (i_pos %lld)", __func__,
258 MSDOS_I(inode)->i_pos); 259 MSDOS_I(inode)->i_pos);
259 nr = -EIO; 260 nr = -EIO;
260 goto out; 261 goto out;
261 } else if (nr == FAT_ENT_EOF) { 262 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <linux/kernel.h>
22#include "fat.h" 23#include "fat.h"
23 24
24/* 25/*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
140{ 141{
141 const wchar_t *ip; 142 const wchar_t *ip;
142 wchar_t ec; 143 wchar_t ec;
143 unsigned char *op, nc; 144 unsigned char *op;
144 int charlen; 145 int charlen;
145 int k;
146 146
147 ip = uni; 147 ip = uni;
148 op = ascii; 148 op = ascii;
149 149
150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { 150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
151 ec = *ip++; 151 ec = *ip++;
152 if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { 152 if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
153 op += charlen; 153 op += charlen;
154 len -= charlen; 154 len -= charlen;
155 } else { 155 } else {
156 if (uni_xlate == 1) { 156 if (uni_xlate == 1) {
157 *op = ':'; 157 *op++ = ':';
158 for (k = 4; k > 0; k--) { 158 op = pack_hex_byte(op, ec >> 8);
159 nc = ec & 0xF; 159 op = pack_hex_byte(op, ec);
160 op[k] = nc > 9 ? nc + ('a' - 10)
161 : nc + '0';
162 ec >>= 4;
163 }
164 op += 5;
165 len -= 5; 160 len -= 5;
166 } else { 161 } else {
167 *op++ = '?'; 162 *op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
758 return ret; 753 return ret;
759} 754}
760 755
761static int fat_dir_ioctl(struct inode *inode, struct file *filp, 756static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
762 unsigned int cmd, unsigned long arg) 757 unsigned long arg)
763{ 758{
759 struct inode *inode = filp->f_path.dentry->d_inode;
764 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; 760 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
765 int short_only, both; 761 int short_only, both;
766 762
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
774 both = 1; 770 both = 1;
775 break; 771 break;
776 default: 772 default:
777 return fat_generic_ioctl(inode, filp, cmd, arg); 773 return fat_generic_ioctl(filp, cmd, arg);
778 } 774 }
779 775
780 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) 776 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
814 both = 1; 810 both = 1;
815 break; 811 break;
816 default: 812 default:
817 return -ENOIOCTLCMD; 813 return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
818 } 814 }
819 815
820 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) 816 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
836 .llseek = generic_file_llseek, 832 .llseek = generic_file_llseek,
837 .read = generic_read_dir, 833 .read = generic_read_dir,
838 .readdir = fat_readdir, 834 .readdir = fat_readdir,
839 .ioctl = fat_dir_ioctl, 835 .unlocked_ioctl = fat_dir_ioctl,
840#ifdef CONFIG_COMPAT 836#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 837 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 838#endif
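The directory ioctl above moves from the BKL-holding ->ioctl, which was handed the inode, to ->unlocked_ioctl, which gets only the file: the handler recovers the inode from filp->f_path.dentry->d_inode, the return type widens to long, and the compat handler now falls through to the native one instead of returning -ENOIOCTLCMD. A hedged sketch of the converted shape:

    #include <linux/fs.h>

    /* Any locking is now explicit, since the BKL is no longer taken. */
    static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
                                       unsigned long arg)
    {
        struct inode *inode = filp->f_path.dentry->d_inode;

        /* ... per-command handling using inode, cmd and arg ... */
        return inode ? -ENOTTY : -EINVAL;
    }

    static const struct file_operations example_fops = {
        .unlocked_ioctl = example_unlocked_ioctl,
    };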
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..d75a77f85c28 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/ratelimit.h>
9#include <linux/msdos_fs.h> 10#include <linux/msdos_fs.h>
10 11
11/* 12/*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
82 struct fatent_operations *fatent_ops; 83 struct fatent_operations *fatent_ops;
83 struct inode *fat_inode; 84 struct inode *fat_inode;
84 85
86 struct ratelimit_state ratelimit;
87
85 spinlock_t inode_hash_lock; 88 spinlock_t inode_hash_lock;
86 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 89 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
87}; 90};
@@ -298,16 +301,15 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
298extern int fat_count_free_clusters(struct super_block *sb); 301extern int fat_count_free_clusters(struct super_block *sb);
299 302
300/* fat/file.c */ 303/* fat/file.c */
301extern int fat_generic_ioctl(struct inode *inode, struct file *filp, 304extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
302 unsigned int cmd, unsigned long arg); 305 unsigned long arg);
303extern const struct file_operations fat_file_operations; 306extern const struct file_operations fat_file_operations;
304extern const struct inode_operations fat_file_inode_operations; 307extern const struct inode_operations fat_file_inode_operations;
305extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 308extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
306extern void fat_truncate(struct inode *inode); 309extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
307extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 310extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
308 struct kstat *stat); 311 struct kstat *stat);
309extern int fat_file_fsync(struct file *file, struct dentry *dentry, 312extern int fat_file_fsync(struct file *file, int datasync);
310 int datasync);
311 313
312/* fat/inode.c */ 314/* fat/inode.c */
313extern void fat_attach(struct inode *inode, loff_t i_pos); 315extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -322,8 +324,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323 struct inode *i2); 325 struct inode *i2);
324/* fat/misc.c */ 326/* fat/misc.c */
325extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 327extern void
326 __attribute__ ((format (printf, 2, 3))) __cold; 328__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
329 __attribute__ ((format (printf, 3, 4))) __cold;
330#define fat_fs_error(s, fmt, args...) \
331 __fat_fs_error(s, 1, fmt , ## args)
332#define fat_fs_error_ratelimit(s, fmt, args...) \
333 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
327extern int fat_clusters_flush(struct super_block *sb); 334extern int fat_clusters_flush(struct super_block *sb);
328extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 335extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
329extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 336extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
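Cluster-chain corruption can trigger an error message on every access, so the hunk above adds a ratelimited variant: __ratelimit() returns nonzero while the message budget lasts, and the macro passes that as the report flag to __fat_fs_error(). The per-superblock ratelimit_state added to msdos_sb_info is presumably initialized at mount time, outside this hunk. The idiom in isolation, using a static limiter:

    #include <linux/kernel.h>
    #include <linux/ratelimit.h>

    static DEFINE_RATELIMIT_STATE(example_rs, DEFAULT_RATELIMIT_INTERVAL,
                                  DEFAULT_RATELIMIT_BURST);

    static void noisy_event(void)
    {
        if (__ratelimit(&example_rs))  /* nonzero => within budget */
            printk(KERN_ERR "example: something went wrong\n");
    }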
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..7257752b6d5d 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/capability.h> 9#include <linux/capability.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/compat.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/time.h> 13#include <linux/time.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
114 return err; 115 return err;
115} 116}
116 117
117int fat_generic_ioctl(struct inode *inode, struct file *filp, 118long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118 unsigned int cmd, unsigned long arg)
119{ 119{
120 struct inode *inode = filp->f_path.dentry->d_inode;
120 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
121 122
122 switch (cmd) { 123 switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
129 } 130 }
130} 131}
131 132
133#ifdef CONFIG_COMPAT
134static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg)
136
137{
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139}
140#endif
141
132static int fat_file_release(struct inode *inode, struct file *filp) 142static int fat_file_release(struct inode *inode, struct file *filp)
133{ 143{
134 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
@@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
139 return 0; 149 return 0;
140} 150}
141 151
142int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 152int fat_file_fsync(struct file *filp, int datasync)
143{ 153{
144 struct inode *inode = dentry->d_inode; 154 struct inode *inode = filp->f_mapping->host;
145 int res, err; 155 int res, err;
146 156
147 res = simple_fsync(filp, dentry, datasync); 157 res = generic_file_fsync(filp, datasync);
148 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
149 159
150 return res ? res : err; 160 return res ? res : err;
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
159 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
160 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
161 .release = fat_file_release, 171 .release = fat_file_release,
162 .ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173#ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl,
175#endif
163 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
164 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
165}; 178};
@@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
270 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
271} 284}
272 285
273void fat_truncate(struct inode *inode) 286void fat_truncate_blocks(struct inode *inode, loff_t offset)
274{ 287{
275 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
276 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
@@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode)
280 * This protects against truncating a file bigger than it was then 293 * This protects against truncating a file bigger than it was then
281 * trying to write into the hole. 294 * trying to write into the hole.
282 */ 295 */
283 if (MSDOS_I(inode)->mmu_private > inode->i_size) 296 if (MSDOS_I(inode)->mmu_private > offset)
284 MSDOS_I(inode)->mmu_private = inode->i_size; 297 MSDOS_I(inode)->mmu_private = offset;
285 298
286 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
287 300
288 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
289 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -362,20 +375,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
362 unsigned int ia_valid; 375 unsigned int ia_valid;
363 int error; 376 int error;
364 377
365 /*
366 * Expand the file. Since inode_setattr() updates ->i_size
367 * before calling the ->truncate(), but FAT needs to fill the
368 * hole before it.
369 */
370 if (attr->ia_valid & ATTR_SIZE) {
371 if (attr->ia_size > inode->i_size) {
372 error = fat_cont_expand(inode, attr->ia_size);
373 if (error || attr->ia_valid == ATTR_SIZE)
374 goto out;
375 attr->ia_valid &= ~ATTR_SIZE;
376 }
377 }
378
379 /* Check for setting the inode time. */ 378 /* Check for setting the inode time. */
380 ia_valid = attr->ia_valid; 379 ia_valid = attr->ia_valid;
381 if (ia_valid & TIMES_SET_FLAGS) { 380 if (ia_valid & TIMES_SET_FLAGS) {
@@ -391,6 +390,21 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
391 goto out; 390 goto out;
392 } 391 }
393 392
393 /*
394 * Expand the file. Since inode_setattr() updates ->i_size
395 * before calling the ->truncate(), but FAT needs to fill the
396 * hole before it. XXX: this is no longer true with new truncate
397 * sequence.
398 */
399 if (attr->ia_valid & ATTR_SIZE) {
400 if (attr->ia_size > inode->i_size) {
401 error = fat_cont_expand(inode, attr->ia_size);
402 if (error || attr->ia_valid == ATTR_SIZE)
403 goto out;
404 attr->ia_valid &= ~ATTR_SIZE;
405 }
406 }
407
394 if (((attr->ia_valid & ATTR_UID) && 408 if (((attr->ia_valid & ATTR_UID) &&
395 (attr->ia_uid != sbi->options.fs_uid)) || 409 (attr->ia_uid != sbi->options.fs_uid)) ||
396 ((attr->ia_valid & ATTR_GID) && 410 ((attr->ia_valid & ATTR_GID) &&
@@ -414,15 +428,19 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
414 attr->ia_valid &= ~ATTR_MODE; 428 attr->ia_valid &= ~ATTR_MODE;
415 } 429 }
416 430
417 if (attr->ia_valid) 431 if (attr->ia_valid & ATTR_SIZE) {
418 error = inode_setattr(inode, attr); 432 truncate_setsize(inode, attr->ia_size);
433 fat_truncate_blocks(inode, attr->ia_size);
434 }
435
436 setattr_copy(inode, attr);
437 mark_inode_dirty(inode);
419out: 438out:
420 return error; 439 return error;
421} 440}
422EXPORT_SYMBOL_GPL(fat_setattr); 441EXPORT_SYMBOL_GPL(fat_setattr);
423 442
424const struct inode_operations fat_file_inode_operations = { 443const struct inode_operations fat_file_inode_operations = {
425 .truncate = fat_truncate,
426 .setattr = fat_setattr, 444 .setattr = fat_setattr,
427 .getattr = fat_getattr, 445 .getattr = fat_getattr,
428}; 446};
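
For orientation, this is the shape a ->setattr() takes after the 2.6.36 truncate-sequence conversion. A sketch, not the patch itself: example_truncate_blocks() stands in for the filesystem's block-freeing helper (fat_truncate_blocks() here), and error handling is trimmed:

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);	/* validate the request */
		if (error)
			return error;

		if (attr->ia_valid & ATTR_SIZE) {
			/* update i_size and trim the pagecache in one step... */
			truncate_setsize(inode, attr->ia_size);
			/* ...then release on-disk blocks past the new size */
			example_truncate_blocks(inode, attr->ia_size);
		}

		setattr_copy(inode, attr);	/* uid/gid/mode/timestamps */
		mark_inode_dirty(inode);
		return 0;
	}

With this sequence in place, inode_setattr() and the ->truncate method are no longer needed, which is why .truncate is dropped from fat_file_inode_operations above.
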
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..830058057d33 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block); 142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
143} 143}
144 144
145static void fat_write_failed(struct address_space *mapping, loff_t to)
146{
147 struct inode *inode = mapping->host;
148
149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size);
152 }
153}
154
145static int fat_write_begin(struct file *file, struct address_space *mapping, 155static int fat_write_begin(struct file *file, struct address_space *mapping,
146 loff_t pos, unsigned len, unsigned flags, 156 loff_t pos, unsigned len, unsigned flags,
147 struct page **pagep, void **fsdata) 157 struct page **pagep, void **fsdata)
148{ 158{
159 int err;
160
149 *pagep = NULL; 161 *pagep = NULL;
150 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 162 err = cont_write_begin(file, mapping, pos, len, flags,
151 fat_get_block, 163 pagep, fsdata, fat_get_block,
152 &MSDOS_I(mapping->host)->mmu_private); 164 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0)
166 fat_write_failed(mapping, pos + len);
167 return err;
153} 168}
154 169
155static int fat_write_end(struct file *file, struct address_space *mapping, 170static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
159 struct inode *inode = mapping->host; 174 struct inode *inode = mapping->host;
160 int err; 175 int err;
161 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); 176 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
177 if (err < len)
178 fat_write_failed(mapping, pos + len);
162 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { 179 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
163 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 180 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
164 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 181 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
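
The new fat_write_failed() gives FAT a single rollback point for buffered writes that fail or come up short past i_size. The same pattern, sketched for a hypothetical filesystem; note the 2.6.36 truncate_pagecache() signature is (inode, old, new):

	static void example_write_failed(struct address_space *mapping, loff_t to)
	{
		struct inode *inode = mapping->host;

		if (to > inode->i_size) {
			/* drop pages instantiated beyond i_size... */
			truncate_pagecache(inode, to, inode->i_size);
			/* ...and free any blocks allocated for them */
			example_truncate_blocks(inode, inode->i_size);
		}
	}

All three write entry points funnel into it: write_begin on error, write_end on a short copy (err < len), and the direct-IO write path below on a negative return.
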
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
172 loff_t offset, unsigned long nr_segs) 189 loff_t offset, unsigned long nr_segs)
173{ 190{
174 struct file *file = iocb->ki_filp; 191 struct file *file = iocb->ki_filp;
175 struct inode *inode = file->f_mapping->host; 192 struct address_space *mapping = file->f_mapping;
193 struct inode *inode = mapping->host;
194 ssize_t ret;
176 195
177 if (rw == WRITE) { 196 if (rw == WRITE) {
178 /* 197 /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
193 * FAT needs to use DIO_LOCKING to avoid the race 212 * FAT needs to use DIO_LOCKING to avoid the race
194 * condition between fat_get_block() and ->truncate(). 213 * condition between fat_get_block() and ->truncate().
195 */ 214 */
196 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 215 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
197 offset, nr_segs, fat_get_block, NULL); 216 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219
220 return ret;
198} 221}
199 222
200static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 223static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -240,7 +263,7 @@ static const struct address_space_operations fat_aops = {
240 * check if the location is still valid and retry if it 263 * check if the location is still valid and retry if it
241 * isn't. Otherwise we do changes. 264 * isn't. Otherwise we do changes.
242 * 5. Spinlock is used to protect hash/unhash/location check/lookup 265 * 5. Spinlock is used to protect hash/unhash/location check/lookup
243 * 6. fat_clear_inode() unhashes the F-d-c entry. 266 * 6. fat_evict_inode() unhashes the F-d-c entry.
244 * 7. lookup() and readdir() do igrab() if they find a F-d-c entry 267 * 7. lookup() and readdir() do igrab() if they find a F-d-c entry
245 * and consider negative result as cache miss. 268 * and consider negative result as cache miss.
246 */ 269 */
@@ -425,16 +448,15 @@ out:
425 448
426EXPORT_SYMBOL_GPL(fat_build_inode); 449EXPORT_SYMBOL_GPL(fat_build_inode);
427 450
428static void fat_delete_inode(struct inode *inode) 451static void fat_evict_inode(struct inode *inode)
429{ 452{
430 truncate_inode_pages(&inode->i_data, 0); 453 truncate_inode_pages(&inode->i_data, 0);
431 inode->i_size = 0; 454 if (!inode->i_nlink) {
432 fat_truncate(inode); 455 inode->i_size = 0;
433 clear_inode(inode); 456 fat_truncate_blocks(inode, 0);
434} 457 }
435 458 invalidate_inode_buffers(inode);
436static void fat_clear_inode(struct inode *inode) 459 end_writeback(inode);
437{
438 fat_cache_inval_inode(inode); 460 fat_cache_inval_inode(inode);
439 fat_detach(inode); 461 fat_detach(inode);
440} 462}
@@ -651,12 +673,11 @@ static const struct super_operations fat_sops = {
651 .alloc_inode = fat_alloc_inode, 673 .alloc_inode = fat_alloc_inode,
652 .destroy_inode = fat_destroy_inode, 674 .destroy_inode = fat_destroy_inode,
653 .write_inode = fat_write_inode, 675 .write_inode = fat_write_inode,
654 .delete_inode = fat_delete_inode, 676 .evict_inode = fat_evict_inode,
655 .put_super = fat_put_super, 677 .put_super = fat_put_super,
656 .write_super = fat_write_super, 678 .write_super = fat_write_super,
657 .sync_fs = fat_sync_fs, 679 .sync_fs = fat_sync_fs,
658 .statfs = fat_statfs, 680 .statfs = fat_statfs,
659 .clear_inode = fat_clear_inode,
660 .remount_fs = fat_remount, 681 .remount_fs = fat_remount,
661 682
662 .show_options = fat_show_options, 683 .show_options = fat_show_options,
@@ -1250,6 +1271,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1250 sb->s_op = &fat_sops; 1271 sb->s_op = &fat_sops;
1251 sb->s_export_op = &fat_export_ops; 1272 sb->s_export_op = &fat_export_ops;
1252 sbi->dir_ops = fs_dir_inode_ops; 1273 sbi->dir_ops = fs_dir_inode_ops;
1274 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1275 DEFAULT_RATELIMIT_BURST);
1253 1276
1254 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1277 error = parse_options(data, isvfat, silent, &debug, &sbi->options);
1255 if (error) 1278 if (error)
@@ -1497,10 +1520,8 @@ out_fail:
1497 iput(fat_inode); 1520 iput(fat_inode);
1498 if (root_inode) 1521 if (root_inode)
1499 iput(root_inode); 1522 iput(root_inode);
1500 if (sbi->nls_io) 1523 unload_nls(sbi->nls_io);
1501 unload_nls(sbi->nls_io); 1524 unload_nls(sbi->nls_disk);
1502 if (sbi->nls_disk)
1503 unload_nls(sbi->nls_disk);
1504 if (sbi->options.iocharset != fat_default_iocharset) 1525 if (sbi->options.iocharset != fat_default_iocharset)
1505 kfree(sbi->options.iocharset); 1526 kfree(sbi->options.iocharset);
1506 sb->s_fs_info = NULL; 1527 sb->s_fs_info = NULL;
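
The out_fail cleanup can drop its NULL checks because unload_nls() tolerates a NULL table; as of this kernel generation it is essentially:

	void unload_nls(struct nls_table *nls)
	{
		if (nls)
			module_put(nls->owner);
	}

Making teardown helpers NULL-safe like this lets error paths call them unconditionally instead of mirroring every allocation with an if.
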
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1736f2356388 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void fat_fs_error(struct super_block *s, const char *fmt, ...) 23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
26 va_list args; 26 va_list args;
27 27
28 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); 28 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
29 30
30 printk(KERN_ERR " "); 31 printk(KERN_ERR " ");
31 va_start(args, fmt); 32 va_start(args, fmt);
32 vprintk(fmt, args); 33 vprintk(fmt, args);
33 va_end(args); 34 va_end(args);
34 printk("\n"); 35 printk("\n");
36 }
35 37
36 if (opts->errors == FAT_ERRORS_PANIC) 38 if (opts->errors == FAT_ERRORS_PANIC)
37 panic(" FAT fs panic from previous error\n"); 39 panic("FAT: fs panic from previous error\n");
38 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
39 s->s_flags |= MS_RDONLY; 41 s->s_flags |= MS_RDONLY;
40 printk(KERN_ERR " File system has been set read-only\n"); 42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
41 } 43 }
42} 44}
43EXPORT_SYMBOL_GPL(fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
44 46
45/* Flushes the number of free clusters on FAT32 */ 47/* Flushes the number of free clusters on FAT32 */
46/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
@@ -248,7 +250,9 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
248{ 250{
249 int i, err = 0; 251 int i, err = 0;
250 252
251 ll_rw_block(SWRITE, nr_bhs, bhs); 253 for (i = 0; i < nr_bhs; i++)
254 write_dirty_buffer(bhs[i], WRITE);
255
252 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
253 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
254 if (buffer_eopnotsupp(bhs[i])) { 258 if (buffer_eopnotsupp(bhs[i])) {
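
SWRITE is on its way out in this release, so syncing a batch of buffer heads becomes an explicit submit-then-wait loop. A sketch of the resulting idiom, assuming the 2.6.36 write_dirty_buffer(), which locks the bh, clears its dirty bit and submits it:

	static int example_sync_bhs(struct buffer_head **bhs, int nr_bhs)
	{
		int i, err = 0;

		for (i = 0; i < nr_bhs; i++)
			write_dirty_buffer(bhs[i], WRITE);	/* submit all */

		for (i = 0; i < nr_bhs; i++) {			/* then wait */
			wait_on_buffer(bhs[i]);
			if (!err && !buffer_uptodate(bhs[i]))
				err = -EIO;
		}
		return err;
	}
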
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..f8cc34f542c3 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -273,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
273 274
274 ret = copy_from_user(&owner, owner_p, sizeof(owner)); 275 ret = copy_from_user(&owner, owner_p, sizeof(owner));
275 if (ret) 276 if (ret)
276 return ret; 277 return -EFAULT;
277 278
278 switch (owner.type) { 279 switch (owner.type) {
279 case F_OWNER_TID: 280 case F_OWNER_TID:
@@ -331,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
331 } 332 }
332 read_unlock(&filp->f_owner.lock); 333 read_unlock(&filp->f_owner.lock);
333 334
334 if (!ret) 335 if (!ret) {
335 ret = copy_to_user(owner_p, &owner, sizeof(owner)); 336 ret = copy_to_user(owner_p, &owner, sizeof(owner));
337 if (ret)
338 ret = -EFAULT;
339 }
336 return ret; 340 return ret;
337} 341}
338 342
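
Both fixes above rest on the same contract: copy_from_user()/copy_to_user() return the number of bytes left uncopied, never an errno, so the caller must translate a nonzero result itself. A minimal sketch:

	static int example_get_owner(struct f_owner_ex __user *uarg)
	{
		struct f_owner_ex owner = { .type = F_OWNER_PID };

		if (copy_to_user(uarg, &owner, sizeof(owner)))
			return -EFAULT;	/* nonzero = bytes not copied, not -Exxx */
		return 0;
	}
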
@@ -412,6 +416,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 416 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 417 err = fcntl_dirnotify(fd, filp, arg);
414 break; 418 break;
419 case F_SETPIPE_SZ:
420 case F_GETPIPE_SZ:
421 err = pipe_fcntl(filp, cmd, arg);
422 break;
415 default: 423 default:
416 break; 424 break;
417 } 425 }
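
F_SETPIPE_SZ and F_GETPIPE_SZ are new fcntl commands in this release. From userspace they look like any other fcntl call; a sketch, assuming a glibc new enough to expose the constants under _GNU_SOURCE (the kernel rounds the request up to a power-of-two number of pages, and unprivileged callers are capped by /proc/sys/fs/pipe-max-size):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];

		if (pipe(fds) < 0)
			return 1;
		if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) < 0)	/* ask for 1 MiB */
			perror("F_SETPIPE_SZ");
		printf("pipe capacity: %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
		return 0;
	}
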
@@ -614,9 +622,15 @@ int send_sigurg(struct fown_struct *fown)
614 return ret; 622 return ret;
615} 623}
616 624
617static DEFINE_RWLOCK(fasync_lock); 625static DEFINE_SPINLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 626static struct kmem_cache *fasync_cache __read_mostly;
619 627
628static void fasync_free_rcu(struct rcu_head *head)
629{
630 kmem_cache_free(fasync_cache,
631 container_of(head, struct fasync_struct, fa_rcu));
632}
633
620/* 634/*
621 * Remove a fasync entry. If successfully removed, return 635 * Remove a fasync entry. If successfully removed, return
622 * positive and clear the FASYNC flag. If no entry exists, 636 * positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +639,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
625 * NOTE! It is very important that the FASYNC flag always 639 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list". 640 * match the state "is the filp on a fasync list".
627 * 641 *
628 * We always take the 'filp->f_lock', in since fasync_lock
629 * needs to be irq-safe.
630 */ 642 */
631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 643static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
632{ 644{
@@ -634,17 +646,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
634 int result = 0; 646 int result = 0;
635 647
636 spin_lock(&filp->f_lock); 648 spin_lock(&filp->f_lock);
637 write_lock_irq(&fasync_lock); 649 spin_lock(&fasync_lock);
638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 650 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
639 if (fa->fa_file != filp) 651 if (fa->fa_file != filp)
640 continue; 652 continue;
653
654 spin_lock_irq(&fa->fa_lock);
655 fa->fa_file = NULL;
656 spin_unlock_irq(&fa->fa_lock);
657
641 *fp = fa->fa_next; 658 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa); 659 call_rcu(&fa->fa_rcu, fasync_free_rcu);
643 filp->f_flags &= ~FASYNC; 660 filp->f_flags &= ~FASYNC;
644 result = 1; 661 result = 1;
645 break; 662 break;
646 } 663 }
647 write_unlock_irq(&fasync_lock); 664 spin_unlock(&fasync_lock);
648 spin_unlock(&filp->f_lock); 665 spin_unlock(&filp->f_lock);
649 return result; 666 return result;
650} 667}
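
The fasync list is being converted from an rwlock to RCU here. The deferred-free half of that idiom, sketched with a hypothetical entry type: unlink under the writer-side lock, then let call_rcu() free the node only after every pre-existing reader has finished.

	struct example_entry {
		struct example_entry	*next;
		struct rcu_head		rcu;	/* embedded, like fa_rcu above */
	};

	static void example_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct example_entry, rcu));
	}

	/* caller holds the writer-side lock protecting the list */
	static void example_del(struct example_entry **pprev,
				struct example_entry *e)
	{
		*pprev = e->next;		/* readers may still see e */
		call_rcu(&e->rcu, example_free_rcu); /* freed after grace period */
	}
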
@@ -666,25 +683,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
666 return -ENOMEM; 683 return -ENOMEM;
667 684
668 spin_lock(&filp->f_lock); 685 spin_lock(&filp->f_lock);
669 write_lock_irq(&fasync_lock); 686 spin_lock(&fasync_lock);
670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 687 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
671 if (fa->fa_file != filp) 688 if (fa->fa_file != filp)
672 continue; 689 continue;
690
691 spin_lock_irq(&fa->fa_lock);
673 fa->fa_fd = fd; 692 fa->fa_fd = fd;
693 spin_unlock_irq(&fa->fa_lock);
694
674 kmem_cache_free(fasync_cache, new); 695 kmem_cache_free(fasync_cache, new);
675 goto out; 696 goto out;
676 } 697 }
677 698
699 spin_lock_init(&new->fa_lock);
678 new->magic = FASYNC_MAGIC; 700 new->magic = FASYNC_MAGIC;
679 new->fa_file = filp; 701 new->fa_file = filp;
680 new->fa_fd = fd; 702 new->fa_fd = fd;
681 new->fa_next = *fapp; 703 new->fa_next = *fapp;
682 *fapp = new; 704 rcu_assign_pointer(*fapp, new);
683 result = 1; 705 result = 1;
684 filp->f_flags |= FASYNC; 706 filp->f_flags |= FASYNC;
685 707
686out: 708out:
687 write_unlock_irq(&fasync_lock); 709 spin_unlock(&fasync_lock);
688 spin_unlock(&filp->f_lock); 710 spin_unlock(&filp->f_lock);
689 return result; 711 return result;
690} 712}
@@ -704,46 +726,67 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
704 726
705EXPORT_SYMBOL(fasync_helper); 727EXPORT_SYMBOL(fasync_helper);
706 728
707void __kill_fasync(struct fasync_struct *fa, int sig, int band) 729/*
730 * rcu_read_lock() is held
731 */
732static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
708{ 733{
709 while (fa) { 734 while (fa) {
710 struct fown_struct * fown; 735 struct fown_struct *fown;
736 unsigned long flags;
737
711 if (fa->magic != FASYNC_MAGIC) { 738 if (fa->magic != FASYNC_MAGIC) {
712 printk(KERN_ERR "kill_fasync: bad magic number in " 739 printk(KERN_ERR "kill_fasync: bad magic number in "
713 "fasync_struct!\n"); 740 "fasync_struct!\n");
714 return; 741 return;
715 } 742 }
716 fown = &fa->fa_file->f_owner; 743 spin_lock_irqsave(&fa->fa_lock, flags);
717 /* Don't send SIGURG to processes which have not set a 744 if (fa->fa_file) {
718 queued signum: SIGURG has its own default signalling 745 fown = &fa->fa_file->f_owner;
719 mechanism. */ 746 /* Don't send SIGURG to processes which have not set a
720 if (!(sig == SIGURG && fown->signum == 0)) 747 queued signum: SIGURG has its own default signalling
721 send_sigio(fown, fa->fa_fd, band); 748 mechanism. */
722 fa = fa->fa_next; 749 if (!(sig == SIGURG && fown->signum == 0))
750 send_sigio(fown, fa->fa_fd, band);
751 }
752 spin_unlock_irqrestore(&fa->fa_lock, flags);
753 fa = rcu_dereference(fa->fa_next);
723 } 754 }
724} 755}
725 756
726EXPORT_SYMBOL(__kill_fasync);
727
728void kill_fasync(struct fasync_struct **fp, int sig, int band) 757void kill_fasync(struct fasync_struct **fp, int sig, int band)
729{ 758{
730 /* First a quick test without locking: usually 759 /* First a quick test without locking: usually
731 * the list is empty. 760 * the list is empty.
732 */ 761 */
733 if (*fp) { 762 if (*fp) {
734 read_lock(&fasync_lock); 763 rcu_read_lock();
735 /* reread *fp after obtaining the lock */ 764 kill_fasync_rcu(rcu_dereference(*fp), sig, band);
736 __kill_fasync(*fp, sig, band); 765 rcu_read_unlock();
737 read_unlock(&fasync_lock);
738 } 766 }
739} 767}
740EXPORT_SYMBOL(kill_fasync); 768EXPORT_SYMBOL(kill_fasync);
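
The reader side mirrors kill_fasync() above: a sketch of walking an RCU-protected singly linked list, reusing the hypothetical example_entry from earlier:

	static void example_walk(struct example_entry **headp)
	{
		struct example_entry *e;

		rcu_read_lock();		/* pins a grace period */
		for (e = rcu_dereference(*headp); e;
		     e = rcu_dereference(e->next)) {
			/* fields that writers mutate in place still need
			 * their own spinlock, as fa_lock shows above */
		}
		rcu_read_unlock();
	}
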
741 769
742static int __init fasync_init(void) 770static int __init fcntl_init(void)
743{ 771{
772 /*
773 * Please add new bits here to ensure allocation uniqueness.
774 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
775 * is defined as O_NONBLOCK on some platforms and not on others.
776 */
777 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
778 O_RDONLY | O_WRONLY | O_RDWR |
779 O_CREAT | O_EXCL | O_NOCTTY |
780 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
781 __O_SYNC | O_DSYNC | FASYNC |
782 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
783 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
784 FMODE_EXEC
785 ));
786
744 fasync_cache = kmem_cache_create("fasync_cache", 787 fasync_cache = kmem_cache_create("fasync_cache",
745 sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); 788 sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
746 return 0; 789 return 0;
747} 790}
748 791
749module_init(fasync_init) 792module_init(fcntl_init)
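
The renamed fcntl_init() also gains a compile-time check: OR all the O_* flags together and compare the popcount with the flag count, so two flags sharing a bit break the build instead of silently aliasing. The same trick on hypothetical flags:

	#define EX_FLAG_A	0x01
	#define EX_FLAG_B	0x02
	#define EX_FLAG_C	0x04

	static int __init example_init(void)
	{
		/* 3 flags must occupy 3 distinct bits; redefining EX_FLAG_C
		 * as 0x02 would make HWEIGHT32() return 2 and fail the build */
		BUILD_BUG_ON(3 != HWEIGHT32(EX_FLAG_A | EX_FLAG_B | EX_FLAG_C));
		return 0;
	}
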
diff --git a/fs/file.c b/fs/file.c
index 34bb7f71d994..0be344755c02 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -39,28 +39,27 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */
39 */ 39 */
40static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); 40static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
41 41
42static inline void * alloc_fdmem(unsigned int size) 42static inline void *alloc_fdmem(unsigned int size)
43{ 43{
44 if (size <= PAGE_SIZE) 44 void *data;
45 return kmalloc(size, GFP_KERNEL); 45
46 else 46 data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
47 return vmalloc(size); 47 if (data != NULL)
48 return data;
49
50 return vmalloc(size);
48} 51}
49 52
50static inline void free_fdarr(struct fdtable *fdt) 53static void free_fdmem(void *ptr)
51{ 54{
52 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) 55 is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
53 kfree(fdt->fd);
54 else
55 vfree(fdt->fd);
56} 56}
57 57
58static inline void free_fdset(struct fdtable *fdt) 58static void __free_fdtable(struct fdtable *fdt)
59{ 59{
60 if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2)) 60 free_fdmem(fdt->fd);
61 kfree(fdt->open_fds); 61 free_fdmem(fdt->open_fds);
62 else 62 kfree(fdt);
63 vfree(fdt->open_fds);
64} 63}
65 64
66static void free_fdtable_work(struct work_struct *work) 65static void free_fdtable_work(struct work_struct *work)
@@ -75,9 +74,8 @@ static void free_fdtable_work(struct work_struct *work)
75 spin_unlock_bh(&f->lock); 74 spin_unlock_bh(&f->lock);
76 while(fdt) { 75 while(fdt) {
77 struct fdtable *next = fdt->next; 76 struct fdtable *next = fdt->next;
78 vfree(fdt->fd); 77
79 free_fdset(fdt); 78 __free_fdtable(fdt);
80 kfree(fdt);
81 fdt = next; 79 fdt = next;
82 } 80 }
83} 81}
@@ -98,7 +96,7 @@ void free_fdtable_rcu(struct rcu_head *rcu)
98 container_of(fdt, struct files_struct, fdtab)); 96 container_of(fdt, struct files_struct, fdtab));
99 return; 97 return;
100 } 98 }
101 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) { 99 if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
102 kfree(fdt->fd); 100 kfree(fdt->fd);
103 kfree(fdt->open_fds); 101 kfree(fdt->open_fds);
104 kfree(fdt); 102 kfree(fdt);
@@ -178,13 +176,12 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
178 fdt->open_fds = (fd_set *)data; 176 fdt->open_fds = (fd_set *)data;
179 data += nr / BITS_PER_BYTE; 177 data += nr / BITS_PER_BYTE;
180 fdt->close_on_exec = (fd_set *)data; 178 fdt->close_on_exec = (fd_set *)data;
181 INIT_RCU_HEAD(&fdt->rcu);
182 fdt->next = NULL; 179 fdt->next = NULL;
183 180
184 return fdt; 181 return fdt;
185 182
186out_arr: 183out_arr:
187 free_fdarr(fdt); 184 free_fdmem(fdt->fd);
188out_fdt: 185out_fdt:
189 kfree(fdt); 186 kfree(fdt);
190out: 187out:
@@ -214,9 +211,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
214 * caller and alloc_fdtable(). Cheaper to catch it here... 211 * caller and alloc_fdtable(). Cheaper to catch it here...
215 */ 212 */
216 if (unlikely(new_fdt->max_fds <= nr)) { 213 if (unlikely(new_fdt->max_fds <= nr)) {
217 free_fdarr(new_fdt); 214 __free_fdtable(new_fdt);
218 free_fdset(new_fdt);
219 kfree(new_fdt);
220 return -EMFILE; 215 return -EMFILE;
221 } 216 }
222 /* 217 /*
@@ -232,9 +227,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
232 free_fdtable(cur_fdt); 227 free_fdtable(cur_fdt);
233 } else { 228 } else {
234 /* Somebody else expanded, so undo our attempt */ 229 /* Somebody else expanded, so undo our attempt */
235 free_fdarr(new_fdt); 230 __free_fdtable(new_fdt);
236 free_fdset(new_fdt);
237 kfree(new_fdt);
238 } 231 }
239 return 1; 232 return 1;
240} 233}
@@ -312,7 +305,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
312 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 305 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
313 new_fdt->open_fds = (fd_set *)&newf->open_fds_init; 306 new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
314 new_fdt->fd = &newf->fd_array[0]; 307 new_fdt->fd = &newf->fd_array[0];
315 INIT_RCU_HEAD(&new_fdt->rcu);
316 new_fdt->next = NULL; 308 new_fdt->next = NULL;
317 309
318 spin_lock(&oldf->file_lock); 310 spin_lock(&oldf->file_lock);
@@ -325,11 +317,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
325 while (unlikely(open_files > new_fdt->max_fds)) { 317 while (unlikely(open_files > new_fdt->max_fds)) {
326 spin_unlock(&oldf->file_lock); 318 spin_unlock(&oldf->file_lock);
327 319
328 if (new_fdt != &newf->fdtab) { 320 if (new_fdt != &newf->fdtab)
329 free_fdarr(new_fdt); 321 __free_fdtable(new_fdt);
330 free_fdset(new_fdt);
331 kfree(new_fdt);
332 }
333 322
334 new_fdt = alloc_fdtable(open_files - 1); 323 new_fdt = alloc_fdtable(open_files - 1);
335 if (!new_fdt) { 324 if (!new_fdt) {
@@ -339,9 +328,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
339 328
340 /* beyond sysctl_nr_open; nothing to do */ 329 /* beyond sysctl_nr_open; nothing to do */
341 if (unlikely(new_fdt->max_fds < open_files)) { 330 if (unlikely(new_fdt->max_fds < open_files)) {
342 free_fdarr(new_fdt); 331 __free_fdtable(new_fdt);
343 free_fdset(new_fdt);
344 kfree(new_fdt);
345 *errorp = -EMFILE; 332 *errorp = -EMFILE;
346 goto out_release; 333 goto out_release;
347 } 334 }
@@ -430,7 +417,6 @@ struct files_struct init_files = {
430 .fd = &init_files.fd_array[0], 417 .fd = &init_files.fd_array[0],
431 .close_on_exec = (fd_set *)&init_files.close_on_exec_init, 418 .close_on_exec = (fd_set *)&init_files.close_on_exec_init,
432 .open_fds = (fd_set *)&init_files.open_fds_init, 419 .open_fds = (fd_set *)&init_files.open_fds_init,
433 .rcu = RCU_HEAD_INIT,
434 }, 420 },
435 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 421 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
436}; 422};
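
alloc_fdmem()/free_fdmem() above replace a size-threshold policy with an attempt-based one. The generic shape of that pattern, as a sketch; __GFP_NOWARN suppresses the allocation-failure backtrace since vmalloc() is an expected fallback, not an error:

	static void *example_alloc(size_t size)
	{
		void *p = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

		return p ? p : vmalloc(size);	/* large or fragmented sizes */
	}

	static void example_free(void *p)
	{
		if (is_vmalloc_addr(p))		/* pick the matching free */
			vfree(p);
		else
			kfree(p);
	}

This is also why free_fdtable_rcu() can simply test is_vmalloc_addr() instead of re-deriving the original size threshold.
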
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..a04bdd81c11c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
20#include <linux/cdev.h> 20#include <linux/cdev.h>
21#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
22#include <linux/sysctl.h> 22#include <linux/sysctl.h>
23#include <linux/lglock.h>
23#include <linux/percpu_counter.h> 24#include <linux/percpu_counter.h>
25#include <linux/percpu.h>
24#include <linux/ima.h> 26#include <linux/ima.h>
25 27
26#include <asm/atomic.h> 28#include <asm/atomic.h>
@@ -32,8 +34,8 @@ struct files_stat_struct files_stat = {
32 .max_files = NR_FILE 34 .max_files = NR_FILE
33}; 35};
34 36
35/* public. Not pretty! */ 37DECLARE_LGLOCK(files_lglock);
36__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 38DEFINE_LGLOCK(files_lglock);
37 39
38/* SLAB cache for file structures */ 40/* SLAB cache for file structures */
39static struct kmem_cache *filp_cachep __read_mostly; 41static struct kmem_cache *filp_cachep __read_mostly;
@@ -194,14 +196,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
194} 196}
195EXPORT_SYMBOL(alloc_file); 197EXPORT_SYMBOL(alloc_file);
196 198
197void fput(struct file *file)
198{
199 if (atomic_long_dec_and_test(&file->f_count))
200 __fput(file);
201}
202
203EXPORT_SYMBOL(fput);
204
205/** 199/**
206 * drop_file_write_access - give up ability to write to a file 200 * drop_file_write_access - give up ability to write to a file
207 * @file: the file to which we will stop writing 201 * @file: the file to which we will stop writing
@@ -227,10 +221,9 @@ void drop_file_write_access(struct file *file)
227} 221}
228EXPORT_SYMBOL_GPL(drop_file_write_access); 222EXPORT_SYMBOL_GPL(drop_file_write_access);
229 223
230/* __fput is called from task context when aio completion releases the last 224/* the real guts of fput() - releasing the last reference to file
231 * last use of a struct file *. Do not use otherwise.
232 */ 225 */
233void __fput(struct file *file) 226static void __fput(struct file *file)
234{ 227{
235 struct dentry *dentry = file->f_path.dentry; 228 struct dentry *dentry = file->f_path.dentry;
236 struct vfsmount *mnt = file->f_path.mnt; 229 struct vfsmount *mnt = file->f_path.mnt;
@@ -258,7 +251,7 @@ void __fput(struct file *file)
258 cdev_put(inode->i_cdev); 251 cdev_put(inode->i_cdev);
259 fops_put(file->f_op); 252 fops_put(file->f_op);
260 put_pid(file->f_owner.pid); 253 put_pid(file->f_owner.pid);
261 file_kill(file); 254 file_sb_list_del(file);
262 if (file->f_mode & FMODE_WRITE) 255 if (file->f_mode & FMODE_WRITE)
263 drop_file_write_access(file); 256 drop_file_write_access(file);
264 file->f_path.dentry = NULL; 257 file->f_path.dentry = NULL;
@@ -268,6 +261,14 @@ void __fput(struct file *file)
268 mntput(mnt); 261 mntput(mnt);
269} 262}
270 263
264void fput(struct file *file)
265{
266 if (atomic_long_dec_and_test(&file->f_count))
267 __fput(file);
268}
269
270EXPORT_SYMBOL(fput);
271
271struct file *fget(unsigned int fd) 272struct file *fget(unsigned int fd)
272{ 273{
273 struct file *file; 274 struct file *file;
@@ -290,11 +291,20 @@ struct file *fget(unsigned int fd)
290EXPORT_SYMBOL(fget); 291EXPORT_SYMBOL(fget);
291 292
292/* 293/*
293 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 294 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
294 * You can use this only if it is guranteed that the current task already 295 *
295 * holds a refcnt to that file. That check has to be done at fget() only 296 * You can use this instead of fget if you satisfy all of the following
296 * and a flag is returned to be passed to the corresponding fput_light(). 297 * conditions:
297 * There must not be a cloning between an fget_light/fput_light pair. 298 * 1) You must call fput_light before exiting the syscall and returning control
299 * to userspace (i.e. you cannot remember the returned struct file * after
300 * returning to userspace).
301 * 2) You must not call filp_close on the returned struct file * in between
302 * calls to fget_light and fput_light.
303 * 3) You must not clone the current task in between the calls to fget_light
304 * and fput_light.
305 *
306 * The fput_needed flag returned by fget_light should be passed to the
307 * corresponding fput_light.
298 */ 308 */
299struct file *fget_light(unsigned int fd, int *fput_needed) 309struct file *fget_light(unsigned int fd, int *fput_needed)
300{ 310{
@@ -320,41 +330,107 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
320 return file; 330 return file;
321} 331}
322 332
323
324void put_filp(struct file *file) 333void put_filp(struct file *file)
325{ 334{
326 if (atomic_long_dec_and_test(&file->f_count)) { 335 if (atomic_long_dec_and_test(&file->f_count)) {
327 security_file_free(file); 336 security_file_free(file);
328 file_kill(file); 337 file_sb_list_del(file);
329 file_free(file); 338 file_free(file);
330 } 339 }
331} 340}
332 341
333void file_move(struct file *file, struct list_head *list) 342static inline int file_list_cpu(struct file *file)
334{ 343{
335 if (!list) 344#ifdef CONFIG_SMP
336 return; 345 return file->f_sb_list_cpu;
337 file_list_lock(); 346#else
338 list_move(&file->f_u.fu_list, list); 347 return smp_processor_id();
339 file_list_unlock(); 348#endif
340} 349}
341 350
342void file_kill(struct file *file) 351/* helper for file_sb_list_add to reduce ifdefs */
352static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
353{
354 struct list_head *list;
355#ifdef CONFIG_SMP
356 int cpu;
357 cpu = smp_processor_id();
358 file->f_sb_list_cpu = cpu;
359 list = per_cpu_ptr(sb->s_files, cpu);
360#else
361 list = &sb->s_files;
362#endif
363 list_add(&file->f_u.fu_list, list);
364}
365
366/**
367 * file_sb_list_add - add a file to the sb's file list
368 * @file: file to add
369 * @sb: sb to add it to
370 *
371 * Use this function to associate a file with the superblock of the inode it
372 * refers to.
373 */
374void file_sb_list_add(struct file *file, struct super_block *sb)
375{
376 lg_local_lock(files_lglock);
377 __file_sb_list_add(file, sb);
378 lg_local_unlock(files_lglock);
379}
380
381/**
382 * file_sb_list_del - remove a file from the sb's file list
383 * @file: file to remove
384 * @sb: sb to remove it from
385 *
386 * Use this function to remove a file from its superblock.
387 */
388void file_sb_list_del(struct file *file)
343{ 389{
344 if (!list_empty(&file->f_u.fu_list)) { 390 if (!list_empty(&file->f_u.fu_list)) {
345 file_list_lock(); 391 lg_local_lock_cpu(files_lglock, file_list_cpu(file));
346 list_del_init(&file->f_u.fu_list); 392 list_del_init(&file->f_u.fu_list);
347 file_list_unlock(); 393 lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
348 } 394 }
349} 395}
350 396
397#ifdef CONFIG_SMP
398
399/*
400 * These macros iterate all files on all CPUs for a given superblock.
401 * files_lglock must be held globally.
402 */
403#define do_file_list_for_each_entry(__sb, __file) \
404{ \
405 int i; \
406 for_each_possible_cpu(i) { \
407 struct list_head *list; \
408 list = per_cpu_ptr((__sb)->s_files, i); \
409 list_for_each_entry((__file), list, f_u.fu_list)
410
411#define while_file_list_for_each_entry \
412 } \
413}
414
415#else
416
417#define do_file_list_for_each_entry(__sb, __file) \
418{ \
419 struct list_head *list; \
420 list = &(__sb)->s_files; \
421 list_for_each_entry((__file), list, f_u.fu_list)
422
423#define while_file_list_for_each_entry \
424}
425
426#endif
427
351int fs_may_remount_ro(struct super_block *sb) 428int fs_may_remount_ro(struct super_block *sb)
352{ 429{
353 struct file *file; 430 struct file *file;
354
355 /* Check that no files are currently opened for writing. */ 431 /* Check that no files are currently opened for writing. */
356 file_list_lock(); 432 lg_global_lock(files_lglock);
357 list_for_each_entry(file, &sb->s_files, f_u.fu_list) { 433 do_file_list_for_each_entry(sb, file) {
358 struct inode *inode = file->f_path.dentry->d_inode; 434 struct inode *inode = file->f_path.dentry->d_inode;
359 435
360 /* File with pending delete? */ 436 /* File with pending delete? */
@@ -364,11 +440,11 @@ int fs_may_remount_ro(struct super_block *sb)
364 /* Writeable file? */ 440 /* Writeable file? */
365 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) 441 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
366 goto too_bad; 442 goto too_bad;
367 } 443 } while_file_list_for_each_entry;
368 file_list_unlock(); 444 lg_global_unlock(files_lglock);
369 return 1; /* Tis' cool bro. */ 445 return 1; /* Tis' cool bro. */
370too_bad: 446too_bad:
371 file_list_unlock(); 447 lg_global_unlock(files_lglock);
372 return 0; 448 return 0;
373} 449}
374 450
@@ -384,8 +460,8 @@ void mark_files_ro(struct super_block *sb)
384 struct file *f; 460 struct file *f;
385 461
386retry: 462retry:
387 file_list_lock(); 463 lg_global_lock(files_lglock);
388 list_for_each_entry(f, &sb->s_files, f_u.fu_list) { 464 do_file_list_for_each_entry(sb, f) {
389 struct vfsmount *mnt; 465 struct vfsmount *mnt;
390 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) 466 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
391 continue; 467 continue;
@@ -400,16 +476,13 @@ retry:
400 continue; 476 continue;
401 file_release_write(f); 477 file_release_write(f);
402 mnt = mntget(f->f_path.mnt); 478 mnt = mntget(f->f_path.mnt);
403 file_list_unlock(); 479 /* This can sleep, so we can't hold the spinlock. */
404 /* 480 lg_global_unlock(files_lglock);
405 * This can sleep, so we can't hold
406 * the file_list_lock() spinlock.
407 */
408 mnt_drop_write(mnt); 481 mnt_drop_write(mnt);
409 mntput(mnt); 482 mntput(mnt);
410 goto retry; 483 goto retry;
411 } 484 } while_file_list_for_each_entry;
412 file_list_unlock(); 485 lg_global_unlock(files_lglock);
413} 486}
414 487
415void __init files_init(unsigned long mempages) 488void __init files_init(unsigned long mempages)
@@ -429,5 +502,6 @@ void __init files_init(unsigned long mempages)
429 if (files_stat.max_files < NR_FILE) 502 if (files_stat.max_files < NR_FILE)
430 files_stat.max_files = NR_FILE; 503 files_stat.max_files = NR_FILE;
431 files_defer_init(); 504 files_defer_init();
505 lg_lock_init(files_lglock);
432 percpu_counter_init(&nr_files, 0); 506 percpu_counter_init(&nr_files, 0);
433} 507}
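
files_lock becomes a "local/global" lock here. A sketch of the lglock API as introduced in this release; the macros take the lock name rather than a pointer, and lg_lock_init() must run once at startup, as files_init() does above:

	#include <linux/lglock.h>

	DECLARE_LGLOCK(example_lglock);
	DEFINE_LGLOCK(example_lglock);

	static void example_fast_path(void)
	{
		lg_local_lock(example_lglock);	/* this CPU's spinlock only */
		/* ... touch only this CPU's portion of the data ... */
		lg_local_unlock(example_lglock);
	}

	static void example_slow_path(void)
	{
		lg_global_lock(example_lglock);	/* take every CPU's lock */
		/* ... walk all per-CPU lists, as fs_may_remount_ro() does ... */
		lg_global_unlock(example_lglock);
	}

The fast path stays cache-local and scales with CPU count; rare global walks pay the full cost of acquiring every per-CPU lock.
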
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index 50ab5eecb99b..881aa3d217f0 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -63,7 +63,7 @@ extern void vxfs_put_fake_inode(struct inode *);
63extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t); 63extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t);
64extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t); 64extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t);
65extern struct inode * vxfs_iget(struct super_block *, ino_t); 65extern struct inode * vxfs_iget(struct super_block *, ino_t);
66extern void vxfs_clear_inode(struct inode *); 66extern void vxfs_evict_inode(struct inode *);
67 67
68/* vxfs_lookup.c */ 68/* vxfs_lookup.c */
69extern const struct inode_operations vxfs_dir_inode_ops; 69extern const struct inode_operations vxfs_dir_inode_ops;
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 03a6ea5e99f7..79d1b4ea13e7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -337,15 +337,17 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
337} 337}
338 338
339/** 339/**
340 * vxfs_clear_inode - remove inode from main memory 340 * vxfs_evict_inode - remove inode from main memory
341 * @ip: inode to discard. 341 * @ip: inode to discard.
342 * 342 *
343 * Description: 343 * Description:
344 * vxfs_clear_inode() is called on the final iput and frees the private 344 * vxfs_evict_inode() is called on the final iput and frees the private
345 * inode area. 345 * inode area.
346 */ 346 */
347void 347void
348vxfs_clear_inode(struct inode *ip) 348vxfs_evict_inode(struct inode *ip)
349{ 349{
350 truncate_inode_pages(&ip->i_data, 0);
351 end_writeback(ip);
350 kmem_cache_free(vxfs_inode_cachep, ip->i_private); 352 kmem_cache_free(vxfs_inode_cachep, ip->i_private);
351} 353}
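
vxfs follows the same clear_inode/delete_inode to evict_inode unification seen for FAT earlier in this diff. The contract a minimal ->evict_inode() must honour in 2.6.36, sketched for a hypothetical filesystem (example_truncate_blocks() is a stand-in):

	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);	/* always */
		if (!inode->i_nlink) {
			/* last link gone: the old delete_inode path */
			example_truncate_blocks(inode, 0);
		}
		invalidate_inode_buffers(inode);
		end_writeback(inode);		/* marks the inode I_CLEAR */
		/* old clear_inode work (freeing private state) goes here */
	}
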
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1e8af939b3e4..dc0c041e85cb 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -61,7 +61,7 @@ static int vxfs_statfs(struct dentry *, struct kstatfs *);
61static int vxfs_remount(struct super_block *, int *, char *); 61static int vxfs_remount(struct super_block *, int *, char *);
62 62
63static const struct super_operations vxfs_super_ops = { 63static const struct super_operations vxfs_super_ops = {
64 .clear_inode = vxfs_clear_inode, 64 .evict_inode = vxfs_evict_inode,
65 .put_super = vxfs_put_super, 65 .put_super = vxfs_put_super,
66 .statfs = vxfs_statfs, 66 .statfs = vxfs_statfs,
67 .remount_fs = vxfs_remount, 67 .remount_fs = vxfs_remount,
@@ -135,7 +135,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
135} 135}
136 136
137/** 137/**
138 * vxfs_read_super - read superblock into memory and initalize filesystem 138 * vxfs_read_super - read superblock into memory and initialize filesystem
139 * @sbp: VFS superblock (to fill) 139 * @sbp: VFS superblock (to fill)
140 * @dp: fs private mount data 140 * @dp: fs private mount data
141 * @silent: do not complain loudly when sth is wrong 141 * @silent: do not complain loudly when sth is wrong
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4dd..ab38fef1c9a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,62 +26,36 @@
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/tracepoint.h>
29#include "internal.h" 30#include "internal.h"
30 31
31#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
32
33/*
34 * We don't actually have pdflush, but this one is exported though /proc...
35 */
36int nr_pdflush_threads;
37
38/* 32/*
39 * Passed into wb_writeback(), essentially a subset of writeback_control 33 * Passed into wb_writeback(), essentially a subset of writeback_control
40 */ 34 */
41struct wb_writeback_args { 35struct wb_writeback_work {
42 long nr_pages; 36 long nr_pages;
43 struct super_block *sb; 37 struct super_block *sb;
44 enum writeback_sync_modes sync_mode; 38 enum writeback_sync_modes sync_mode;
45 int for_kupdate:1; 39 unsigned int for_kupdate:1;
46 int range_cyclic:1; 40 unsigned int range_cyclic:1;
47 int for_background:1; 41 unsigned int for_background:1;
48};
49 42
50/*
51 * Work items for the bdi_writeback threads
52 */
53struct bdi_work {
54 struct list_head list; /* pending work list */ 43 struct list_head list; /* pending work list */
55 struct rcu_head rcu_head; /* for RCU free/clear of work */ 44 struct completion *done; /* set if the caller waits */
56
57 unsigned long seen; /* threads that have seen this work */
58 atomic_t pending; /* number of threads still to do work */
59
60 struct wb_writeback_args args; /* writeback arguments */
61
62 unsigned long state; /* flag bits, see WS_* */
63}; 45};
64 46
65enum { 47/*
66 WS_USED_B = 0, 48 * Include the creation of the trace points after defining the
67 WS_ONSTACK_B, 49 * wb_writeback_work structure so that the definition remains local to this
68}; 50 * file.
69 51 */
70#define WS_USED (1 << WS_USED_B) 52#define CREATE_TRACE_POINTS
71#define WS_ONSTACK (1 << WS_ONSTACK_B) 53#include <trace/events/writeback.h>
72
73static inline bool bdi_work_on_stack(struct bdi_work *work)
74{
75 return test_bit(WS_ONSTACK_B, &work->state);
76}
77 54
78static inline void bdi_work_init(struct bdi_work *work, 55/*
79 struct wb_writeback_args *args) 56 * We don't actually have pdflush, but this one is exported though /proc...
80{ 57 */
81 INIT_RCU_HEAD(&work->rcu_head); 58int nr_pdflush_threads;
82 work->args = *args;
83 work->state = WS_USED;
84}
85 59
86/** 60/**
87 * writeback_in_progress - determine whether there is writeback in progress 61 * writeback_in_progress - determine whether there is writeback in progress
@@ -92,186 +66,94 @@ static inline void bdi_work_init(struct bdi_work *work,
92 */ 66 */
93int writeback_in_progress(struct backing_dev_info *bdi) 67int writeback_in_progress(struct backing_dev_info *bdi)
94{ 68{
95 return !list_empty(&bdi->work_list); 69 return test_bit(BDI_writeback_running, &bdi->state);
96}
97
98static void bdi_work_clear(struct bdi_work *work)
99{
100 clear_bit(WS_USED_B, &work->state);
101 smp_mb__after_clear_bit();
102 /*
103 * work can have disappeared at this point. bit waitq functions
104 * should be able to tolerate this, provided bdi_sched_wait does
105 * not dereference it's pointer argument.
106 */
107 wake_up_bit(&work->state, WS_USED_B);
108}
109
110static void bdi_work_free(struct rcu_head *head)
111{
112 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
113
114 if (!bdi_work_on_stack(work))
115 kfree(work);
116 else
117 bdi_work_clear(work);
118} 70}
119 71
120static void wb_work_complete(struct bdi_work *work) 72static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
121{ 73{
122 const enum writeback_sync_modes sync_mode = work->args.sync_mode; 74 struct super_block *sb = inode->i_sb;
123 int onstack = bdi_work_on_stack(work);
124
125 /*
126 * For allocated work, we can clear the done/seen bit right here.
127 * For on-stack work, we need to postpone both the clear and free
128 * to after the RCU grace period, since the stack could be invalidated
129 * as soon as bdi_work_clear() has done the wakeup.
130 */
131 if (!onstack)
132 bdi_work_clear(work);
133 if (sync_mode == WB_SYNC_NONE || onstack)
134 call_rcu(&work->rcu_head, bdi_work_free);
135}
136
137static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
138{
139 /*
140 * The caller has retrieved the work arguments from this work,
141 * drop our reference. If this is the last ref, delete and free it
142 */
143 if (atomic_dec_and_test(&work->pending)) {
144 struct backing_dev_info *bdi = wb->bdi;
145 75
146 spin_lock(&bdi->wb_lock); 76 if (strcmp(sb->s_type->name, "bdev") == 0)
147 list_del_rcu(&work->list); 77 return inode->i_mapping->backing_dev_info;
148 spin_unlock(&bdi->wb_lock);
149 78
150 wb_work_complete(work); 79 return sb->s_bdi;
151 }
152} 80}
153 81
154static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) 82static void bdi_queue_work(struct backing_dev_info *bdi,
83 struct wb_writeback_work *work)
155{ 84{
156 work->seen = bdi->wb_mask; 85 trace_writeback_queue(bdi, work);
157 BUG_ON(!work->seen);
158 atomic_set(&work->pending, bdi->wb_cnt);
159 BUG_ON(!bdi->wb_cnt);
160 86
161 /* 87 spin_lock_bh(&bdi->wb_lock);
162 * list_add_tail_rcu() contains the necessary barriers to 88 list_add_tail(&work->list, &bdi->work_list);
163 * make sure the above stores are seen before the item is 89 if (bdi->wb.task) {
164 * noticed on the list 90 wake_up_process(bdi->wb.task);
165 */ 91 } else {
166 spin_lock(&bdi->wb_lock); 92 /*
167 list_add_tail_rcu(&work->list, &bdi->work_list); 93 * The bdi thread isn't there, wake up the forker thread which
168 spin_unlock(&bdi->wb_lock); 94 * will create and run it.
169 95 */
170 /* 96 trace_writeback_nothread(bdi, work);
171 * If the default thread isn't there, make sure we add it. When
172 * it gets created and wakes up, we'll run this work.
173 */
174 if (unlikely(list_empty_careful(&bdi->wb_list)))
175 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
176 else {
177 struct bdi_writeback *wb = &bdi->wb;
178
179 if (wb->task)
180 wake_up_process(wb->task);
181 } 98 }
99 spin_unlock_bh(&bdi->wb_lock);
182} 100}
183 101
184/* 102static void
185 * Used for on-stack allocated work items. The caller needs to wait until 103__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
186 * the wb threads have acked the work before it's safe to continue. 104 bool range_cyclic, bool for_background)
187 */
188static void bdi_wait_on_work_clear(struct bdi_work *work)
189{
190 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
191 TASK_UNINTERRUPTIBLE);
192}
193
194static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
195 struct wb_writeback_args *args)
196{ 105{
197 struct bdi_work *work; 106 struct wb_writeback_work *work;
198 107
199 /* 108 /*
200 * This is WB_SYNC_NONE writeback, so if allocation fails just 109 * This is WB_SYNC_NONE writeback, so if allocation fails just
201 * wakeup the thread for old dirty data writeback 110 * wakeup the thread for old dirty data writeback
202 */ 111 */
203 work = kmalloc(sizeof(*work), GFP_ATOMIC); 112 work = kzalloc(sizeof(*work), GFP_ATOMIC);
204 if (work) { 113 if (!work) {
205 bdi_work_init(work, args); 114 if (bdi->wb.task) {
206 bdi_queue_work(bdi, work); 115 trace_writeback_nowork(bdi);
207 } else { 116 wake_up_process(bdi->wb.task);
208 struct bdi_writeback *wb = &bdi->wb; 117 }
209 118 return;
210 if (wb->task)
211 wake_up_process(wb->task);
212 } 119 }
120
121 work->sync_mode = WB_SYNC_NONE;
122 work->nr_pages = nr_pages;
123 work->range_cyclic = range_cyclic;
124 work->for_background = for_background;
125
126 bdi_queue_work(bdi, work);
213} 127}
214 128
215/** 129/**
216 * bdi_sync_writeback - start and wait for writeback 130 * bdi_start_writeback - start writeback
217 * @bdi: the backing device to write from 131 * @bdi: the backing device to write from
218 * @sb: write inodes from this super_block 132 * @nr_pages: the number of pages to write
219 * 133 *
220 * Description: 134 * Description:
221 * This does WB_SYNC_ALL data integrity writeback and waits for the 135 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
222 * IO to complete. Callers must hold the sb s_umount semaphore for 136 * started when this function returns, we make no guarantees on
223 * reading, to avoid having the super disappear before we are done. 137 * completion. Caller need not hold sb s_umount semaphore.
138 *
224 */ 139 */
225static void bdi_sync_writeback(struct backing_dev_info *bdi, 140void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
226 struct super_block *sb)
227{ 141{
228 struct wb_writeback_args args = { 142 __bdi_start_writeback(bdi, nr_pages, true, false);
229 .sb = sb,
230 .sync_mode = WB_SYNC_ALL,
231 .nr_pages = LONG_MAX,
232 .range_cyclic = 0,
233 };
234 struct bdi_work work;
235
236 bdi_work_init(&work, &args);
237 work.state |= WS_ONSTACK;
238
239 bdi_queue_work(bdi, &work);
240 bdi_wait_on_work_clear(&work);
241} 143}
242 144
243/** 145/**
244 * bdi_start_writeback - start writeback 146 * bdi_start_background_writeback - start background writeback
245 * @bdi: the backing device to write from 147 * @bdi: the backing device to write from
246 * @sb: write inodes from this super_block
247 * @nr_pages: the number of pages to write
248 * 148 *
249 * Description: 149 * Description:
250 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 150 * This does WB_SYNC_NONE background writeback. The IO is only
251 * started when this function returns, we make no guarantees on 151 * started when this function returns, we make no guarantees on
252 * completion. Caller need not hold sb s_umount semaphore. 152 * completion. Caller need not hold sb s_umount semaphore.
253 *
254 */ 153 */
255void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 154void bdi_start_background_writeback(struct backing_dev_info *bdi)
256 long nr_pages)
257{ 155{
258 struct wb_writeback_args args = { 156 __bdi_start_writeback(bdi, LONG_MAX, true, true);
259 .sb = sb,
260 .sync_mode = WB_SYNC_NONE,
261 .nr_pages = nr_pages,
262 .range_cyclic = 1,
263 };
264
265 /*
266 * We treat @nr_pages=0 as the special case to do background writeback,
267 * ie. to sync pages until the background dirty threshold is reached.
268 */
269 if (!nr_pages) {
270 args.nr_pages = LONG_MAX;
271 args.for_background = 1;
272 }
273
274 bdi_alloc_queue_work(bdi, &args);
275} 157}
276 158
277/* 159/*
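
Callers change accordingly: the old nr_pages == 0 special case becomes an explicitly named entry point. A sketch of a hypothetical call site spanning both forms:

	static void example_kick_flusher(struct backing_dev_info *bdi,
					 long nr_pages)
	{
		/* old: bdi_start_writeback(bdi, NULL, 0) meant "background" */
		if (nr_pages)
			bdi_start_writeback(bdi, nr_pages);	/* new 2-arg form */
		else
			bdi_start_background_writeback(bdi);	/* explicit intent */
	}
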
@@ -375,10 +257,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
375 257
376/* 258/*
377 * Queue all expired dirty inodes for io, eldest first. 259 * Queue all expired dirty inodes for io, eldest first.
260 * Before
261 * newly dirtied b_dirty b_io b_more_io
262 * =============> gf edc BA
263 * After
264 * newly dirtied b_dirty b_io b_more_io
265 * =============> g fBAedc
266 * |
267 * +--> dequeue for IO
378 */ 268 */
379static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 269static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
380{ 270{
381 list_splice_init(&wb->b_more_io, wb->b_io.prev); 271 list_splice_init(&wb->b_more_io, &wb->b_io);
382 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 272 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
383} 273}
384 274
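
The small-looking change in queue_io() is a head-versus-tail splice; since b_io is consumed from its tail (b_io.prev), where b_more_io lands decides when those inodes run. A sketch of the two forms on a generic list_head, not the patch itself:

	static void example_splice_head(struct list_head *src,
					struct list_head *dst)
	{
		list_splice_init(src, dst);		/* src joins dst's head */
	}

	static void example_splice_tail(struct list_head *src,
					struct list_head *dst)
	{
		list_splice_init(src, dst->prev);	/* src joins dst's tail */
	}

With consumers dequeuing from dst->prev, the new head splice makes the b_more_io inodes run after everything already in b_io, exactly as the diagram above shows.
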
@@ -398,11 +288,11 @@ static void inode_wait_for_writeback(struct inode *inode)
398 wait_queue_head_t *wqh; 288 wait_queue_head_t *wqh;
399 289
400 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 290 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
401 do { 291 while (inode->i_state & I_SYNC) {
402 spin_unlock(&inode_lock); 292 spin_unlock(&inode_lock);
403 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 293 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
404 spin_lock(&inode_lock); 294 spin_lock(&inode_lock);
405 } while (inode->i_state & I_SYNC); 295 }
406} 296}
407 297
408/* 298/*
@@ -452,11 +342,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
452 342
453 BUG_ON(inode->i_state & I_SYNC); 343 BUG_ON(inode->i_state & I_SYNC);
454 344
455 /* Set I_SYNC, reset I_DIRTY */ 345 /* Set I_SYNC, reset I_DIRTY_PAGES */
456 dirty = inode->i_state & I_DIRTY;
457 inode->i_state |= I_SYNC; 346 inode->i_state |= I_SYNC;
458 inode->i_state &= ~I_DIRTY; 347 inode->i_state &= ~I_DIRTY_PAGES;
459
460 spin_unlock(&inode_lock); 348 spin_unlock(&inode_lock);
461 349
462 ret = do_writepages(mapping, wbc); 350 ret = do_writepages(mapping, wbc);
@@ -472,6 +360,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
472 ret = err; 360 ret = err;
473 } 361 }
474 362
363 /*
364 * Some filesystems may redirty the inode during the writeback
365 * due to delalloc, clear dirty metadata flags right before
366 * write_inode()
367 */
368 spin_lock(&inode_lock);
369 dirty = inode->i_state & I_DIRTY;
370 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
371 spin_unlock(&inode_lock);
475 /* Don't write the inode if only I_DIRTY_PAGES was set */ 372 /* Don't write the inode if only I_DIRTY_PAGES was set */
476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 373 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc); 374 int err = write_inode(inode, wbc);
@@ -481,63 +378,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481 378
482 spin_lock(&inode_lock); 379 spin_lock(&inode_lock);
483 inode->i_state &= ~I_SYNC; 380 inode->i_state &= ~I_SYNC;
484 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 381 if (!(inode->i_state & I_FREEING)) {
485 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { 382 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
486 /*
487 * More pages get dirtied by a fast dirtier.
488 */
489 goto select_queue;
490 } else if (inode->i_state & I_DIRTY) {
491 /*
492 * At least XFS will redirty the inode during the
493 * writeback (delalloc) and on io completion (isize).
494 */
495 redirty_tail(inode);
496 } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
497 /* 383 /*
498 * We didn't write back all the pages. nfs_writepages() 384 * We didn't write back all the pages. nfs_writepages()
499 * sometimes bales out without doing anything. Redirty 385 * sometimes bales out without doing anything.
500 * the inode; Move it from b_io onto b_more_io/b_dirty.
501 */ 386 */
502 /* 387 inode->i_state |= I_DIRTY_PAGES;
503 * akpm: if the caller was the kupdate function we put 388 if (wbc->nr_to_write <= 0) {
504 * this inode at the head of b_dirty so it gets first
505 * consideration. Otherwise, move it to the tail, for
506 * the reasons described there. I'm not really sure
507 * how much sense this makes. Presumably I had a good
508 * reasons for doing it this way, and I'd rather not
509 * muck with it at present.
510 */
511 if (wbc->for_kupdate) {
512 /* 389 /*
513 * For the kupdate function we move the inode 390 * slice used up: queue for next turn
514 * to b_more_io so it will get more writeout as
515 * soon as the queue becomes uncongested.
516 */ 391 */
517 inode->i_state |= I_DIRTY_PAGES; 392 requeue_io(inode);
518select_queue:
519 if (wbc->nr_to_write <= 0) {
520 /*
521 * slice used up: queue for next turn
522 */
523 requeue_io(inode);
524 } else {
525 /*
526 * somehow blocked: retry later
527 */
528 redirty_tail(inode);
529 }
530 } else { 393 } else {
531 /* 394 /*
532 * Otherwise fully redirty the inode so that 395 * Writeback blocked by something other than
533 * other inodes on this superblock will get some 396 * congestion. Delay the inode for some time to
534 * writeout. Otherwise heavy writing to one 397 * avoid spinning on the CPU (100% iowait)
535 * file would indefinitely suspend writeout of 398 * retrying writeback of the dirty page/inode
536 * all the other files. 399 * that cannot be performed immediately.
537 */ 400 */
538 inode->i_state |= I_DIRTY_PAGES;
539 redirty_tail(inode); 401 redirty_tail(inode);
540 } 402 }
403 } else if (inode->i_state & I_DIRTY) {
404 /*
405 * Filesystems can dirty the inode during writeback
406 * operations, such as delayed allocation during
407 * submission or metadata updates after data IO
408 * completion.
409 */
410 redirty_tail(inode);
541 } else if (atomic_read(&inode->i_count)) { 411 } else if (atomic_read(&inode->i_count)) {
542 /* 412 /*
543 * The inode is clean, inuse 413 * The inode is clean, inuse
@@ -554,75 +424,69 @@ select_queue:
554 return ret; 424 return ret;
555} 425}
556 426
557static void unpin_sb_for_writeback(struct super_block *sb)
558{
559 up_read(&sb->s_umount);
560 put_super(sb);
561}
562
563enum sb_pin_state {
564 SB_PINNED,
565 SB_NOT_PINNED,
566 SB_PIN_FAILED
567};
568
569/* 427/*
570 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 428 * For background writeback the caller does not have the sb pinned
571 * before calling writeback. So make sure that we do pin it, so it doesn't 429 * before calling writeback. So make sure that we do pin it, so it doesn't
572 * go away while we are writing inodes from it. 430 * go away while we are writing inodes from it.
573 */ 431 */
574static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, 432static bool pin_sb_for_writeback(struct super_block *sb)
575 struct super_block *sb)
576{ 433{
577 /*
578 * Caller must already hold the ref for this
579 */
580 if (wbc->sync_mode == WB_SYNC_ALL) {
581 WARN_ON(!rwsem_is_locked(&sb->s_umount));
582 return SB_NOT_PINNED;
583 }
584 spin_lock(&sb_lock); 434 spin_lock(&sb_lock);
435 if (list_empty(&sb->s_instances)) {
436 spin_unlock(&sb_lock);
437 return false;
438 }
439
585 sb->s_count++; 440 sb->s_count++;
441 spin_unlock(&sb_lock);
442
586 if (down_read_trylock(&sb->s_umount)) { 443 if (down_read_trylock(&sb->s_umount)) {
587 if (sb->s_root) { 444 if (sb->s_root)
588 spin_unlock(&sb_lock); 445 return true;
589 return SB_PINNED;
590 }
591 /*
592 * umounted, drop rwsem again and fall through to failure
593 */
594 up_read(&sb->s_umount); 446 up_read(&sb->s_umount);
595 } 447 }
596 sb->s_count--; 448
597 spin_unlock(&sb_lock); 449 put_super(sb);
598 return SB_PIN_FAILED; 450 return false;
599} 451}
 
 /*
  * Write a portion of b_io inodes which belong to @sb.
- * If @wbc->sb != NULL, then find and write all such
+ *
+ * If @only_this_sb is true, then find and write all such
  * inodes. Otherwise write only ones which go sequentially
  * in reverse order.
+ *
  * Return 1, if the caller writeback routine should be
  * interrupted. Otherwise return 0.
  */
-static int writeback_sb_inodes(struct super_block *sb,
-			       struct bdi_writeback *wb,
-			       struct writeback_control *wbc)
+static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
+		struct writeback_control *wbc, bool only_this_sb)
 {
 	while (!list_empty(&wb->b_io)) {
 		long pages_skipped;
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
-		if (wbc->sb && sb != inode->i_sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		if (sb != inode->i_sb)
-			/* finish with this superblock */
+
+		if (inode->i_sb != sb) {
+			if (only_this_sb) {
+				/*
+				 * We only want to write back data for this
+				 * superblock, move all inodes not belonging
+				 * to it back onto the dirty list.
+				 */
+				redirty_tail(inode);
+				continue;
+			}
+
+			/*
+			 * The inode belongs to a different superblock.
+			 * Bounce back to the caller to unpin this and
+			 * pin the next superblock.
+			 */
 			return 0;
+		}
+
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
@@ -634,7 +498,7 @@ static int writeback_sb_inodes(struct super_block *sb,
 		if (inode_dirtied_after(inode, wbc->wb_start))
 			return 1;
 
-		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
+		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
@@ -660,12 +524,13 @@ static int writeback_sb_inodes(struct super_block *sb,
 		return 1;
 }
 
-static void writeback_inodes_wb(struct bdi_writeback *wb,
-				struct writeback_control *wbc)
+void writeback_inodes_wb(struct bdi_writeback *wb,
+			 struct writeback_control *wbc)
 {
 	int ret = 0;
 
-	wbc->wb_start = jiffies; /* livelock avoidance */
+	if (!wbc->wb_start)
+		wbc->wb_start = jiffies; /* livelock avoidance */
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
@@ -674,24 +539,14 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
 		struct super_block *sb = inode->i_sb;
-		enum sb_pin_state state;
 
-		if (wbc->sb && sb != wbc->sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		state = pin_sb_for_writeback(wbc, sb);
-
-		if (state == SB_PIN_FAILED) {
+		if (!pin_sb_for_writeback(sb)) {
 			requeue_io(inode);
 			continue;
 		}
-		ret = writeback_sb_inodes(sb, wb, wbc);
+		ret = writeback_sb_inodes(sb, wb, wbc, false);
+		drop_super(sb);
 
-		if (state == SB_PINNED)
-			unpin_sb_for_writeback(sb);
 		if (ret)
 			break;
 	}
@@ -699,11 +554,16 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 	/* Leave any unwritten inodes on b_io */
 }
 
-void writeback_inodes_wbc(struct writeback_control *wbc)
+static void __writeback_inodes_sb(struct super_block *sb,
+		struct bdi_writeback *wb, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = wbc->bdi;
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	writeback_inodes_wb(&bdi->wb, wbc);
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+	writeback_sb_inodes(sb, wb, wbc, true);
+	spin_unlock(&inode_lock);
 }
 
 /*
@@ -719,7 +579,7 @@ static inline bool over_bground_thresh(void)
 {
 	unsigned long background_thresh, dirty_thresh;
 
-	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+	global_dirty_limits(&background_thresh, &dirty_thresh);
 
 	return (global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
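
global_dirty_limits() only changes where the two thresholds come from; the stopping rule is unchanged: background writeback keeps going while dirty plus unstable pages sit at or above the background threshold. The same rule in plain C, with made-up counters and a percentage-based threshold for illustration:

/* Background writeback stops once dirty state drops below this fraction. */
static unsigned long background_thresh(unsigned long total_pages,
				       unsigned background_ratio)
{
	return total_pages * background_ratio / 100;
}

static int over_background(unsigned long nr_dirty, unsigned long nr_unstable,
			   unsigned long total_pages, unsigned ratio)
{
	return nr_dirty + nr_unstable >= background_thresh(total_pages, ratio);
}
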
@@ -741,16 +601,14 @@ static inline bool over_bground_thresh(void)
  * all dirty pages if they are all attached to "old" mappings.
  */
 static long wb_writeback(struct bdi_writeback *wb,
-			 struct wb_writeback_args *args)
+			 struct wb_writeback_work *work)
 {
 	struct writeback_control wbc = {
-		.bdi			= wb->bdi,
-		.sb			= args->sb,
-		.sync_mode		= args->sync_mode,
+		.sync_mode		= work->sync_mode,
 		.older_than_this	= NULL,
-		.for_kupdate		= args->for_kupdate,
-		.for_background		= args->for_background,
-		.range_cyclic		= args->range_cyclic,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.range_cyclic		= work->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -766,25 +624,33 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.range_end = LLONG_MAX;
 	}
 
+	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
 		 */
-		if (args->nr_pages <= 0)
+		if (work->nr_pages <= 0)
 			break;
 
 		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (args->for_background && !over_bground_thresh())
+		if (work->for_background && !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
-		writeback_inodes_wb(wb, &wbc);
-		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		trace_wbc_writeback_start(&wbc, wb->bdi);
+		if (work->sb)
+			__writeback_inodes_sb(work->sb, wb, &wbc);
+		else
+			writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc, wb->bdi);
+
+		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
@@ -811,6 +677,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io)) {
 			inode = list_entry(wb->b_more_io.prev,
 						struct inode, i_list);
+			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
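
Each loop iteration above hands writeback a fixed budget (MAX_WRITEBACK_PAGES) and measures progress as budget minus whatever is left in wbc.nr_to_write. A compact sketch of that accounting in C, with a stand-in write_some() instead of the real inode walk; names and the chunk size are illustrative:

#define CHUNK 1024			/* stand-in for MAX_WRITEBACK_PAGES */

/* Assumed elsewhere: returns how many pages it managed to push this round. */
extern long write_some(long budget);

static long flush_until_done(long nr_pages)
{
	long wrote = 0;

	while (nr_pages > 0) {
		long nr_to_write = CHUNK;	/* fresh slice each round */
		long done = write_some(nr_to_write);

		nr_to_write -= done;
		nr_pages -= CHUNK - nr_to_write;	/* consumed this round */
		wrote += CHUNK - nr_to_write;

		if (done == 0)		/* no progress: stop instead of spinning */
			break;
	}
	return wrote;
}
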
@@ -820,31 +687,21 @@ static long wb_writeback(struct bdi_writeback *wb,
 }
 
 /*
- * Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet. ->seen is initially set for each thread that exists
- * for this device, when a thread first notices a piece of work it
- * clears its bit. Depending on writeback type, the thread will notify
- * completion on either receiving the work (WB_SYNC_NONE) or after
- * it is done (WB_SYNC_ALL).
+ * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
-static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
-					   struct bdi_writeback *wb)
+static struct wb_writeback_work *
+get_next_work_item(struct backing_dev_info *bdi)
 {
-	struct bdi_work *work, *ret = NULL;
-
-	rcu_read_lock();
+	struct wb_writeback_work *work = NULL;
 
-	list_for_each_entry_rcu(work, &bdi->work_list, list) {
-		if (!test_bit(wb->nr, &work->seen))
-			continue;
-		clear_bit(wb->nr, &work->seen);
-
-		ret = work;
-		break;
+	spin_lock_bh(&bdi->wb_lock);
+	if (!list_empty(&bdi->work_list)) {
+		work = list_entry(bdi->work_list.next,
+				  struct wb_writeback_work, list);
+		list_del_init(&work->list);
 	}
-
-	rcu_read_unlock();
-	return ret;
+	spin_unlock_bh(&bdi->wb_lock);
+	return work;
 }
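
The new get_next_work_item() drops the per-thread `seen' bitmask in favour of the simplest possible scheme: one FIFO list, one lock, pop from the head. Roughly this shape in userspace C (illustrative names; a pthread mutex stands in for the bh-safe spinlock):

#include <pthread.h>
#include <stddef.h>

struct work {
	struct work *next;
	void (*fn)(struct work *);
};

static struct work *head;
static struct work **tail = &head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Pop the oldest queued work item, or NULL if the list is empty. */
static struct work *get_next_work_item(void)
{
	struct work *w;

	pthread_mutex_lock(&list_lock);
	w = head;
	if (w) {
		head = w->next;
		if (!head)
			tail = &head;
		w->next = NULL;		/* detach, like list_del_init() */
	}
	pthread_mutex_unlock(&list_lock);
	return w;
}
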
 
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -852,6 +709,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	unsigned long expired;
 	long nr_pages;
 
+	/*
+	 * When set to zero, disable periodic writeback
+	 */
+	if (!dirty_writeback_interval)
+		return 0;
+
 	expired = wb->last_old_flush +
 			msecs_to_jiffies(dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
@@ -863,14 +726,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
 	if (nr_pages) {
-		struct wb_writeback_args args = {
+		struct wb_writeback_work work = {
 			.nr_pages	= nr_pages,
 			.sync_mode	= WB_SYNC_NONE,
 			.for_kupdate	= 1,
 			.range_cyclic	= 1,
 		};
 
-		return wb_writeback(wb, &args);
+		return wb_writeback(wb, &work);
 	}
 
 	return 0;
@@ -882,39 +745,37 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
 	struct backing_dev_info *bdi = wb->bdi;
-	struct bdi_work *work;
+	struct wb_writeback_work *work;
 	long wrote = 0;
 
-	while ((work = get_next_work_item(bdi, wb)) != NULL) {
-		struct wb_writeback_args args = work->args;
-
+	set_bit(BDI_writeback_running, &wb->bdi->state);
+	while ((work = get_next_work_item(bdi)) != NULL) {
 		/*
 		 * Override sync mode, in case we must wait for completion
+		 * because this thread is exiting now.
 		 */
 		if (force_wait)
-			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+			work->sync_mode = WB_SYNC_ALL;
 
-		/*
-		 * If this isn't a data integrity operation, just notify
-		 * that we have seen this work and we are now starting it.
-		 */
-		if (args.sync_mode == WB_SYNC_NONE)
-			wb_clear_pending(wb, work);
+		trace_writeback_exec(bdi, work);
 
-		wrote += wb_writeback(wb, &args);
+		wrote += wb_writeback(wb, work);
 
 		/*
-		 * This is a data integrity writeback, so only do the
-		 * notification when we have completed the work.
+		 * Notify the caller of completion if this is a synchronous
+		 * work item, otherwise just free it.
 		 */
-		if (args.sync_mode == WB_SYNC_ALL)
-			wb_clear_pending(wb, work);
+		if (work->done)
+			complete(work->done);
+		else
+			kfree(work);
 	}
 
 	/*
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
 }
@@ -923,75 +784,88 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * wakes up periodically and does kupdated style flushing.
  */
-int bdi_writeback_task(struct bdi_writeback *wb)
+int bdi_writeback_thread(void *data)
 {
-	unsigned long last_active = jiffies;
-	unsigned long wait_jiffies = -1UL;
+	struct bdi_writeback *wb = data;
+	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+	wb->last_active = jiffies;
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
+
+	trace_writeback_thread_start(bdi);
+
 	while (!kthread_should_stop()) {
+		/*
+		 * Remove own delayed wake-up timer, since we are already awake
+		 * and we'll take care of the periodic write-back.
+		 */
+		del_timer(&wb->wakeup_timer);
+
 		pages_written = wb_do_writeback(wb, 0);
 
+		trace_writeback_pages_written(pages_written);
+
 		if (pages_written)
-			last_active = jiffies;
-		else if (wait_jiffies != -1UL) {
-			unsigned long max_idle;
+			wb->last_active = jiffies;
 
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			continue;
+		}
+
+		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		else {
 			/*
-			 * Longest period of inactivity that we tolerate. If we
-			 * see dirty data again later, the task will get
-			 * recreated automatically.
+			 * We have nothing to do, so can go sleep without any
+			 * timeout and save power. When a work is queued or
+			 * something is made dirty - we will be woken up.
 			 */
-			max_idle = max(5UL * 60 * HZ, wait_jiffies);
-			if (time_after(jiffies, max_idle + last_active))
-				break;
+			schedule();
 		}
 
-		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		schedule_timeout_interruptible(wait_jiffies);
 		try_to_freeze();
 	}
 
+	/* Flush any work that raced with us exiting */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	trace_writeback_thread_stop(bdi);
 	return 0;
 }
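
The thread body above is a careful idle loop: drain the work list, then re-check it with the task state already set to sleeping so a concurrent queue-plus-wakeup cannot be lost, then pick a timed sleep (dirty IO pending and a periodic interval configured) or an untimed, power-friendly one. A userspace analogue using a condition variable; all names are invented for the example:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static bool have_work, should_stop, have_dirty;
static unsigned interval_ms = 5000;

static void *flusher(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!should_stop) {
		while (have_work) {		/* drain before deciding how to sleep */
			have_work = false;
			pthread_mutex_unlock(&lock);
			/* ... do the writeback ... */
			pthread_mutex_lock(&lock);
		}
		/* holding the lock here is the "set state, then re-check" step:
		 * a producer cannot signal between our test and our wait */
		if (have_dirty && interval_ms) {
			struct timespec ts;
			clock_gettime(CLOCK_REALTIME, &ts);
			ts.tv_sec += interval_ms / 1000;
			pthread_cond_timedwait(&wake, &lock, &ts);
		} else {
			pthread_cond_wait(&wake, &lock);	/* sleep indefinitely */
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}
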
 
+
 /*
- * Schedule writeback for all backing devices. This does WB_SYNC_NONE
- * writeback, for integrity writeback see bdi_sync_writeback().
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
  */
-static void bdi_writeback_all(struct super_block *sb, long nr_pages)
+void wakeup_flusher_threads(long nr_pages)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.nr_pages	= nr_pages,
-		.sync_mode	= WB_SYNC_NONE,
-	};
 	struct backing_dev_info *bdi;
 
-	rcu_read_lock();
+	if (!nr_pages) {
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	}
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-
-		bdi_alloc_queue_work(bdi, &args);
+		__bdi_start_writeback(bdi, nr_pages, false, false);
 	}
-
 	rcu_read_unlock();
 }
 
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world.
- */
-void wakeup_flusher_threads(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	bdi_writeback_all(NULL, nr_pages);
-}
-
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1044,6 +918,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = NULL;
+	bool wakeup_bdi = false;
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1089,7 +965,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			if (hlist_unhashed(&inode->i_hash))
 				goto out;
 		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
+		if (inode->i_state & I_FREEING)
 			goto out;
 
 		/*
@@ -1097,22 +973,31 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
-			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-			struct backing_dev_info *bdi = wb->bdi;
-
-			if (bdi_cap_writeback_dirty(bdi) &&
-			    !test_bit(BDI_registered, &bdi->state)) {
-				WARN_ON(1);
-				printk(KERN_ERR "bdi-%s not registered\n",
-								bdi->name);
+			bdi = inode_to_bdi(inode);
+
+			if (bdi_cap_writeback_dirty(bdi)) {
+				WARN(!test_bit(BDI_registered, &bdi->state),
+				     "bdi-%s not registered\n", bdi->name);
+
+				/*
+				 * If this is the first dirty inode for this
+				 * bdi, we have to wake-up the corresponding
+				 * bdi thread to make sure background
+				 * write-back happens later.
+				 */
+				if (!wb_has_dirty_io(&bdi->wb))
+					wakeup_bdi = true;
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &wb->b_dirty);
+			list_move(&inode->i_list, &bdi->wb.b_dirty);
 		}
 	}
 out:
 	spin_unlock(&inode_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
@@ -1155,7 +1040,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
@@ -1196,12 +1081,20 @@ void writeback_inodes_sb(struct super_block *sb)
 {
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_NONE,
+		.done		= &done,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	nr_to_write = nr_dirty + nr_unstable +
+	work.nr_pages = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1215,7 +1108,9 @@ EXPORT_SYMBOL(writeback_inodes_sb);
 int writeback_inodes_sb_if_idle(struct super_block *sb)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
 		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
 		return 1;
 	} else
 		return 0;
@@ -1231,7 +1126,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	bdi_sync_writeback(sb->s_bdi, sb);
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+		.done		= &done,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
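
writeback_inodes_sb() and sync_inodes_sb() now share one handoff pattern: describe the work on the caller's stack, queue it, and block on an on-stack completion that the flusher fires (the `work->done ? complete : kfree' branch in wb_do_writeback() above). A sketch of that handoff with pthreads; the struct and function names are invented for the example:

#include <pthread.h>
#include <stdbool.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

struct wb_work {
	long nr_pages;
	struct completion *done;	/* NULL means "fire and forget" */
};

/* Assumed elsewhere: hands the item to the flusher thread. */
extern void queue_to_flusher(struct wb_work *w);

void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* Synchronous caller: the work struct lives on our stack, so we must
 * not return until the flusher has signalled completion. */
void writeback_and_wait(long nr_pages)
{
	struct completion done = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
	};
	struct wb_work work = { .nr_pages = nr_pages, .done = &done };

	queue_to_flusher(&work);
	wait_for_completion(&done);
}
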
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee059052db5..ed45a9cf5f3d 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 {
 	struct path old_root;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_root = fs->root;
 	fs->root = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	if (old_root.dentry)
 		path_put(&old_root);
 }
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 {
 	struct path old_pwd;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
 		path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 		task_lock(p);
 		fs = p->fs;
 		if (fs) {
-			write_lock(&fs->lock);
+			spin_lock(&fs->lock);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
 				path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 				fs->pwd = *new_root;
 				count++;
 			}
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 		}
 		task_unlock(p);
 	} while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
 	if (fs) {
 		int kill;
 		task_lock(tsk);
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		tsk->fs = NULL;
 		kill = !--fs->users;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 		task_unlock(tsk);
 		if (kill)
 			free_fs_struct(fs);
@@ -104,14 +104,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
-		rwlock_init(&fs->lock);
+		spin_lock_init(&fs->lock);
 		fs->umask = old->umask;
-		read_lock(&old->lock);
-		fs->root = old->root;
-		path_get(&old->root);
-		fs->pwd = old->pwd;
-		path_get(&old->pwd);
-		read_unlock(&old->lock);
+		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
 	}
 	return fs;
 }
@@ -126,10 +121,10 @@ int unshare_fs_struct(void)
 		return -ENOMEM;
 
 	task_lock(current);
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	kill = !--fs->users;
 	current->fs = new_fs;
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	task_unlock(current);
 
 	if (kill)
@@ -148,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
-	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
+	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
 	.umask		= 0022,
 };
 
@@ -161,14 +156,14 @@ void daemonize_fs_struct(void)
 
 	task_lock(current);
 
-	write_lock(&init_fs.lock);
+	spin_lock(&init_fs.lock);
 	init_fs.users++;
-	write_unlock(&init_fs.lock);
+	spin_unlock(&init_fs.lock);
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	current->fs = &init_fs;
 	kill = !--fs->users;
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 
 	task_unlock(current);
 	if (kill)
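
The fs_struct conversion swaps the reader/writer lock for a plain spinlock: every critical section in this file is a few pointer copies, so the cheaper lock wins and the read/write distinction buys nothing. copy_fs_struct() also folds its open-coded snapshot into get_fs_root_and_pwd(); the underlying pattern is `copy and take references under the lock, use the copies after dropping it', as in this userspace sketch with illustrative types:

#include <pthread.h>

struct path { void *dentry, *mnt; };

struct fs_struct {
	pthread_spinlock_t lock;
	struct path root, pwd;
};

/* Assumed elsewhere: bumps the refcount behind a path. */
extern void path_get(struct path *p);

/* Snapshot both paths atomically; the caller owns the references. */
static void get_root_and_pwd(struct fs_struct *fs,
			     struct path *root, struct path *pwd)
{
	pthread_spin_lock(&fs->lock);
	*root = fs->root;
	path_get(root);
	*pwd = fs->pwd;
	path_get(pwd);
	pthread_spin_unlock(&fs->lock);
}
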
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index cc94bb9563f2..3f6dfa989881 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
 
 config FSCACHE
 	tristate "General filesystem local caching manager"
-	select SLOW_WORK
 	help
 	  This option enables a generic filesystem caching manager that can be
 	  used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index edd7434ab6e5..f6aad48d38a8 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup;
 extern unsigned fscache_defer_create;
 extern unsigned fscache_debug;
 extern struct kobject *fscache_root;
+extern struct workqueue_struct *fscache_object_wq;
+extern struct workqueue_struct *fscache_op_wq;
+DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+static inline bool fscache_object_congested(void)
+{
+	return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
+}
 
 extern int fscache_wait_bit(void *);
 extern int fscache_wait_bit_interruptible(void *);
@@ -313,17 +321,11 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
 #define dbgprintk(FMT, ...) \
 	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf, 1, 2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
 
-#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 
 #ifdef __KDEBUG
 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
@@ -350,9 +352,9 @@ do { \
 } while (0)
 
 #else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #endif
 
 /*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index add6bdb53f04..f9d856773f79 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/completion.h>
 #include <linux/slab.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 MODULE_DESCRIPTION("FS Cache Manager");
@@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug,
 	 "FS-Cache debugging mask");
 
 struct kobject *fscache_root;
+struct workqueue_struct *fscache_object_wq;
+struct workqueue_struct *fscache_op_wq;
+
+DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+/* these values serve as lower bounds, will be adjusted in fscache_init() */
+static unsigned fscache_object_max_active = 4;
+static unsigned fscache_op_max_active = 2;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fscache_sysctl_header;
+
+static int fscache_max_active_sysctl(struct ctl_table *table, int write,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos)
+{
+	struct workqueue_struct **wqp = table->extra1;
+	unsigned int *datap = table->data;
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret == 0)
+		workqueue_set_max_active(*wqp, *datap);
+	return ret;
+}
+
+ctl_table fscache_sysctls[] = {
+	{
+		.procname	= "object_max_active",
+		.data		= &fscache_object_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_object_wq,
+	},
+	{
+		.procname	= "operation_max_active",
+		.data		= &fscache_op_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_op_wq,
+	},
+	{}
+};
+
+ctl_table fscache_sysctls_root[] = {
+	{
+		.procname	= "fscache",
+		.mode		= 0555,
+		.child		= fscache_sysctls,
+	},
+	{}
+};
+#endif
 
 /*
  * initialise the fs caching module
  */
 static int __init fscache_init(void)
 {
+	unsigned int nr_cpus = num_possible_cpus();
+	unsigned int cpu;
 	int ret;
 
-	ret = slow_work_register_user(THIS_MODULE);
-	if (ret < 0)
-		goto error_slow_work;
+	fscache_object_max_active =
+		clamp_val(nr_cpus,
+			  fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
+					    fscache_object_max_active);
+	if (!fscache_object_wq)
+		goto error_object_wq;
+
+	fscache_op_max_active =
+		clamp_val(fscache_object_max_active / 2,
+			  fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
+					fscache_op_max_active);
+	if (!fscache_op_wq)
+		goto error_op_wq;
+
+	for_each_possible_cpu(cpu)
+		init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
 
 	ret = fscache_proc_init();
 	if (ret < 0)
 		goto error_proc;
 
+#ifdef CONFIG_SYSCTL
+	ret = -ENOMEM;
+	fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
+	if (!fscache_sysctl_header)
+		goto error_sysctl;
+#endif
+
 	fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
 					       sizeof(struct fscache_cookie),
 					       0,
@@ -78,10 +162,16 @@ static int __init fscache_init(void)
 error_kobj:
 	kmem_cache_destroy(fscache_cookie_jar);
 error_cookie_jar:
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+error_sysctl:
+#endif
 	fscache_proc_cleanup();
 error_proc:
-	slow_work_unregister_user(THIS_MODULE);
-error_slow_work:
+	destroy_workqueue(fscache_op_wq);
+error_op_wq:
+	destroy_workqueue(fscache_object_wq);
+error_object_wq:
 	return ret;
 }
 
@@ -96,8 +186,12 @@ static void __exit fscache_exit(void)
 
 	kobject_put(fscache_root);
 	kmem_cache_destroy(fscache_cookie_jar);
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+#endif
 	fscache_proc_cleanup();
-	slow_work_unregister_user(THIS_MODULE);
+	destroy_workqueue(fscache_op_wq);
+	destroy_workqueue(fscache_object_wq);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
 
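
fscache_init() now sizes its two unbound workqueues from the machine: object concurrency is at least 4 but scales with the possible-CPU count, operation concurrency is at least 2 but tracks half the object figure, both capped at WQ_UNBOUND_MAX_ACTIVE. The clamping arithmetic is trivial; a userspace rendering, with sysconf() standing in for num_possible_cpus() and an assumed cap:

#include <stdio.h>
#include <unistd.h>

static long clamp_val(long v, long lo, long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	long hard_cap = 512;	/* stand-in for WQ_UNBOUND_MAX_ACTIVE */
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	long object_max = clamp_val(ncpus, 4, hard_cap);
	long op_max = clamp_val(object_max / 2, 2, hard_cap);

	/* these two figures would size the object and operation pools */
	printf("object pool: %ld, operation pool: %ld\n", object_max, op_max);
	return 0;
}
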
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..ebe29c581380 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -34,8 +34,8 @@ struct fscache_objlist_data {
 #define FSCACHE_OBJLIST_CONFIG_NOREADS	0x00000200	/* show objects without active reads */
 #define FSCACHE_OBJLIST_CONFIG_EVENTS	0x00000400	/* show objects with events */
 #define FSCACHE_OBJLIST_CONFIG_NOEVENTS	0x00000800	/* show objects without no events */
-#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with slow work */
-#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without slow work */
+#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without work */
 
 	u8		buf[512];	/* key and aux data buffer */
 };
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
 	/* banners (can't represent line 0 by pos 0 as that would involve
 	 * returning a NULL pointer) */
 	if (pos == 0)
-		return (struct fscache_object *) ++(*_pos);
+		return (struct fscache_object *)(long)++(*_pos);
 	if (pos < 3)
 		return (struct fscache_object *)pos;
 
@@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		       READS, NOREADS);
 		FILTER(obj->events & obj->event_mask,
 		       EVENTS, NOEVENTS);
-		FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
-		       WORK, NOWORK);
+		FILTER(work_busy(&obj->work), WORK, NOWORK);
 	}
 
 	seq_printf(m,
-		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
 		   obj->debug_id,
 		   obj->parent ? obj->parent->debug_id : -1,
 		   fscache_object_states_short[obj->state],
@@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
 		   obj->events,
 		   obj->flags,
-		   obj->work.flags);
+		   work_busy(&obj->work));
 
 	no_cookie = true;
 	keylen = auxlen = 0;
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 0b589a9b4ffc..b6b897c550ac 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,7 +14,6 @@
 
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
-#include <linux/seq_file.h>
 #include "internal.h"
 
 const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 	[FSCACHE_OBJECT_DEAD]		= "DEAD",
 };
 
-static void fscache_object_slow_work_put_ref(struct slow_work *);
-static int fscache_object_slow_work_get_ref(struct slow_work *);
-static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
-#endif
+static int fscache_get_object(struct fscache_object *);
+static void fscache_put_object(struct fscache_object *);
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
@@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *);
 static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 
-const struct slow_work_ops fscache_object_slow_work_ops = {
-	.owner		= THIS_MODULE,
-	.get_ref	= fscache_object_slow_work_get_ref,
-	.put_ref	= fscache_object_slow_work_put_ref,
-	.execute	= fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= fscache_object_slow_work_desc,
-#endif
-};
-EXPORT_SYMBOL(fscache_object_slow_work_ops);
-
 /*
  * we need to notify the parent when an op completes that we had outstanding
  * upon it
@@ -345,7 +329,7 @@ unsupported_event:
 /*
  * execute an object
  */
-static void fscache_object_slow_work_execute(struct slow_work *work)
+void fscache_object_work_func(struct work_struct *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
@@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 	if (object->events & object->event_mask)
 		fscache_enqueue_object(object);
 	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+	fscache_put_object(object);
 }
-
-/*
- * describe an object for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *work,
-					  struct seq_file *m)
-{
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
-
-	seq_printf(m, "FSC: OBJ%x: %s",
-		   object->debug_id,
-		   fscache_object_states_short[object->state]);
-}
-#endif
+EXPORT_SYMBOL(fscache_object_work_func);
 
 /*
  * initialise an object
@@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object)
 	_enter("");
 	ASSERT(object->cookie != NULL);
 	ASSERT(object->cookie->parent != NULL);
-	ASSERT(list_empty(&object->work.link));
 
 	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
 			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
@@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object)
 		object->parent = NULL;
 	}
 
-	/* this just shifts the object release to the slow work processor */
-	fscache_stat(&fscache_n_cop_put_object);
-	object->cache->ops->put_object(object);
-	fscache_stat_d(&fscache_n_cop_put_object);
+	/* this just shifts the object release to the work processor */
+	fscache_put_object(object);
 
 	_leave("");
 }
@@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache,
 }
 
 /*
- * allow the slow work item processor to get a ref on an object
+ * get a ref on an object
  */
-static int fscache_object_slow_work_get_ref(struct slow_work *work)
+static int fscache_get_object(struct fscache_object *object)
 {
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
 	int ret;
 
 	fscache_stat(&fscache_n_cop_grab_object);
@@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
 }
 
 /*
- * allow the slow work item processor to discard a ref on a work item
+ * discard a ref on a work item
  */
-static void fscache_object_slow_work_put_ref(struct slow_work *work)
+static void fscache_put_object(struct fscache_object *object)
 {
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
-
 	fscache_stat(&fscache_n_cop_put_object);
 	object->cache->ops->put_object(object);
 	fscache_stat_d(&fscache_n_cop_put_object);
@@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object)
 {
 	_enter("{OBJ%x}", object->debug_id);
 
-	slow_work_enqueue(&object->work);
+	if (fscache_get_object(object) >= 0) {
+		wait_queue_head_t *cong_wq =
+			&get_cpu_var(fscache_object_cong_wait);
+
+		if (queue_work(fscache_object_wq, &object->work)) {
+			if (fscache_object_congested())
+				wake_up(cong_wq);
+		} else
+			fscache_put_object(object);
+
+		put_cpu_var(fscache_object_cong_wait);
+	}
+}
+
+/**
+ * fscache_object_sleep_till_congested - Sleep until object wq is congested
+ * @timeoutp: Scheduler sleep timeout
+ *
+ * Allow an object handler to sleep until the object workqueue is congested.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * %true is returned if the object wq is congested, %false otherwise.
+ */
+bool fscache_object_sleep_till_congested(signed long *timeoutp)
+{
+	wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
+	DEFINE_WAIT(wait);
+
+	if (fscache_object_congested())
+		return true;
+
+	add_wait_queue_exclusive(cong_wq, &wait);
+	if (!fscache_object_congested())
+		*timeoutp = schedule_timeout(*timeoutp);
+	finish_wait(cong_wq, &wait);
+
+	return fscache_object_congested();
 }
+EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 
 /*
  * enqueue the dependents of an object for metadata-type processing
@@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
 
 		/* sort onto appropriate lists */
 		fscache_enqueue_object(dep);
-		fscache_stat(&fscache_n_cop_put_object);
-		dep->cache->ops->put_object(dep);
-		fscache_stat_d(&fscache_n_cop_put_object);
+		fscache_put_object(dep);
 
 		if (!list_empty(&object->dependents))
 			cond_resched_lock(&object->lock);
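
fscache_object_sleep_till_congested() follows the classic lost-wakeup discipline: test the condition, register as a waiter, test again, and only then sleep, re-testing on wakeup. Stripped of the kernel wait-queue machinery the shape is roughly this pthread version (illustrative; the in-tree helper also passes back the unused timeout, which is omitted here):

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cong_wq = PTHREAD_COND_INITIALIZER;

/* Assumed elsewhere: the condition being waited on. */
extern bool congested(void);

/* Sleep up to timeout_s seconds or until congestion is reported;
 * returns the final state of the condition. */
bool sleep_till_congested(long timeout_s)
{
	struct timespec ts;
	bool ret;

	if (congested())		/* fast path: no wait needed */
		return true;

	pthread_mutex_lock(&lock);	/* "add ourselves to the wait queue" */
	if (!congested()) {		/* re-test: a wakeup may have raced */
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += timeout_s;
		pthread_cond_timedwait(&cong_wq, &lock, &ts);
	}
	ret = congested();		/* report the state we actually woke to */
	pthread_mutex_unlock(&lock);	/* "finish_wait" */
	return ret;
}
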
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index f17cecafae44..b9f34eaede09 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op)
 
 	fscache_stat(&fscache_n_op_enqueue);
 	switch (op->flags & FSCACHE_OP_TYPE) {
-	case FSCACHE_OP_FAST:
-		_debug("queue fast");
+	case FSCACHE_OP_ASYNC:
+		_debug("queue async");
 		atomic_inc(&op->usage);
-		if (!schedule_work(&op->fast_work))
+		if (!queue_work(fscache_op_wq, &op->work))
 			fscache_put_operation(op);
 		break;
-	case FSCACHE_OP_SLOW:
-		_debug("queue slow");
-		slow_work_enqueue(&op->slow_work);
-		break;
 	case FSCACHE_OP_MYTHREAD:
 		_debug("queue for caller's attention");
 		break;
@@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work)
 }
 
 /*
- * allow the slow work item processor to get a ref on an operation
- */
-static int fscache_op_get_ref(struct slow_work *work)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	atomic_inc(&op->usage);
-	return 0;
-}
-
-/*
- * allow the slow work item processor to discard a ref on an operation
- */
-static void fscache_op_put_ref(struct slow_work *work)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	fscache_put_operation(op);
-}
-
-/*
- * execute an operation using the slow thread pool to provide processing context
- * - the caller holds a ref to this object, so we don't need to hold one
+ * execute an operation using fs_op_wq to provide processing context -
+ * the caller holds a ref to this object, so we don't need to hold one
  */
-static void fscache_op_execute(struct slow_work *work)
+void fscache_op_work_func(struct work_struct *work)
 {
 	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
+		container_of(work, struct fscache_operation, work);
 	unsigned long start;
 
 	_enter("{OBJ%x OP%x,%d}",
@@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work)
 	start = jiffies;
 	op->processor(op);
 	fscache_hist(fscache_ops_histogram, start);
+	fscache_put_operation(op);
 
 	_leave("");
 }
-
-/*
- * describe an operation for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
-		   op->object->debug_id, op->debug_id,
-		   op->name, op->state, op->flags);
-}
-#endif
-
-const struct slow_work_ops fscache_op_slow_work_ops = {
-	.owner		= THIS_MODULE,
-	.get_ref	= fscache_op_get_ref,
-	.put_ref	= fscache_op_put_ref,
-	.execute	= fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= fscache_op_desc,
-#endif
-};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 47aefd376e54..41c441c2058d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
 
 page_busy:
 	/* we might want to wait here, but that could deadlock the allocator as
-	 * the slow-work threads writing to the cache may all end up sleeping
+	 * the work threads writing to the cache may all end up sleeping
 	 * on memory allocation */
 	fscache_stat(&fscache_n_store_vmscan_busy);
 	return false;
@@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 		return -ENOMEM;
 	}
 
-	fscache_operation_init(op, NULL);
-	fscache_operation_init_slow(op, fscache_attr_changed_op);
-	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+	fscache_operation_init(op, fscache_attr_changed_op, NULL);
+	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
 	fscache_set_op_name(op, "Attr");
 
 	spin_lock(&cookie->lock);
@@ -218,24 +217,6 @@ nobufs:
 EXPORT_SYMBOL(__fscache_attr_changed);
 
 /*
- * handle secondary execution given to a retrieval op on behalf of the
- * cache
- */
-static void fscache_retrieval_work(struct work_struct *work)
-{
-	struct fscache_retrieval *op =
-		container_of(work, struct fscache_retrieval, op.fast_work);
-	unsigned long start;
-
-	_enter("{OP%x}", op->op.debug_id);
-
-	start = jiffies;
-	op->op.processor(&op->op);
-	fscache_hist(fscache_ops_histogram, start);
-	fscache_put_operation(&op->op);
-}
-
-/*
  * release a retrieval op reference
  */
 static void fscache_release_retrieval_op(struct fscache_operation *_op)
@@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 		return NULL;
 	}
 
-	fscache_operation_init(&op->op, fscache_release_retrieval_op);
+	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
 	op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
 	op->mapping = mapping;
 	op->end_io_func = end_io_func;
 	op->context = context;
 	op->start_time = jiffies;
-	INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
 	INIT_LIST_HEAD(&op->to_do);
 	fscache_set_op_name(&op->op, "Retr");
 	return op;
@@ -710,30 +690,26 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	}
 
-	if (page) {
-		radix_tree_tag_set(&cookie->stores, page->index,
-				   FSCACHE_COOKIE_STORING_TAG);
-		radix_tree_tag_clear(&cookie->stores, page->index,
-				     FSCACHE_COOKIE_PENDING_TAG);
-	}
+	radix_tree_tag_set(&cookie->stores, page->index,
+			   FSCACHE_COOKIE_STORING_TAG);
+	radix_tree_tag_clear(&cookie->stores, page->index,
+			     FSCACHE_COOKIE_PENDING_TAG);
 
 	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 
-	if (page) {
-		fscache_set_op_state(&op->op, "Store");
-		fscache_stat(&fscache_n_store_pages);
-		fscache_stat(&fscache_n_cop_write_page);
-		ret = object->cache->ops->write_page(op, page);
-		fscache_stat_d(&fscache_n_cop_write_page);
-		fscache_set_op_state(&op->op, "EndWrite");
-		fscache_end_page_write(object, page);
-		if (ret < 0) {
-			fscache_set_op_state(&op->op, "Abort");
-			fscache_abort_object(object);
-		} else {
-			fscache_enqueue_operation(&op->op);
-		}
+	fscache_set_op_state(&op->op, "Store");
+	fscache_stat(&fscache_n_store_pages);
+	fscache_stat(&fscache_n_cop_write_page);
+	ret = object->cache->ops->write_page(op, page);
+	fscache_stat_d(&fscache_n_cop_write_page);
+	fscache_set_op_state(&op->op, "EndWrite");
+	fscache_end_page_write(object, page);
+	if (ret < 0) {
+		fscache_set_op_state(&op->op, "Abort");
+		fscache_abort_object(object);
+	} else {
+		fscache_enqueue_operation(&op->op);
 	}
 
 	_leave("");
@@ -799,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(&op->op, fscache_release_write_op);
-	fscache_operation_init_slow(&op->op, fscache_write_op);
-	op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+	fscache_operation_init(&op->op, fscache_write_op,
+			       fscache_release_write_op);
+	op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
 	fscache_set_op_name(&op->op, "Write1");
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
@@ -856,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	fscache_stat(&fscache_n_store_ops);
 	fscache_stat(&fscache_n_stores_ok);
 
-	/* the slow work queue now carries its own ref on the object */
+	/* the work queue now carries its own ref on the object */
 	fscache_put_operation(&op->op);
 	_leave(" = 0");
 	return 0;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..cde755cca564 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,8 +16,12 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
19 22
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
24MODULE_ALIAS("devname:fuse");
21 25
22static struct kmem_cache *fuse_req_cachep; 26static struct kmem_cache *fuse_req_cachep;
23 27
@@ -235,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc)
 
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-        req->in.h.unique = fuse_get_unique(fc);
         req->in.h.len = sizeof(struct fuse_in_header) +
                 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
         list_add_tail(&req->list, &fc->pending);
@@ -257,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
                 req = list_entry(fc->bg_queue.next, struct fuse_req, list);
                 list_del(&req->list);
                 fc->active_background++;
+                req->in.h.unique = fuse_get_unique(fc);
                 queue_request(fc, req);
         }
 }
@@ -272,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
  * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
 {
         void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
         req->end = NULL;
@@ -302,8 +306,8 @@ __releases(&fc->lock)
 
 static void wait_answer_interruptible(struct fuse_conn *fc,
                                       struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         if (signal_pending(current))
                 return;
@@ -321,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         if (!fc->no_interrupt) {
                 /* Any signal may interrupt this */
@@ -394,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
         else if (fc->conn_error)
                 req->out.h.error = -ECONNREFUSED;
         else {
+                req->in.h.unique = fuse_get_unique(fc);
                 queue_request(fc, req);
                 /* acquire extra reference, since request is still needed
                    after request_end() */
@@ -446,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 }
 EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
+static int fuse_request_send_notify_reply(struct fuse_conn *fc,
+                                          struct fuse_req *req, u64 unique)
+{
+        int err = -ENODEV;
+
+        req->isreply = 0;
+        req->in.h.unique = unique;
+        spin_lock(&fc->lock);
+        if (fc->connected) {
+                queue_request(fc, req);
+                err = 0;
+        }
+        spin_unlock(&fc->lock);
+
+        return err;
+}
+
 /*
  * Called under fc->lock
  *
@@ -498,6 +520,9 @@ struct fuse_copy_state {
         int write;
         struct fuse_req *req;
         const struct iovec *iov;
+        struct pipe_buffer *pipebufs;
+        struct pipe_buffer *currbuf;
+        struct pipe_inode_info *pipe;
         unsigned long nr_segs;
         unsigned long seglen;
         unsigned long addr;
@@ -505,16 +530,16 @@ struct fuse_copy_state {
         void *mapaddr;
         void *buf;
         unsigned len;
+        unsigned move_pages:1;
 };
 
 static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
-                           int write, struct fuse_req *req,
+                           int write,
                            const struct iovec *iov, unsigned long nr_segs)
 {
         memset(cs, 0, sizeof(*cs));
         cs->fc = fc;
         cs->write = write;
-        cs->req = req;
         cs->iov = iov;
         cs->nr_segs = nr_segs;
 }
@@ -522,8 +547,19 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
 /* Unmap and put previous page of userspace buffer */
 static void fuse_copy_finish(struct fuse_copy_state *cs)
 {
-        if (cs->mapaddr) {
-                kunmap_atomic(cs->mapaddr, KM_USER0);
+        if (cs->currbuf) {
+                struct pipe_buffer *buf = cs->currbuf;
+
+                if (!cs->write) {
+                        buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
+                } else {
+                        kunmap(buf->page);
+                        buf->len = PAGE_SIZE - cs->len;
+                }
+                cs->currbuf = NULL;
+                cs->mapaddr = NULL;
+        } else if (cs->mapaddr) {
+                kunmap(cs->pg);
                 if (cs->write) {
                         flush_dcache_page(cs->pg);
                         set_page_dirty_lock(cs->pg);
@@ -544,26 +580,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 
         unlock_request(cs->fc, cs->req);
         fuse_copy_finish(cs);
-        if (!cs->seglen) {
-                BUG_ON(!cs->nr_segs);
-                cs->seglen = cs->iov[0].iov_len;
-                cs->addr = (unsigned long) cs->iov[0].iov_base;
-                cs->iov++;
-                cs->nr_segs--;
+        if (cs->pipebufs) {
+                struct pipe_buffer *buf = cs->pipebufs;
+
+                if (!cs->write) {
+                        err = buf->ops->confirm(cs->pipe, buf);
+                        if (err)
+                                return err;
+
+                        BUG_ON(!cs->nr_segs);
+                        cs->currbuf = buf;
+                        cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
+                        cs->len = buf->len;
+                        cs->buf = cs->mapaddr + buf->offset;
+                        cs->pipebufs++;
+                        cs->nr_segs--;
+                } else {
+                        struct page *page;
+
+                        if (cs->nr_segs == cs->pipe->buffers)
+                                return -EIO;
+
+                        page = alloc_page(GFP_HIGHUSER);
+                        if (!page)
+                                return -ENOMEM;
+
+                        buf->page = page;
+                        buf->offset = 0;
+                        buf->len = 0;
+
+                        cs->currbuf = buf;
+                        cs->mapaddr = kmap(page);
+                        cs->buf = cs->mapaddr;
+                        cs->len = PAGE_SIZE;
+                        cs->pipebufs++;
+                        cs->nr_segs++;
+                }
+        } else {
+                if (!cs->seglen) {
+                        BUG_ON(!cs->nr_segs);
+                        cs->seglen = cs->iov[0].iov_len;
+                        cs->addr = (unsigned long) cs->iov[0].iov_base;
+                        cs->iov++;
+                        cs->nr_segs--;
+                }
+                err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
+                if (err < 0)
+                        return err;
+                BUG_ON(err != 1);
+                offset = cs->addr % PAGE_SIZE;
+                cs->mapaddr = kmap(cs->pg);
+                cs->buf = cs->mapaddr + offset;
+                cs->len = min(PAGE_SIZE - offset, cs->seglen);
+                cs->seglen -= cs->len;
+                cs->addr += cs->len;
         }
-        down_read(&current->mm->mmap_sem);
-        err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
-                             &cs->pg, NULL);
-        up_read(&current->mm->mmap_sem);
-        if (err < 0)
-                return err;
-        BUG_ON(err != 1);
-        offset = cs->addr % PAGE_SIZE;
-        cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
-        cs->buf = cs->mapaddr + offset;
-        cs->len = min(PAGE_SIZE - offset, cs->seglen);
-        cs->seglen -= cs->len;
-        cs->addr += cs->len;
 
         return lock_request(cs->fc, cs->req);
 }
@@ -585,23 +656,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
         return ncpy;
 }
 
+static int fuse_check_page(struct page *page)
+{
+        if (page_mapcount(page) ||
+            page->mapping != NULL ||
+            page_count(page) != 1 ||
+            (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+             ~(1 << PG_locked |
+               1 << PG_referenced |
+               1 << PG_uptodate |
+               1 << PG_lru |
+               1 << PG_active |
+               1 << PG_reclaim))) {
+                printk(KERN_WARNING "fuse: trying to steal weird page\n");
+                printk(KERN_WARNING "  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+                return 1;
+        }
+        return 0;
+}
+
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+        int err;
+        struct page *oldpage = *pagep;
+        struct page *newpage;
+        struct pipe_buffer *buf = cs->pipebufs;
+        struct address_space *mapping;
+        pgoff_t index;
+
+        unlock_request(cs->fc, cs->req);
+        fuse_copy_finish(cs);
+
+        err = buf->ops->confirm(cs->pipe, buf);
+        if (err)
+                return err;
+
+        BUG_ON(!cs->nr_segs);
+        cs->currbuf = buf;
+        cs->len = buf->len;
+        cs->pipebufs++;
+        cs->nr_segs--;
+
+        if (cs->len != PAGE_SIZE)
+                goto out_fallback;
+
+        if (buf->ops->steal(cs->pipe, buf) != 0)
+                goto out_fallback;
+
+        newpage = buf->page;
+
+        if (WARN_ON(!PageUptodate(newpage)))
+                return -EIO;
+
+        ClearPageMappedToDisk(newpage);
+
+        if (fuse_check_page(newpage) != 0)
+                goto out_fallback_unlock;
+
+        mapping = oldpage->mapping;
+        index = oldpage->index;
+
+        /*
+         * This is a new and locked page, it shouldn't be mapped or
+         * have any special flags on it
+         */
+        if (WARN_ON(page_mapped(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(page_has_private(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(PageMlocked(oldpage)))
+                goto out_fallback_unlock;
+
+        remove_from_page_cache(oldpage);
+        page_cache_release(oldpage);
+
+        err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
+        if (err) {
+                printk(KERN_WARNING "fuse_try_move_page: failed to add page");
+                goto out_fallback_unlock;
+        }
+        page_cache_get(newpage);
+
+        if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+                lru_cache_add_file(newpage);
+
+        err = 0;
+        spin_lock(&cs->fc->lock);
+        if (cs->req->aborted)
+                err = -ENOENT;
+        else
+                *pagep = newpage;
+        spin_unlock(&cs->fc->lock);
+
+        if (err) {
+                unlock_page(newpage);
+                page_cache_release(newpage);
+                return err;
+        }
+
+        unlock_page(oldpage);
+        page_cache_release(oldpage);
+        cs->len = 0;
+
+        return 0;
+
+out_fallback_unlock:
+        unlock_page(newpage);
+out_fallback:
+        cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+        cs->buf = cs->mapaddr + buf->offset;
+
+        err = lock_request(cs->fc, cs->req);
+        if (err)
+                return err;
+
+        return 1;
+}
+
+static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
+                         unsigned offset, unsigned count)
+{
+        struct pipe_buffer *buf;
+
+        if (cs->nr_segs == cs->pipe->buffers)
+                return -EIO;
+
+        unlock_request(cs->fc, cs->req);
+        fuse_copy_finish(cs);
+
+        buf = cs->pipebufs;
+        page_cache_get(page);
+        buf->page = page;
+        buf->offset = offset;
+        buf->len = count;
+
+        cs->pipebufs++;
+        cs->nr_segs++;
+        cs->len = 0;
+
+        return 0;
+}
+
 /*
  * Copy a page in the request to/from the userspace buffer. Must be
  * done atomically
  */
-static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
                           unsigned offset, unsigned count, int zeroing)
 {
+        int err;
+        struct page *page = *pagep;
+
         if (page && zeroing && count < PAGE_SIZE) {
                 void *mapaddr = kmap_atomic(page, KM_USER1);
                 memset(mapaddr, 0, PAGE_SIZE);
                 kunmap_atomic(mapaddr, KM_USER1);
         }
         while (count) {
-                if (!cs->len) {
-                        int err = fuse_copy_fill(cs);
-                        if (err)
-                                return err;
+                if (cs->write && cs->pipebufs && page) {
+                        return fuse_ref_page(cs, page, offset, count);
+                } else if (!cs->len) {
+                        if (cs->move_pages && page &&
+                            offset == 0 && count == PAGE_SIZE) {
+                                err = fuse_try_move_page(cs, pagep);
+                                if (err <= 0)
+                                        return err;
+                        } else {
+                                err = fuse_copy_fill(cs);
+                                if (err)
+                                        return err;
+                        }
                 }
                 if (page) {
                         void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -626,8 +852,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
         unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
 
         for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
-                struct page *page = req->pages[i];
-                int err = fuse_copy_page(cs, page, offset, count, zeroing);
+                int err;
+
+                err = fuse_copy_page(cs, &req->pages[i], offset, count,
+                                     zeroing);
                 if (err)
                         return err;
 
@@ -677,8 +905,8 @@ static int request_pending(struct fuse_conn *fc)
 
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         DECLARE_WAITQUEUE(wait, current);
 
@@ -704,11 +932,10 @@ __acquires(&fc->lock)
  *
  * Called with fc->lock held, releases it
  */
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
-                               const struct iovec *iov, unsigned long nr_segs)
-__releases(&fc->lock)
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+                               size_t nbytes, struct fuse_req *req)
+__releases(fc->lock)
 {
-        struct fuse_copy_state cs;
         struct fuse_in_header ih;
         struct fuse_interrupt_in arg;
         unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -724,14 +951,13 @@ __releases(&fc->lock)
         arg.unique = req->in.h.unique;
 
         spin_unlock(&fc->lock);
-        if (iov_length(iov, nr_segs) < reqsize)
+        if (nbytes < reqsize)
                 return -EINVAL;
 
-        fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
-        err = fuse_copy_one(&cs, &ih, sizeof(ih));
+        err = fuse_copy_one(cs, &ih, sizeof(ih));
         if (!err)
-                err = fuse_copy_one(&cs, &arg, sizeof(arg));
-        fuse_copy_finish(&cs);
+                err = fuse_copy_one(cs, &arg, sizeof(arg));
+        fuse_copy_finish(cs);
 
         return err ? err : reqsize;
 }
@@ -745,18 +971,13 @@ __releases(&fc->lock)
  * request_end(). Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
-static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
-                             unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+                                struct fuse_copy_state *cs, size_t nbytes)
 {
         int err;
         struct fuse_req *req;
         struct fuse_in *in;
-        struct fuse_copy_state cs;
         unsigned reqsize;
-        struct file *file = iocb->ki_filp;
-        struct fuse_conn *fc = fuse_get_conn(file);
-        if (!fc)
-                return -EPERM;
 
  restart:
         spin_lock(&fc->lock);
@@ -776,7 +997,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
         if (!list_empty(&fc->interrupts)) {
                 req = list_entry(fc->interrupts.next, struct fuse_req,
                                  intr_entry);
-                return fuse_read_interrupt(fc, req, iov, nr_segs);
+                return fuse_read_interrupt(fc, cs, nbytes, req);
         }
 
         req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -786,7 +1007,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
         in = &req->in;
         reqsize = in->h.len;
         /* If request is too large, reply with an error and restart the read */
-        if (iov_length(iov, nr_segs) < reqsize) {
+        if (nbytes < reqsize) {
                 req->out.h.error = -EIO;
                 /* SETXATTR is special, since it may contain too large data */
                 if (in->h.opcode == FUSE_SETXATTR)
@@ -795,12 +1016,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
                 goto restart;
         }
         spin_unlock(&fc->lock);
-        fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);
-        err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+        cs->req = req;
+        err = fuse_copy_one(cs, &in->h, sizeof(in->h));
         if (!err)
-                err = fuse_copy_args(&cs, in->numargs, in->argpages,
+                err = fuse_copy_args(cs, in->numargs, in->argpages,
                                      (struct fuse_arg *) in->args, 0);
-        fuse_copy_finish(&cs);
+        fuse_copy_finish(cs);
         spin_lock(&fc->lock);
         req->locked = 0;
         if (req->aborted) {
@@ -828,6 +1049,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
         return err;
 }
 
+static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+                             unsigned long nr_segs, loff_t pos)
+{
+        struct fuse_copy_state cs;
+        struct file *file = iocb->ki_filp;
+        struct fuse_conn *fc = fuse_get_conn(file);
+        if (!fc)
+                return -EPERM;
+
+        fuse_copy_init(&cs, fc, 1, iov, nr_segs);
+
+        return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
+}
+
+static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
+                                   struct pipe_buffer *buf)
+{
+        return 1;
+}
+
+static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
+        .can_merge = 0,
+        .map = generic_pipe_buf_map,
+        .unmap = generic_pipe_buf_unmap,
+        .confirm = generic_pipe_buf_confirm,
+        .release = generic_pipe_buf_release,
+        .steal = fuse_dev_pipe_buf_steal,
+        .get = generic_pipe_buf_get,
+};
+
+static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+                                    struct pipe_inode_info *pipe,
+                                    size_t len, unsigned int flags)
+{
+        int ret;
+        int page_nr = 0;
+        int do_wakeup = 0;
+        struct pipe_buffer *bufs;
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc = fuse_get_conn(in);
+        if (!fc)
+                return -EPERM;
+
+        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        if (!bufs)
+                return -ENOMEM;
+
+        fuse_copy_init(&cs, fc, 1, NULL, 0);
+        cs.pipebufs = bufs;
+        cs.pipe = pipe;
+        ret = fuse_dev_do_read(fc, in, &cs, len);
+        if (ret < 0)
+                goto out;
+
+        ret = 0;
+        pipe_lock(pipe);
+
+        if (!pipe->readers) {
+                send_sig(SIGPIPE, current, 0);
+                if (!ret)
+                        ret = -EPIPE;
+                goto out_unlock;
+        }
+
+        if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+                ret = -EIO;
+                goto out_unlock;
+        }
+
+        while (page_nr < cs.nr_segs) {
+                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+                struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+                buf->page = bufs[page_nr].page;
+                buf->offset = bufs[page_nr].offset;
+                buf->len = bufs[page_nr].len;
+                buf->ops = &fuse_dev_pipe_buf_ops;
+
+                pipe->nrbufs++;
+                page_nr++;
+                ret += buf->len;
+
+                if (pipe->inode)
+                        do_wakeup = 1;
+        }
+
+out_unlock:
+        pipe_unlock(pipe);
+
+        if (do_wakeup) {
+                smp_mb();
+                if (waitqueue_active(&pipe->wait))
+                        wake_up_interruptible(&pipe->wait);
+                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+        }
+
+out:
+        for (; page_nr < cs.nr_segs; page_nr++)
+                page_cache_release(bufs[page_nr].page);
+
+        kfree(bufs);
+        return ret;
+}
+
 static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
                             struct fuse_copy_state *cs)
 {
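
With fuse_dev_do_read() factored out, the new fuse_dev_splice_read() above can hand request pages straight to a pipe. From userspace that means a server may drain /dev/fuse with splice(2) instead of read(2); a hedged sketch (descriptor setup is assumed elsewhere, and the fd names are illustrative):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Move one request from the device into a pipe without bouncing it
     * through a userspace buffer; SPLICE_F_MOVE lets the kernel link
     * pages rather than copy them where it can. */
    static ssize_t read_request(int fuse_fd, int pipe_wr, size_t max_req)
    {
            return splice(fuse_fd, NULL, pipe_wr, NULL, max_req,
                          SPLICE_F_MOVE);
    }
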
@@ -924,6 +1249,199 @@ err:
         return err;
 }
 
+static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
+                             struct fuse_copy_state *cs)
+{
+        struct fuse_notify_store_out outarg;
+        struct inode *inode;
+        struct address_space *mapping;
+        u64 nodeid;
+        int err;
+        pgoff_t index;
+        unsigned int offset;
+        unsigned int num;
+        loff_t file_size;
+        loff_t end;
+
+        err = -EINVAL;
+        if (size < sizeof(outarg))
+                goto out_finish;
+
+        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+        if (err)
+                goto out_finish;
+
+        err = -EINVAL;
+        if (size - sizeof(outarg) != outarg.size)
+                goto out_finish;
+
+        nodeid = outarg.nodeid;
+
+        down_read(&fc->killsb);
+
+        err = -ENOENT;
+        if (!fc->sb)
+                goto out_up_killsb;
+
+        inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+        if (!inode)
+                goto out_up_killsb;
+
+        mapping = inode->i_mapping;
+        index = outarg.offset >> PAGE_CACHE_SHIFT;
+        offset = outarg.offset & ~PAGE_CACHE_MASK;
+        file_size = i_size_read(inode);
+        end = outarg.offset + outarg.size;
+        if (end > file_size) {
+                file_size = end;
+                fuse_write_update_size(inode, file_size);
+        }
+
+        num = outarg.size;
+        while (num) {
+                struct page *page;
+                unsigned int this_num;
+
+                err = -ENOMEM;
+                page = find_or_create_page(mapping, index,
+                                           mapping_gfp_mask(mapping));
+                if (!page)
+                        goto out_iput;
+
+                this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+                err = fuse_copy_page(cs, &page, offset, this_num, 0);
+                if (!err && offset == 0 && (num != 0 || file_size == end))
+                        SetPageUptodate(page);
+                unlock_page(page);
+                page_cache_release(page);
+
+                if (err)
+                        goto out_iput;
+
+                num -= this_num;
+                offset = 0;
+                index++;
+        }
+
+        err = 0;
+
+out_iput:
+        iput(inode);
+out_up_killsb:
+        up_read(&fc->killsb);
+out_finish:
+        fuse_copy_finish(cs);
+        return err;
+}
+
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+        int i;
+
+        for (i = 0; i < req->num_pages; i++) {
+                struct page *page = req->pages[i];
+                page_cache_release(page);
+        }
+}
+
+static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+                         struct fuse_notify_retrieve_out *outarg)
+{
+        int err;
+        struct address_space *mapping = inode->i_mapping;
+        struct fuse_req *req;
+        pgoff_t index;
+        loff_t file_size;
+        unsigned int num;
+        unsigned int offset;
+        size_t total_len = 0;
+
+        req = fuse_get_req(fc);
+        if (IS_ERR(req))
+                return PTR_ERR(req);
+
+        offset = outarg->offset & ~PAGE_CACHE_MASK;
+
+        req->in.h.opcode = FUSE_NOTIFY_REPLY;
+        req->in.h.nodeid = outarg->nodeid;
+        req->in.numargs = 2;
+        req->in.argpages = 1;
+        req->page_offset = offset;
+        req->end = fuse_retrieve_end;
+
+        index = outarg->offset >> PAGE_CACHE_SHIFT;
+        file_size = i_size_read(inode);
+        num = outarg->size;
+        if (outarg->offset > file_size)
+                num = 0;
+        else if (outarg->offset + num > file_size)
+                num = file_size - outarg->offset;
+
+        while (num) {
+                struct page *page;
+                unsigned int this_num;
+
+                page = find_get_page(mapping, index);
+                if (!page)
+                        break;
+
+                this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+                req->pages[req->num_pages] = page;
+                req->num_pages++;
+
+                num -= this_num;
+                total_len += this_num;
+        }
+        req->misc.retrieve_in.offset = outarg->offset;
+        req->misc.retrieve_in.size = total_len;
+        req->in.args[0].size = sizeof(req->misc.retrieve_in);
+        req->in.args[0].value = &req->misc.retrieve_in;
+        req->in.args[1].size = total_len;
+
+        err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
+        if (err)
+                fuse_retrieve_end(fc, req);
+
+        return err;
+}
+
+static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
+                                struct fuse_copy_state *cs)
+{
+        struct fuse_notify_retrieve_out outarg;
+        struct inode *inode;
+        int err;
+
+        err = -EINVAL;
+        if (size != sizeof(outarg))
+                goto copy_finish;
+
+        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+        if (err)
+                goto copy_finish;
+
+        fuse_copy_finish(cs);
+
+        down_read(&fc->killsb);
+        err = -ENOENT;
+        if (fc->sb) {
+                u64 nodeid = outarg.nodeid;
+
+                inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+                if (inode) {
+                        err = fuse_retrieve(fc, inode, &outarg);
+                        iput(inode);
+                }
+        }
+        up_read(&fc->killsb);
+
+        return err;
+
+copy_finish:
+        fuse_copy_finish(cs);
+        return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
                        unsigned int size, struct fuse_copy_state *cs)
 {
@@ -937,6 +1455,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
         case FUSE_NOTIFY_INVAL_ENTRY:
                 return fuse_notify_inval_entry(fc, size, cs);
 
+        case FUSE_NOTIFY_STORE:
+                return fuse_notify_store(fc, size, cs);
+
+        case FUSE_NOTIFY_RETRIEVE:
+                return fuse_notify_retrieve(fc, size, cs);
+
         default:
                 fuse_copy_finish(cs);
                 return -EINVAL;
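
The two new cases dispatch notifications that userspace initiates by writing to /dev/fuse with unique set to 0 and the notification code carried in the error field, exactly as fuse_dev_do_write() below decodes it. A hedged userspace sketch of a FUSE_NOTIFY_STORE message (struct layout as in <linux/fuse.h>; the helper name is illustrative):

    #include <linux/fuse.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/uio.h>

    static int notify_store(int fuse_fd, uint64_t nodeid, uint64_t offset,
                            const void *data, uint32_t size)
    {
            struct fuse_out_header oh;
            struct fuse_notify_store_out arg;
            struct iovec iov[3];

            memset(&arg, 0, sizeof(arg));
            arg.nodeid = nodeid;
            arg.offset = offset;
            arg.size = size;

            oh.unique = 0;                  /* 0 marks a notification */
            oh.error = FUSE_NOTIFY_STORE;   /* code rides in 'error' */
            oh.len = sizeof(oh) + sizeof(arg) + size;

            iov[0].iov_base = &oh;          iov[0].iov_len = sizeof(oh);
            iov[1].iov_base = &arg;         iov[1].iov_len = sizeof(arg);
            iov[2].iov_base = (void *)data; iov[2].iov_len = size;
            return writev(fuse_fd, iov, 3) < 0 ? -1 : 0;
    }
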
@@ -987,23 +1511,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
  * it from the list and copy the rest of the buffer to the request.
  * The request is finished by calling request_end()
  */
-static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
-                              unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+                                 struct fuse_copy_state *cs, size_t nbytes)
 {
         int err;
-        size_t nbytes = iov_length(iov, nr_segs);
         struct fuse_req *req;
         struct fuse_out_header oh;
-        struct fuse_copy_state cs;
-        struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
-        if (!fc)
-                return -EPERM;
 
-        fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
         if (nbytes < sizeof(struct fuse_out_header))
                 return -EINVAL;
 
-        err = fuse_copy_one(&cs, &oh, sizeof(oh));
+        err = fuse_copy_one(cs, &oh, sizeof(oh));
         if (err)
                 goto err_finish;
 
@@ -1016,7 +1534,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
  * and error contains notification code.
  */
         if (!oh.unique) {
-                err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+                err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
                 return err ? err : nbytes;
         }
 
@@ -1035,7 +1553,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 
         if (req->aborted) {
                 spin_unlock(&fc->lock);
-                fuse_copy_finish(&cs);
+                fuse_copy_finish(cs);
                 spin_lock(&fc->lock);
                 request_end(fc, req);
                 return -ENOENT;
@@ -1052,7 +1570,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
                         queue_interrupt(fc, req);
 
                 spin_unlock(&fc->lock);
-                fuse_copy_finish(&cs);
+                fuse_copy_finish(cs);
                 return nbytes;
         }
 
@@ -1060,11 +1578,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
         list_move(&req->list, &fc->io);
         req->out.h = oh;
         req->locked = 1;
-        cs.req = req;
+        cs->req = req;
+        if (!req->out.page_replace)
+                cs->move_pages = 0;
         spin_unlock(&fc->lock);
 
-        err = copy_out_args(&cs, &req->out, nbytes);
-        fuse_copy_finish(&cs);
+        err = copy_out_args(cs, &req->out, nbytes);
+        fuse_copy_finish(cs);
 
         spin_lock(&fc->lock);
         req->locked = 0;
@@ -1080,10 +1600,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
  err_unlock:
         spin_unlock(&fc->lock);
  err_finish:
-        fuse_copy_finish(&cs);
+        fuse_copy_finish(cs);
         return err;
 }
 
+static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+                              unsigned long nr_segs, loff_t pos)
+{
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
+        if (!fc)
+                return -EPERM;
+
+        fuse_copy_init(&cs, fc, 0, iov, nr_segs);
+
+        return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
+}
+
+static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
+                                     struct file *out, loff_t *ppos,
+                                     size_t len, unsigned int flags)
+{
+        unsigned nbuf;
+        unsigned idx;
+        struct pipe_buffer *bufs;
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc;
+        size_t rem;
+        ssize_t ret;
+
+        fc = fuse_get_conn(out);
+        if (!fc)
+                return -EPERM;
+
+        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        if (!bufs)
+                return -ENOMEM;
+
+        pipe_lock(pipe);
+        nbuf = 0;
+        rem = 0;
+        for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
+                rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+
+        ret = -EINVAL;
+        if (rem < len) {
+                pipe_unlock(pipe);
+                goto out;
+        }
+
+        rem = len;
+        while (rem) {
+                struct pipe_buffer *ibuf;
+                struct pipe_buffer *obuf;
+
+                BUG_ON(nbuf >= pipe->buffers);
+                BUG_ON(!pipe->nrbufs);
+                ibuf = &pipe->bufs[pipe->curbuf];
+                obuf = &bufs[nbuf];
+
+                if (rem >= ibuf->len) {
+                        *obuf = *ibuf;
+                        ibuf->ops = NULL;
+                        pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+                        pipe->nrbufs--;
+                } else {
+                        ibuf->ops->get(pipe, ibuf);
+                        *obuf = *ibuf;
+                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+                        obuf->len = rem;
+                        ibuf->offset += obuf->len;
+                        ibuf->len -= obuf->len;
+                }
+                nbuf++;
+                rem -= obuf->len;
+        }
+        pipe_unlock(pipe);
+
+        fuse_copy_init(&cs, fc, 0, NULL, nbuf);
+        cs.pipebufs = bufs;
+        cs.pipe = pipe;
+
+        if (flags & SPLICE_F_MOVE)
+                cs.move_pages = 1;
+
+        ret = fuse_dev_do_write(fc, &cs, len);
+
+        for (idx = 0; idx < nbuf; idx++) {
+                struct pipe_buffer *buf = &bufs[idx];
+                buf->ops->release(pipe, buf);
+        }
+out:
+        kfree(bufs);
+        return ret;
+}
+
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
         unsigned mask = POLLOUT | POLLWRNORM;
@@ -1109,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
  * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         while (!list_empty(head)) {
                 struct fuse_req *req;
@@ -1133,8 +1744,8 @@ __acquires(&fc->lock)
  * locked).
  */
 static void end_io_requests(struct fuse_conn *fc)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         while (!list_empty(&fc->io)) {
                 struct fuse_req *req =
@@ -1158,6 +1769,16 @@ __acquires(&fc->lock)
         }
 }
 
+static void end_queued_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+        fc->max_background = UINT_MAX;
+        flush_bg_queue(fc);
+        end_requests(fc, &fc->pending);
+        end_requests(fc, &fc->processing);
+}
+
 /*
  * Abort all requests.
  *
@@ -1184,8 +1805,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
                 fc->connected = 0;
                 fc->blocked = 0;
                 end_io_requests(fc);
-                end_requests(fc, &fc->pending);
-                end_requests(fc, &fc->processing);
+                end_queued_requests(fc);
                 wake_up_all(&fc->waitq);
                 wake_up_all(&fc->blocked_waitq);
                 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1200,8 +1820,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)
         if (fc) {
                 spin_lock(&fc->lock);
                 fc->connected = 0;
-                end_requests(fc, &fc->pending);
-                end_requests(fc, &fc->processing);
+                fc->blocked = 0;
+                end_queued_requests(fc);
+                wake_up_all(&fc->blocked_waitq);
                 spin_unlock(&fc->lock);
                 fuse_conn_put(fc);
         }
@@ -1225,8 +1846,10 @@ const struct file_operations fuse_dev_operations = {
         .llseek         = no_llseek,
         .read           = do_sync_read,
         .aio_read       = fuse_dev_read,
+        .splice_read    = fuse_dev_splice_read,
         .write          = do_sync_write,
         .aio_write      = fuse_dev_write,
+        .splice_write   = fuse_dev_splice_write,
         .poll           = fuse_dev_poll,
         .release        = fuse_dev_release,
         .fasync         = fuse_dev_fasync,
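
Registering splice_write completes the zero-copy loop: a reply already sitting in a pipe can be spliced into /dev/fuse, and SPLICE_F_MOVE sets cs.move_pages so fuse_try_move_page() may steal page-aligned data straight into the page cache. A hedged sketch of the reply direction (fd names illustrative):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Push a fully assembled reply (header plus data) from a pipe into
     * the device; whole aligned pages may be moved instead of copied. */
    static ssize_t send_reply(int pipe_rd, int fuse_fd, size_t reply_len)
    {
            return splice(pipe_rd, NULL, fuse_fd, NULL, reply_len,
                          SPLICE_F_MOVE);
    }
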
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..c9627c95482d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask)
            exist. So if permissions are revoked this won't be
            noticed immediately, only after the attribute
            timeout has expired */
-        } else if (mask & MAY_ACCESS) {
+        } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
                 err = fuse_access(inode, mask);
         } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
                 if (!(inode->i_mode & S_IXUGO)) {
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
         return 0;
 }
 
-static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_dir_fsync(struct file *file, int datasync)
 {
-        /* nfsd can call this with no file */
-        return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
+        return fuse_fsync_common(file, datasync, 1);
 }
 
 static bool update_mtime(unsigned ivalid)
@@ -1271,21 +1270,18 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
         if (!fuse_allow_task(fc, current))
                 return -EACCES;
 
-        if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-                err = inode_change_ok(inode, attr);
-                if (err)
-                        return err;
-        }
+        if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+                attr->ia_valid |= ATTR_FORCE;
+
+        err = inode_change_ok(inode, attr);
+        if (err)
+                return err;
 
         if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc)
                 return 0;
 
-        if (attr->ia_valid & ATTR_SIZE) {
-                err = inode_newsize_ok(inode, attr->ia_size);
-                if (err)
-                        return err;
+        if (attr->ia_valid & ATTR_SIZE)
                 is_truncate = true;
-        }
 
         req = fuse_get_req(fc);
         if (IS_ERR(req))
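
fuse_dir_fsync() above follows the tree-wide 2.6.36 change that drops the dentry argument from ->fsync; the inode is reached through the struct file instead. A hedged sketch of the converted prototype for any filesystem (the myfs names are illustrative only):

    static int myfs_fsync(struct file *file, int datasync)
    {
            struct inode *inode = file->f_mapping->host;

            /* ... write back dirty myfs state for 'inode' ... */
            return 0;
    }
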
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..c8224587123f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
         fuse_release_nowrite(inode);
 }
 
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
-                      int isdir)
+int fuse_fsync_common(struct file *file, int datasync, int isdir)
 {
-        struct inode *inode = de->d_inode;
+        struct inode *inode = file->f_mapping->host;
         struct fuse_conn *fc = get_fuse_conn(inode);
         struct fuse_file *ff = file->private_data;
         struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
         return err;
 }
 
-static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_fsync(struct file *file, int datasync)
 {
-        return fuse_fsync_common(file, de, datasync, 0);
+        return fuse_fsync_common(file, datasync, 0);
 }
 
 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
         int i;
         size_t count = req->misc.read.in.size;
         size_t num_read = req->out.args[0].size;
-        struct inode *inode = req->pages[0]->mapping->host;
+        struct address_space *mapping = NULL;
 
-        /*
-         * Short read means EOF. If file size is larger, truncate it
-         */
-        if (!req->out.h.error && num_read < count) {
-                loff_t pos = page_offset(req->pages[0]) + num_read;
-                fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
-        }
+        for (i = 0; mapping == NULL && i < req->num_pages; i++)
+                mapping = req->pages[i]->mapping;
 
-        fuse_invalidate_attr(inode); /* atime changed */
+        if (mapping) {
+                struct inode *inode = mapping->host;
+
+                /*
+                 * Short read means EOF. If file size is larger, truncate it
+                 */
+                if (!req->out.h.error && num_read < count) {
+                        loff_t pos;
+
+                        pos = page_offset(req->pages[0]) + num_read;
+                        fuse_read_update_size(inode, pos,
+                                              req->misc.read.attr_ver);
+                }
+                fuse_invalidate_attr(inode); /* atime changed */
+        }
 
         for (i = 0; i < req->num_pages; i++) {
                 struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
                 else
                         SetPageError(page);
                 unlock_page(page);
+                page_cache_release(page);
         }
         if (req->ff)
                 fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
 
         req->out.argpages = 1;
         req->out.page_zeroing = 1;
+        req->out.page_replace = 1;
         fuse_read_fill(req, file, pos, count, FUSE_READ);
         req->misc.read.attr_ver = fuse_get_attr_version(fc);
         if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
                         return PTR_ERR(req);
                 }
         }
+        page_cache_get(page);
         req->pages[req->num_pages] = page;
         req->num_pages++;
         return 0;
@@ -695,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
         return 0;
 }
 
-static void fuse_write_update_size(struct inode *inode, loff_t pos)
+void fuse_write_update_size(struct inode *inode, loff_t pos)
 {
         struct fuse_conn *fc = get_fuse_conn(inode);
         struct fuse_inode *fi = get_fuse_inode(inode);
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
         nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
         npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
         npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
-        down_read(&current->mm->mmap_sem);
-        npages = get_user_pages(current, current->mm, user_addr, npages, !write,
-                                0, req->pages, NULL);
-        up_read(&current->mm->mmap_sem);
+        npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
         if (npages < 0)
                 return npages;
 
@@ -1136,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 /* Called under fc->lock, may release and reacquire it */
 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         struct fuse_inode *fi = get_fuse_inode(req->inode);
         loff_t size = i_size_read(req->inode);
@@ -1175,8 +1183,8 @@ __acquires(&fc->lock)
  * Called with fc->lock
  */
 void fuse_flush_writepages(struct inode *inode)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         struct fuse_conn *fc = get_fuse_conn(inode);
         struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
         while (iov_iter_count(&ii)) {
                 struct page *page = pages[page_idx++];
                 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
-                void *kaddr, *map;
+                void *kaddr;
 
-                kaddr = map = kmap(page);
+                kaddr = kmap(page);
 
                 while (todo) {
                         char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
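
fuse_get_user_pages() above is one of several conversions in this merge from open-coded mmap_sem handling to get_user_pages_fast(), which takes the lock itself only when the pages cannot be resolved locklessly. The pattern, side by side (a sketch of the same call site shown in the hunk):

    /* before: caller manages mmap_sem explicitly */
    down_read(&current->mm->mmap_sem);
    npages = get_user_pages(current, current->mm, user_addr, npages,
                            !write, 0, req->pages, NULL);
    up_read(&current->mm->mmap_sem);

    /* after: one call, same result vector */
    npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
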
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..57d4a3a0f102 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
         /** Zero partially or not copied pages */
         unsigned page_zeroing:1;
 
+        /** Pages may be replaced with new ones */
+        unsigned page_replace:1;
+
         /** Number or arguments */
         unsigned numargs;
 
@@ -269,6 +272,7 @@ struct fuse_req {
                         struct fuse_write_in in;
                         struct fuse_write_out out;
                 } write;
+                struct fuse_notify_retrieve_in retrieve_in;
                 struct fuse_lk_in lk_in;
         } misc;
 
@@ -568,8 +572,7 @@ void fuse_release_common(struct file *file, int opcode);
 /**
  * Send FSYNC or FSYNCDIR request
  */
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
-                      int isdir);
+int fuse_fsync_common(struct file *file, int datasync, int isdir);
 
 /**
  * Notify poll wakeup
@@ -746,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
+void fuse_write_update_size(struct inode *inode, loff_t pos);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ec14d19ce501..da9e6e11374c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -122,8 +122,10 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
         fuse_request_send_noreply(fc, req);
 }
 
-static void fuse_clear_inode(struct inode *inode)
+static void fuse_evict_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
+        end_writeback(inode);
         if (inode->i_sb->s_flags & MS_ACTIVE) {
                 struct fuse_conn *fc = get_fuse_conn(inode);
                 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -736,7 +738,7 @@ static const struct export_operations fuse_export_operations = {
 static const struct super_operations fuse_super_operations = {
         .alloc_inode    = fuse_alloc_inode,
         .destroy_inode  = fuse_destroy_inode,
-        .clear_inode    = fuse_clear_inode,
+        .evict_inode    = fuse_evict_inode,
         .drop_inode     = generic_delete_inode,
         .remount_fs     = fuse_remount_fs,
         .put_super      = fuse_put_super,
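
fuse_evict_inode() above shows the standard ->clear_inode to ->evict_inode conversion in this merge: work the VFS used to do before calling into the filesystem now happens inside the method itself. A hedged template for the same conversion elsewhere (myfs is illustrative):

    static void myfs_evict_inode(struct inode *inode)
    {
            truncate_inode_pages(&inode->i_data, 0); /* VFS did this before */
            end_writeback(inode);                    /* replaces clear_inode() */
            /* ... filesystem-specific teardown, as in fuse_evict_inode() ... */
    }
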
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index fe5df5457656..6bc9e3a5a693 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
         if (error < 0)
                 goto failed;
         inode->i_mode = mode;
+        inode->i_ctime = CURRENT_TIME;
         if (error == 0) {
                 posix_acl_release(acl);
                 acl = NULL;
@@ -201,7 +202,7 @@ generic_check_acl(struct inode *inode, int mask)
         return -EAGAIN;
 }
 
-struct xattr_handler generic_acl_access_handler = {
+const struct xattr_handler generic_acl_access_handler = {
         .prefix = POSIX_ACL_XATTR_ACCESS,
         .flags  = ACL_TYPE_ACCESS,
         .list   = generic_acl_list,
@@ -209,7 +210,7 @@ struct xattr_handler generic_acl_access_handler = {
         .set    = generic_acl_set,
 };
 
-struct xattr_handler generic_acl_default_handler = {
+const struct xattr_handler generic_acl_default_handler = {
         .prefix = POSIX_ACL_XATTR_DEFAULT,
         .flags  = ACL_TYPE_DEFAULT,
         .list   = generic_acl_list,
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index a47b43107112..cc9665522148 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -7,7 +7,6 @@ config GFS2_FS
         select IP_SCTP if DLM_SCTP
         select FS_POSIX_ACL
         select CRC32
-        select SLOW_WORK
         select QUOTACTL
         help
           A cluster filesystem.
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c24..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
                                  void *buffer, size_t size, int xtype)
 {
         struct inode *inode = dentry->d_inode;
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
         struct posix_acl *acl;
         int type;
         int error;
 
+        if (!sdp->sd_args.ar_posix_acl)
+                return -EOPNOTSUPP;
+
         type = gfs2_acl_type(name);
         if (type < 0)
                 return type;
@@ -335,7 +339,7 @@ out:
         return error;
 }
 
-struct xattr_handler gfs2_xattr_system_handler = {
+const struct xattr_handler gfs2_xattr_system_handler = {
         .prefix = XATTR_SYSTEM_PREFIX,
         .flags  = GFS2_EATYPE_SYS,
         .get    = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
 extern int gfs2_check_acl(struct inode *inode, int mask);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
-extern struct xattr_handler gfs2_xattr_system_handler;
+extern const struct xattr_handler gfs2_xattr_system_handler;
 
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b82dcf1..194fe16d8418 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page,
         if (ret <= 0)
                 return ret;
 
-        ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
-        if (ret == -EAGAIN)
-                ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-        return ret;
+        return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
 }
 
 /**
@@ -418,6 +415,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 {
         struct buffer_head *dibh;
+        u64 dsize = i_size_read(&ip->i_inode);
         void *kaddr;
         int error;
 
@@ -437,9 +435,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
                 return error;
 
         kaddr = kmap_atomic(page, KM_USER0);
-        memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-               ip->i_disksize);
-        memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
+        if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+                dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
+        memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+        memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
         kunmap_atomic(kaddr, KM_USER0);
         flush_dcache_page(page);
         brelse(dibh);
@@ -635,9 +634,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
                 }
         }
 
-        error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
-        if (error)
-                goto out_unlock;
+        alloc_required = gfs2_write_alloc_required(ip, pos, len);
 
         if (alloc_required || gfs2_is_jdata(ip))
                 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
@@ -698,8 +695,14 @@ out:
                 return 0;
 
         page_cache_release(page);
+
+        /*
+         * XXX(truncate): the call below should probably be replaced with
+         * a call to the gfs2-specific truncate blocks helper to actually
+         * release disk blocks..
+         */
         if (pos + len > ip->i_inode.i_size)
-                vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+                truncate_setsize(&ip->i_inode, ip->i_inode.i_size);
 out_endtrans:
         gfs2_trans_end(sdp);
 out_trans_fail:
@@ -1039,9 +1042,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
         if (rv != 1)
                 goto out; /* dio not valid, fall back to buffered i/o */
 
-        rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
-                                           iov, offset, nr_segs,
-                                           gfs2_get_block_direct, NULL);
+        rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+                                  offset, nr_segs, gfs2_get_block_direct,
+                                  NULL, NULL, 0);
 out:
         gfs2_glock_dq_m(1, &gh);
         gfs2_holder_uninit(&gh);
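
The stuffed_readpage() hunk here, like the gfs2_unstuffer_page() hunk in bmap.c below, stops trusting ip->i_disksize for the copy length: the size is re-read from the inode and clamped so a corrupt on-disk value can never copy past the dinode block. The bound itself, as a standalone sketch:

    #include <stdint.h>

    /* Copy length for stuffed data: limited both by the inode size and
     * by the space left in the block after the dinode header. */
    static uint64_t stuffed_copy_len(uint64_t isize, uint64_t bsize,
                                     uint64_t header)
    {
            uint64_t max = bsize - header;
            return isize > max ? max : isize;
    }
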
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5e411d5f4697..6f482809d1a3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -71,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 
         if (!PageUptodate(page)) {
                 void *kaddr = kmap(page);
+                u64 dsize = i_size_read(inode);
+
+                if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+                        dsize = dibh->b_size - sizeof(struct gfs2_dinode);
 
-                memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-                       ip->i_disksize);
-                memset(kaddr + ip->i_disksize, 0,
-                       PAGE_CACHE_SIZE - ip->i_disksize);
+                memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+                memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
                 kunmap(page);
 
                 SetPageUptodate(page);
@@ -1038,13 +1040,15 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1038 goto out; 1040 goto out;
1039 1041
1040 if (gfs2_is_stuffed(ip)) { 1042 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_dinode);
1041 ip->i_disksize = size; 1044 ip->i_disksize = size;
1042 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1043 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1044 gfs2_dinode_out(ip, dibh->b_data); 1047 gfs2_dinode_out(ip, dibh->b_data);
1045 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); 1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1046 error = 1; 1051 error = 1;
1047
1048 } else { 1052 } else {
1049 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
1050 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
@@ -1240,13 +1244,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1240 * @ip: the file being written to 1244 * @ip: the file being written to
1241 * @offset: the offset to write to 1245 * @offset: the offset to write to
1242 * @len: the number of bytes being written 1246 * @len: the number of bytes being written
1243 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1244 * 1247 *
1245 * Returns: errno 1248 * Returns: 1 if an alloc is required, 0 otherwise
1246 */ 1249 */
1247 1250
1248int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 1251int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1249 unsigned int len, int *alloc_required) 1252 unsigned int len)
1250{ 1253{
1251 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1254 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1252 struct buffer_head bh; 1255 struct buffer_head bh;
@@ -1254,26 +1257,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1254 u64 lblock, lblock_stop, size; 1257 u64 lblock, lblock_stop, size;
1255 u64 end_of_file; 1258 u64 end_of_file;
1256 1259
1257 *alloc_required = 0;
1258
1259 if (!len) 1260 if (!len)
1260 return 0; 1261 return 0;
1261 1262
1262 if (gfs2_is_stuffed(ip)) { 1263 if (gfs2_is_stuffed(ip)) {
1263 if (offset + len > 1264 if (offset + len >
1264 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) 1265 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1265 *alloc_required = 1; 1266 return 1;
1266 return 0; 1267 return 0;
1267 } 1268 }
1268 1269
1269 *alloc_required = 1;
1270 shift = sdp->sd_sb.sb_bsize_shift; 1270 shift = sdp->sd_sb.sb_bsize_shift;
1271 BUG_ON(gfs2_is_dir(ip)); 1271 BUG_ON(gfs2_is_dir(ip));
1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
1273 lblock = offset >> shift; 1273 lblock = offset >> shift;
1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1275 if (lblock_stop > end_of_file) 1275 if (lblock_stop > end_of_file)
1276 return 0; 1276 return 1;
1277 1277
1278 size = (lblock_stop - lblock) << shift; 1278 size = (lblock_stop - lblock) << shift;
1279 do { 1279 do {
@@ -1281,12 +1281,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1281 bh.b_size = size; 1281 bh.b_size = size;
1282 gfs2_block_map(&ip->i_inode, lblock, &bh, 0); 1282 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1283 if (!buffer_mapped(&bh)) 1283 if (!buffer_mapped(&bh))
1284 return 0; 1284 return 1;
1285 size -= bh.b_size; 1285 size -= bh.b_size;
1286 lblock += (bh.b_size >> ip->i_inode.i_blkbits); 1286 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1287 } while(size > 0); 1287 } while(size > 0);
1288 1288
1289 *alloc_required = 0;
1290 return 0; 1289 return 0;
1291} 1290}
1292 1291
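
With the out-parameter gone, gfs2_write_alloc_required() answers a yes/no question directly; since it has no failure mode left, every caller sheds an error branch (compare the gfs2_write_begin() and gfs2_page_mkwrite() hunks elsewhere in this patch). A compilable sketch of the caller's view, with invented stand-in functions:

    #include <stdbool.h>

    /* Invented stand-ins showing the caller's view of the API change. */
    static int  write_alloc_required_old(int *alloc_required)
    {
            *alloc_required = 1;
            return 0;               /* errno path that can never trigger */
    }
    static bool write_alloc_required_new(void)
    {
            return true;
    }

    static void caller(void)
    {
            int alloc_required;

            /* before: two results to check */
            if (write_alloc_required_old(&alloc_required) == 0 && alloc_required)
                    /* reserve blocks */;

            /* after: the answer is the return value */
            if (write_alloc_required_new())
                    /* reserve blocks */;
    }
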
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c983177e05ac..a20a5213135a 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -52,6 +52,6 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52int gfs2_truncatei_resume(struct gfs2_inode *ip);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53int gfs2_file_dealloc(struct gfs2_inode *ip);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
55 unsigned int len, int *alloc_required); 55 unsigned int len);
56 56
57#endif /* __BMAP_DOT_H__ */ 57#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..b9dd88a78dd4 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
392 unsigned totlen = be16_to_cpu(dent->de_rec_len); 392 unsigned totlen = be16_to_cpu(dent->de_rec_len);
393 393
394 if (gfs2_dirent_sentinel(dent)) 394 if (gfs2_dirent_sentinel(dent))
395 actual = GFS2_DIRENT_SIZE(0); 395 actual = 0;
396 if (totlen - actual >= required) 396 if (totlen - actual >= required)
397 return 1; 397 return 1;
398 return 0; 398 return 0;
@@ -955,7 +955,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
955 /* Change the pointers. 955 /* Change the pointers.
956 Don't bother distinguishing stuffed from non-stuffed. 956 Don't bother distinguishing stuffed from non-stuffed.
957 This code is complicated enough already. */ 957 This code is complicated enough already. */
958 lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL); 958 lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS);
959 if (!lp) {
960 error = -ENOMEM;
961 goto fail_brelse;
962 }
963
959 /* Change the pointers */ 964 /* Change the pointers */
960 for (x = 0; x < half_len; x++) 965 for (x = 0; x < half_len; x++)
961 lp[x] = cpu_to_be64(bn); 966 lp[x] = cpu_to_be64(bn);
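
Dropping __GFP_NOFAIL means the allocation can now return NULL, so the hunk adds the standard check-and-unwind. A userspace analogue of the same discipline; the label name mirrors the fail_brelse unwind above, everything else is invented:

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Without a "cannot fail" allocation flag, NULL must be handled. */
    static int split_leaf(size_t half_len)
    {
            uint64_t *lp = malloc(half_len * sizeof(*lp));
            int error = 0;

            if (!lp) {
                    error = -ENOMEM;
                    goto fail;
            }
            /* ... fill and publish the pointer block ... */
            free(lp);
            return 0;
    fail:
            return error;
    }
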
@@ -1063,7 +1068,9 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1063 1068
1064 /* Allocate both the "from" and "to" buffers in one big chunk */ 1069 /* Allocate both the "from" and "to" buffers in one big chunk */
1065 1070
1066 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); 1071 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS);
1072 if (!buf)
1073 return -ENOMEM;
1067 1074
1068 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1075 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
1069 error = gfs2_dir_read_data(dip, (char *)buf, 1076 error = gfs2_dir_read_data(dip, (char *)buf,
@@ -1231,6 +1238,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1231 return 0; 1238 return 0;
1232} 1239}
1233 1240
1241static void *gfs2_alloc_sort_buffer(unsigned size)
1242{
1243 void *ptr = NULL;
1244
1245 if (size < KMALLOC_MAX_SIZE)
1246 ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
1247 if (!ptr)
1248 ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
1249 return ptr;
1250}
1251
1252static void gfs2_free_sort_buffer(void *ptr)
1253{
1254 if (is_vmalloc_addr(ptr))
1255 vfree(ptr);
1256 else
1257 kfree(ptr);
1258}
1259
1234static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, 1260static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1235 filldir_t filldir, int *copied, unsigned *depth, 1261 filldir_t filldir, int *copied, unsigned *depth,
1236 u64 leaf_no) 1262 u64 leaf_no)
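
gfs2_alloc_sort_buffer() above prefers kmalloc for requests under KMALLOC_MAX_SIZE (with __GFP_NOWARN so an expected failure stays quiet) and falls back to __vmalloc; the paired free routine dispatches on is_vmalloc_addr(). When two allocators can own a pointer, the free path needs a reliable way to tell them apart. A hedged userspace sketch using an explicit ownership tag instead, all names invented and payload alignment glossed over:

    #include <stdlib.h>

    enum owner { OWNER_FAST, OWNER_FALLBACK };
    struct hdr { enum owner owner; };

    static void *sort_buf_alloc(size_t size)
    {
            struct hdr *h = NULL;

            if (size < 64 * 1024)           /* try the cheap allocator first */
                    h = malloc(sizeof(*h) + size);
            if (h) {
                    h->owner = OWNER_FAST;
            } else {                        /* quiet fallback for big buffers */
                    h = calloc(1, sizeof(*h) + size);
                    if (!h)
                            return NULL;
                    h->owner = OWNER_FALLBACK;
            }
            return h + 1;
    }

    static void sort_buf_free(void *ptr)
    {
            struct hdr *h = (struct hdr *)ptr - 1;

            /* the analogue of is_vmalloc_addr(): ask who owns it, then call
             * that owner's free routine (both happen to be free() here) */
            free(h);
    }
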
@@ -1271,7 +1297,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1271 * 99 is the maximum number of entries that can fit in a single 1297 * 99 is the maximum number of entries that can fit in a single
1272 * leaf block. 1298 * leaf block.
1273 */ 1299 */
1274 larr = vmalloc((leaves + entries + 99) * sizeof(void *)); 1300 larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
1275 if (!larr) 1301 if (!larr)
1276 goto out; 1302 goto out;
1277 darr = (const struct gfs2_dirent **)(larr + leaves); 1303 darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1282,7 +1308,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1282 do { 1308 do {
1283 error = get_leaf(ip, lfn, &bh); 1309 error = get_leaf(ip, lfn, &bh);
1284 if (error) 1310 if (error)
1285 goto out_kfree; 1311 goto out_free;
1286 lf = (struct gfs2_leaf *)bh->b_data; 1312 lf = (struct gfs2_leaf *)bh->b_data;
1287 lfn = be64_to_cpu(lf->lf_next); 1313 lfn = be64_to_cpu(lf->lf_next);
1288 if (lf->lf_entries) { 1314 if (lf->lf_entries) {
@@ -1291,7 +1317,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1291 gfs2_dirent_gather, NULL, &g); 1317 gfs2_dirent_gather, NULL, &g);
1292 error = PTR_ERR(dent); 1318 error = PTR_ERR(dent);
1293 if (IS_ERR(dent)) 1319 if (IS_ERR(dent))
1294 goto out_kfree; 1320 goto out_free;
1295 if (entries2 != g.offset) { 1321 if (entries2 != g.offset) {
1296 fs_warn(sdp, "Number of entries corrupt in dir " 1322 fs_warn(sdp, "Number of entries corrupt in dir "
1297 "leaf %llu, entries2 (%u) != " 1323 "leaf %llu, entries2 (%u) != "
@@ -1300,7 +1326,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1300 entries2, g.offset); 1326 entries2, g.offset);
1301 1327
1302 error = -EIO; 1328 error = -EIO;
1303 goto out_kfree; 1329 goto out_free;
1304 } 1330 }
1305 error = 0; 1331 error = 0;
1306 larr[leaf++] = bh; 1332 larr[leaf++] = bh;
@@ -1312,10 +1338,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1312 BUG_ON(entries2 != entries); 1338 BUG_ON(entries2 != entries);
1313 error = do_filldir_main(ip, offset, opaque, filldir, darr, 1339 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1314 entries, copied); 1340 entries, copied);
1315out_kfree: 1341out_free:
1316 for(i = 0; i < leaf; i++) 1342 for(i = 0; i < leaf; i++)
1317 brelse(larr[i]); 1343 brelse(larr[i]);
1318 vfree(larr); 1344 gfs2_free_sort_buffer(larr);
1319out: 1345out:
1320 return error; 1346 return error;
1321} 1347}
@@ -1475,7 +1501,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1475 inode = gfs2_inode_lookup(dir->i_sb, 1501 inode = gfs2_inode_lookup(dir->i_sb,
1476 be16_to_cpu(dent->de_type), 1502 be16_to_cpu(dent->de_type),
1477 be64_to_cpu(dent->de_inum.no_addr), 1503 be64_to_cpu(dent->de_inum.no_addr),
1478 be64_to_cpu(dent->de_inum.no_formal_ino), 0); 1504 be64_to_cpu(dent->de_inum.no_formal_ino));
1479 brelse(bh); 1505 brelse(bh);
1480 return inode; 1506 return inode;
1481 } 1507 }
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index c22c21174833..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -168,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
168 if (error) 168 if (error)
169 goto fail; 169 goto fail;
170 170
171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); 171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
172 if (IS_ERR(inode)) { 172 if (IS_ERR(inode)) {
173 error = PTR_ERR(inode); 173 error = PTR_ERR(inode);
174 goto fail; 174 goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..4edd662c8232 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
218 if (error) 218 if (error)
219 goto out_drop_write; 219 goto out_drop_write;
220 220
221 error = -EACCES;
222 if (!is_owner_or_cap(inode))
223 goto out;
224
225 error = 0;
221 flags = ip->i_diskflags; 226 flags = ip->i_diskflags;
222 new_flags = (flags & ~mask) | (reqflags & mask); 227 new_flags = (flags & ~mask) | (reqflags & mask);
223 if ((new_flags ^ flags) == 0) 228 if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
275{ 280{
276 struct inode *inode = filp->f_path.dentry->d_inode; 281 struct inode *inode = filp->f_path.dentry->d_inode;
277 u32 fsflags, gfsflags; 282 u32 fsflags, gfsflags;
283
278 if (get_user(fsflags, ptr)) 284 if (get_user(fsflags, ptr))
279 return -EFAULT; 285 return -EFAULT;
286
280 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 287 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
281 if (!S_ISDIR(inode->i_mode)) { 288 if (!S_ISDIR(inode->i_mode)) {
282 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 289 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -344,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
344 unsigned long last_index; 351 unsigned long last_index;
345 u64 pos = page->index << PAGE_CACHE_SHIFT; 352 u64 pos = page->index << PAGE_CACHE_SHIFT;
346 unsigned int data_blocks, ind_blocks, rblocks; 353 unsigned int data_blocks, ind_blocks, rblocks;
347 int alloc_required = 0;
348 struct gfs2_holder gh; 354 struct gfs2_holder gh;
349 struct gfs2_alloc *al; 355 struct gfs2_alloc *al;
350 int ret; 356 int ret;
@@ -357,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
357 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); 363 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
358 set_bit(GIF_SW_PAGED, &ip->i_flags); 364 set_bit(GIF_SW_PAGED, &ip->i_flags);
359 365
360 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 366 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
361 if (ret || !alloc_required)
362 goto out_unlock; 367 goto out_unlock;
363 ret = -ENOMEM; 368 ret = -ENOMEM;
364 al = gfs2_alloc_get(ip); 369 al = gfs2_alloc_get(ip);
@@ -547,9 +552,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
547 * Returns: errno 552 * Returns: errno
548 */ 553 */
549 554
550static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) 555static int gfs2_fsync(struct file *file, int datasync)
551{ 556{
552 struct inode *inode = dentry->d_inode; 557 struct inode *inode = file->f_mapping->host;
553 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 558 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
554 int ret = 0; 559 int ret = 0;
555 560
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4eb36b..9adf8f924e08 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
328} 328}
329 329
330/** 330/**
331 * do_error - Something unexpected has happened during a lock request
332 *
333 */
334
335static inline void do_error(struct gfs2_glock *gl, const int ret)
336{
337 struct gfs2_holder *gh, *tmp;
338
339 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
340 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
341 continue;
342 if (ret & LM_OUT_ERROR)
343 gh->gh_error = -EIO;
344 else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
345 gh->gh_error = GLR_TRYFAILED;
346 else
347 continue;
348 list_del_init(&gh->gh_list);
349 trace_gfs2_glock_queue(gh, 0);
350 gfs2_holder_wake(gh);
351 }
352}
353
354/**
331 * do_promote - promote as many requests as possible on the current queue 355 * do_promote - promote as many requests as possible on the current queue
332 * @gl: The glock 356 * @gl: The glock
333 * 357 *
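
do_error() above walks gl_holders with list_for_each_entry_safe() because it unlinks entries (list_del_init) and wakes them mid-walk; the _safe variant caches the next node before the loop body runs. A runnable sketch of the same idiom with a hand-rolled singly linked list, names hypothetical:

    #include <stdlib.h>

    struct waiter { int err; struct waiter *next; };

    /* Unlink and free every failing entry without ever touching a node
     * after it is gone: grab 'next' before the body runs, which is what
     * list_for_each_entry_safe() automates. */
    static void fail_waiters(struct waiter **head)
    {
            struct waiter *n, *next, **prev = head;

            for (n = *head; n; n = next) {
                    next = n->next;         /* the "safe" part */
                    if (n->err) {
                            *prev = next;   /* unlink, cf. list_del_init() */
                            free(n);        /* cf. gfs2_holder_wake() */
                    } else {
                            prev = &n->next;
                    }
            }
    }
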
@@ -375,36 +399,13 @@ restart:
375 } 399 }
376 if (gh->gh_list.prev == &gl->gl_holders) 400 if (gh->gh_list.prev == &gl->gl_holders)
377 return 1; 401 return 1;
402 do_error(gl, 0);
378 break; 403 break;
379 } 404 }
380 return 0; 405 return 0;
381} 406}
382 407
383/** 408/**
384 * do_error - Something unexpected has happened during a lock request
385 *
386 */
387
388static inline void do_error(struct gfs2_glock *gl, const int ret)
389{
390 struct gfs2_holder *gh, *tmp;
391
392 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
393 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
394 continue;
395 if (ret & LM_OUT_ERROR)
396 gh->gh_error = -EIO;
397 else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
398 gh->gh_error = GLR_TRYFAILED;
399 else
400 continue;
401 list_del_init(&gh->gh_list);
402 trace_gfs2_glock_queue(gh, 0);
403 gfs2_holder_wake(gh);
404 }
405}
406
407/**
408 * find_first_waiter - find the first gh that's waiting for the glock 409 * find_first_waiter - find the first gh that's waiting for the glock
409 * @gl: the glock 410 * @gl: the glock
410 */ 411 */
@@ -855,6 +856,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
855 gh->gh_flags = flags; 856 gh->gh_flags = flags;
856 gh->gh_iflags = 0; 857 gh->gh_iflags = 0;
857 gh->gh_ip = (unsigned long)__builtin_return_address(0); 858 gh->gh_ip = (unsigned long)__builtin_return_address(0);
859 if (gh->gh_owner_pid)
860 put_pid(gh->gh_owner_pid);
861 gh->gh_owner_pid = get_pid(task_pid(current));
858} 862}
859 863
860/** 864/**
@@ -1059,6 +1063,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
1059 1063
1060 spin_lock(&gl->gl_spin); 1064 spin_lock(&gl->gl_spin);
1061 add_to_queue(gh); 1065 add_to_queue(gh);
1066 if ((LM_FLAG_NOEXP & gh->gh_flags) &&
1067 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
1068 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1062 run_queue(gl, 1); 1069 run_queue(gl, 1);
1063 spin_unlock(&gl->gl_spin); 1070 spin_unlock(&gl->gl_spin);
1064 1071
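
The check added to gfs2_glock_nq() uses test_and_clear_bit() so that exactly one caller both observes GLF_FROZEN and takes responsibility for re-driving the state machine by setting GLF_REPLY_PENDING. A C11 analogue of the same read-modify-write, bit values invented:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define GLF_FROZEN        (1u << 0)   /* bit positions invented */
    #define GLF_REPLY_PENDING (1u << 1)

    /* Atomically clear FROZEN and report whether this caller was the one
     * to do it, so exactly one thread requeues the work. */
    static bool thaw_if_frozen(_Atomic unsigned int *flags)
    {
            unsigned int old = atomic_fetch_and(flags, ~GLF_FROZEN);

            if (old & GLF_FROZEN) {
                    atomic_fetch_or(flags, GLF_REPLY_PENDING);
                    return true;
            }
            return false;
    }
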
@@ -1316,6 +1323,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1316} 1323}
1317 1324
1318/** 1325/**
1326 * gfs2_should_freeze - Figure out if glock should be frozen
1327 * @gl: The glock in question
1328 *
1329 * Glocks are not frozen if (a) the result of the dlm operation is
1330 * an error, (b) the locking operation was an unlock operation or
1331 * (c) there is a "noexp" flagged request anywhere in the queue
1332 *
1333 * Returns: 1 if freezing should occur, 0 otherwise
1334 */
1335
1336static int gfs2_should_freeze(const struct gfs2_glock *gl)
1337{
1338 const struct gfs2_holder *gh;
1339
1340 if (gl->gl_reply & ~LM_OUT_ST_MASK)
1341 return 0;
1342 if (gl->gl_target == LM_ST_UNLOCKED)
1343 return 0;
1344
1345 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1346 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1347 continue;
1348 if (LM_FLAG_NOEXP & gh->gh_flags)
1349 return 0;
1350 }
1351
1352 return 1;
1353}
1354
1355/**
1319 * gfs2_glock_complete - Callback used by locking 1356 * gfs2_glock_complete - Callback used by locking
1320 * @gl: Pointer to the glock 1357 * @gl: Pointer to the glock
1321 * @ret: The return value from the dlm 1358 * @ret: The return value from the dlm
@@ -1325,18 +1362,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1325void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1362void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1326{ 1363{
1327 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1364 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1365
1328 gl->gl_reply = ret; 1366 gl->gl_reply = ret;
1367
1329 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1330 struct gfs2_holder *gh;
1331 spin_lock(&gl->gl_spin); 1369 spin_lock(&gl->gl_spin);
1332 gh = find_first_waiter(gl); 1370 if (gfs2_should_freeze(gl)) {
1333 if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
1334 (gl->gl_target != LM_ST_UNLOCKED)) ||
1335 ((ret & ~LM_OUT_ST_MASK) != 0))
1336 set_bit(GLF_FROZEN, &gl->gl_flags); 1371 set_bit(GLF_FROZEN, &gl->gl_flags);
1337 spin_unlock(&gl->gl_spin); 1372 spin_unlock(&gl->gl_spin);
1338 if (test_bit(GLF_FROZEN, &gl->gl_flags))
1339 return; 1373 return;
1374 }
1375 spin_unlock(&gl->gl_spin);
1340 } 1376 }
1341 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1342 gfs2_glock_hold(gl); 1378 gfs2_glock_hold(gl);
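
The rewrite replaces an inverted three-clause condition inlined in gfs2_glock_complete() with the named predicate gfs2_should_freeze(), and tightens the locking: the freeze decision and the early return now happen under gl_spin instead of re-reading the flag after the lock is dropped. Extracting a predicate like this is worth doing whenever a condition needs a comment; an illustrative sketch with invented names and bit values:

    #include <stdbool.h>

    enum { REPLY_ERR_MASK = 0xff00, TARGET_UNLOCKED = 0, FLAG_NOEXP = 1 };

    struct lockreq { int reply; int target; int flags; };

    /* One name, one comment, one place to test. */
    static bool should_freeze(const struct lockreq *r)
    {
            if (r->reply & REPLY_ERR_MASK)      /* (a) backend error */
                    return false;
            if (r->target == TARGET_UNLOCKED)   /* (b) an unlock */
                    return false;
            if (r->flags & FLAG_NOEXP)          /* (c) "noexp" queued */
                    return false;
            return true;
    }
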
@@ -1345,7 +1381,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1345} 1381}
1346 1382
1347 1383
1348static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) 1384static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
1349{ 1385{
1350 struct gfs2_glock *gl; 1386 struct gfs2_glock *gl;
1351 int may_demote; 1387 int may_demote;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3aac46f6853e..fdbf4b366fa5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,7 +12,6 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/slow-work.h>
16#include <linux/dlm.h> 15#include <linux/dlm.h>
17#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
18 17
@@ -383,7 +382,7 @@ struct gfs2_journal_extent {
383struct gfs2_jdesc { 382struct gfs2_jdesc {
384 struct list_head jd_list; 383 struct list_head jd_list;
385 struct list_head extent_list; 384 struct list_head extent_list;
386 struct slow_work jd_work; 385 struct work_struct jd_work;
387 struct inode *jd_inode; 386 struct inode *jd_inode;
388 unsigned long jd_flags; 387 unsigned long jd_flags;
389#define JDF_RECOVERY 1 388#define JDF_RECOVERY 1
@@ -439,9 +438,6 @@ struct gfs2_args {
439struct gfs2_tune { 438struct gfs2_tune {
440 spinlock_t gt_spin; 439 spinlock_t gt_spin;
441 440
442 unsigned int gt_incore_log_blocks;
443 unsigned int gt_log_flush_secs;
444
445 unsigned int gt_logd_secs; 441 unsigned int gt_logd_secs;
446 442
447 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 443 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +458,8 @@ enum {
462 SDF_SHUTDOWN = 2, 458 SDF_SHUTDOWN = 2,
463 SDF_NOBARRIERS = 3, 459 SDF_NOBARRIERS = 3,
464 SDF_NORECOVERY = 4, 460 SDF_NORECOVERY = 4,
461 SDF_DEMOTE = 5,
462 SDF_NOJOURNALID = 6,
465}; 463};
466 464
467#define GFS2_FSNAME_LEN 256 465#define GFS2_FSNAME_LEN 256
@@ -618,6 +616,7 @@ struct gfs2_sbd {
618 unsigned int sd_log_commited_databuf; 616 unsigned int sd_log_commited_databuf;
619 int sd_log_commited_revoke; 617 int sd_log_commited_revoke;
620 618
619 atomic_t sd_log_pinned;
621 unsigned int sd_log_num_buf; 620 unsigned int sd_log_num_buf;
622 unsigned int sd_log_num_revoke; 621 unsigned int sd_log_num_revoke;
623 unsigned int sd_log_num_rg; 622 unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
629 struct list_head sd_log_le_databuf; 628 struct list_head sd_log_le_databuf;
630 struct list_head sd_log_le_ordered; 629 struct list_head sd_log_le_ordered;
631 630
631 atomic_t sd_log_thresh1;
632 atomic_t sd_log_thresh2;
632 atomic_t sd_log_blks_free; 633 atomic_t sd_log_blks_free;
633 struct mutex sd_log_reserve_mutex; 634 wait_queue_head_t sd_log_waitq;
635 wait_queue_head_t sd_logd_waitq;
634 636
635 u64 sd_log_sequence; 637 u64 sd_log_sequence;
636 unsigned int sd_log_head; 638 unsigned int sd_log_head;
637 unsigned int sd_log_tail; 639 unsigned int sd_log_tail;
638 int sd_log_idle; 640 int sd_log_idle;
639 641
640 unsigned long sd_log_flush_time;
641 struct rw_semaphore sd_log_flush_lock; 642 struct rw_semaphore sd_log_flush_lock;
642 atomic_t sd_log_in_flight; 643 atomic_t sd_log_in_flight;
643 wait_queue_head_t sd_log_flush_wait; 644 wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf2694fb2b..08140f185a37 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -84,7 +84,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
84 struct gfs2_skip_data *data = opaque; 84 struct gfs2_skip_data *data = opaque;
85 85
86 if (ip->i_no_addr == data->no_addr) { 86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ 87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1; 88 data->skipped = 1;
89 return 0; 89 return 0;
90 } 90 }
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
158 * @sb: The super block 158 * @sb: The super block
159 * @no_addr: The inode number 159 * @no_addr: The inode number
160 * @type: The type of the inode 160 * @type: The type of the inode
161 * @skip_freeing: set this not return an inode if it is currently being freed.
162 * 161 *
163 * Returns: A VFS inode, or an error 162 * Returns: A VFS inode, or an error
164 */ 163 */
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
166struct inode *gfs2_inode_lookup(struct super_block *sb, 165struct inode *gfs2_inode_lookup(struct super_block *sb,
167 unsigned int type, 166 unsigned int type,
168 u64 no_addr, 167 u64 no_addr,
169 u64 no_formal_ino, int skip_freeing) 168 u64 no_formal_ino)
170{ 169{
171 struct inode *inode; 170 struct inode *inode;
172 struct gfs2_inode *ip; 171 struct gfs2_inode *ip;
173 struct gfs2_glock *io_gl; 172 struct gfs2_glock *io_gl = NULL;
174 int error; 173 int error;
175 174
176 if (skip_freeing) 175 inode = gfs2_iget(sb, no_addr);
177 inode = gfs2_iget_skip(sb, no_addr);
178 else
179 inode = gfs2_iget(sb, no_addr);
180 ip = GFS2_I(inode); 176 ip = GFS2_I(inode);
181 177
182 if (!inode) 178 if (!inode)
@@ -202,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
202 ip->i_iopen_gh.gh_gl->gl_object = ip; 198 ip->i_iopen_gh.gh_gl->gl_object = ip;
203 199
204 gfs2_glock_put(io_gl); 200 gfs2_glock_put(io_gl);
201 io_gl = NULL;
205 202
206 if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) 203 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
207 goto gfs2_nfsbypass; 204 goto gfs2_nfsbypass;
@@ -232,13 +229,107 @@ gfs2_nfsbypass:
232fail_glock: 229fail_glock:
233 gfs2_glock_dq(&ip->i_iopen_gh); 230 gfs2_glock_dq(&ip->i_iopen_gh);
234fail_iopen: 231fail_iopen:
232 if (io_gl)
233 gfs2_glock_put(io_gl);
234fail_put:
235 if (inode->i_state & I_NEW)
236 ip->i_gl->gl_object = NULL;
237 gfs2_glock_put(ip->i_gl);
238fail:
239 if (inode->i_state & I_NEW)
240 iget_failed(inode);
241 else
242 iput(inode);
243 return ERR_PTR(error);
244}
245
246/**
247 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
248 * and try to reclaim it by doing iput.
249 *
250 * This function assumes no rgrp locks are currently held.
251 *
252 * @sb: The super block
253 * no_addr: The inode number
254 *
255 */
256
257void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
258{
259 struct gfs2_sbd *sdp;
260 struct gfs2_inode *ip;
261 struct gfs2_glock *io_gl = NULL;
262 int error;
263 struct gfs2_holder gh;
264 struct inode *inode;
265
266 inode = gfs2_iget_skip(sb, no_addr);
267
268 if (!inode)
269 return;
270
271 /* If it's not a new inode, someone's using it, so leave it alone. */
272 if (!(inode->i_state & I_NEW)) {
273 iput(inode);
274 return;
275 }
276
277 ip = GFS2_I(inode);
278 sdp = GFS2_SB(inode);
279 ip->i_no_formal_ino = -1;
280
281 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
282 if (unlikely(error))
283 goto fail;
284 ip->i_gl->gl_object = ip;
285
286 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
287 if (unlikely(error))
288 goto fail_put;
289
290 set_bit(GIF_INVALID, &ip->i_flags);
291 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
292 &ip->i_iopen_gh);
293 if (unlikely(error))
294 goto fail_iopen;
295
296 ip->i_iopen_gh.gh_gl->gl_object = ip;
235 gfs2_glock_put(io_gl); 297 gfs2_glock_put(io_gl);
298 io_gl = NULL;
299
300 inode->i_mode = DT2IF(DT_UNKNOWN);
301
302 /*
303 * We must read the inode in order to work out its type in
304 * this case. Note that this doesn't happen often as we normally
305 * know the type beforehand. This code path only occurs during
306 * unlinked inode recovery (where it is safe to take this glock,
307 * which is not true in the general case).
308 */
309 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310 &gh);
311 if (unlikely(error))
312 goto fail_glock;
313
314 /* Inode is now uptodate */
315 gfs2_glock_dq_uninit(&gh);
316 gfs2_set_iop(inode);
317
318 /* The iput will cause it to be deleted. */
319 iput(inode);
320 return;
321
322fail_glock:
323 gfs2_glock_dq(&ip->i_iopen_gh);
324fail_iopen:
325 if (io_gl)
326 gfs2_glock_put(io_gl);
236fail_put: 327fail_put:
237 ip->i_gl->gl_object = NULL; 328 ip->i_gl->gl_object = NULL;
238 gfs2_glock_put(ip->i_gl); 329 gfs2_glock_put(ip->i_gl);
239fail: 330fail:
240 iget_failed(inode); 331 iget_failed(inode);
241 return ERR_PTR(error); 332 return;
242} 333}
243 334
244static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 335static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
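
Both lookup paths above unwind through a ladder of labels (fail_glock, fail_iopen, fail_put, fail), each undoing exactly one acquisition in reverse order; note how the patch NULLs io_gl after the early put so the ladder cannot drop the same reference twice. A runnable sketch of the idiom, resources and names hypothetical:

    #include <stdlib.h>

    struct ctx { char *meta; char *data; };

    /* Acquire in order, release in reverse on failure; NULLing a pointer
     * after an early release (as the patch does with io_gl) keeps a later
     * unwind from freeing the same resource twice. */
    static int ctx_init(struct ctx *c)
    {
            c->meta = malloc(64);
            if (!c->meta)
                    goto fail;
            c->data = malloc(4096);
            if (!c->data)
                    goto fail_meta;
            return 0;

    fail_meta:
            free(c->meta);
            c->meta = NULL;
    fail:
            return -1;
    }
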
@@ -862,7 +953,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
862 goto fail_gunlock2; 953 goto fail_gunlock2;
863 954
864 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 955 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
865 inum.no_formal_ino, 0); 956 inum.no_formal_ino);
866 if (IS_ERR(inode)) 957 if (IS_ERR(inode))
867 goto fail_gunlock2; 958 goto fail_gunlock2;
868 959
@@ -900,18 +991,29 @@ fail:
900 991
901static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 992static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
902{ 993{
994 struct inode *inode = &ip->i_inode;
903 struct buffer_head *dibh; 995 struct buffer_head *dibh;
904 int error; 996 int error;
905 997
906 error = gfs2_meta_inode_buffer(ip, &dibh); 998 error = gfs2_meta_inode_buffer(ip, &dibh);
907 if (!error) { 999 if (error)
908 error = inode_setattr(&ip->i_inode, attr); 1000 return error;
909 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); 1001
910 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1002 if ((attr->ia_valid & ATTR_SIZE) &&
911 gfs2_dinode_out(ip, dibh->b_data); 1003 attr->ia_size != i_size_read(inode)) {
912 brelse(dibh); 1004 error = vmtruncate(inode, attr->ia_size);
1005 if (error)
1006 return error;
913 } 1007 }
914 return error; 1008
1009 setattr_copy(inode, attr);
1010 mark_inode_dirty(inode);
1011
1012 gfs2_assert_warn(GFS2_SB(inode), !error);
1013 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1014 gfs2_dinode_out(ip, dibh->b_data);
1015 brelse(dibh);
1016 return 0;
915} 1017}
916 1018
917/** 1019/**
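
The rewritten __gfs2_setattr_simple() follows the 2.6.36 pattern for attribute changes: bail out before touching anything if the buffer read fails, handle the one step that can still fail (a size change via vmtruncate), then copy the remaining attributes with setattr_copy() and mark the inode dirty. A hedged sketch of that ordering, types simplified and names invented:

    #include <stdbool.h>

    struct attrs  { bool set_size; long size; long mode; };
    struct mynode { long size; long mode; bool dirty; };

    /* Fallible work first, infallible bookkeeping last. */
    static int apply_attrs(struct mynode *n, const struct attrs *a)
    {
            if (a->set_size && a->size != n->size) {
                    if (a->size < 0)        /* stand-in for vmtruncate() */
                            return -1;
                    n->size = a->size;
            }
            n->mode  = a->mode;             /* stand-in for setattr_copy() */
            n->dirty = true;                /* stand-in for mark_inode_dirty() */
            return 0;
    }
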
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
83 83
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino);
87 int skip_freeing); 87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
89 89
90extern int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5bf4b59d46e..ac750bd31a6f 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
168 return list_empty(&ai->ai_ail1_list); 168 return list_empty(&ai->ai_ail1_list);
169} 169}
170 170
171static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) 171static void gfs2_ail1_start(struct gfs2_sbd *sdp)
172{ 172{
173 struct list_head *head; 173 struct list_head *head;
174 u64 sync_gen; 174 u64 sync_gen;
175 struct list_head *first; 175 struct gfs2_ail *ai;
176 struct gfs2_ail *first_ai, *ai, *tmp;
177 int done = 0; 176 int done = 0;
178 177
179 gfs2_log_lock(sdp); 178 gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
184 } 183 }
185 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
186 185
187 first = head->prev;
188 first_ai = list_entry(first, struct gfs2_ail, ai_list);
189 first_ai->ai_sync_gen = sync_gen;
190 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
191
192 if (flags & DIO_ALL)
193 first = NULL;
194
195 while(!done) { 186 while(!done) {
196 if (first && (head->prev != first ||
197 gfs2_ail1_empty_one(sdp, first_ai, 0)))
198 break;
199
200 done = 1; 187 done = 1;
201 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { 188 list_for_each_entry_reverse(ai, head, ai_list) {
202 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
203 continue; 190 continue;
204 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
290 * flush time, so we ensure that we have just enough free blocks at all 277 * flush time, so we ensure that we have just enough free blocks at all
291 * times to avoid running out during a log flush. 278 * times to avoid running out during a log flush.
292 * 279 *
280 * We no longer flush the log here, instead we wake up logd to do that
281 * for us. To avoid the thundering herd and to ensure that we deal fairly
282 * with queued waiters, we use an exclusive wait. This means that when we
283 * get woken with enough journal space to get our reservation, we need to
284 * wake the next waiter on the list.
285 *
293 * Returns: errno 286 * Returns: errno
294 */ 287 */
295 288
296int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 289int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
297{ 290{
298 unsigned int try = 0;
299 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); 291 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
292 unsigned wanted = blks + reserved_blks;
293 DEFINE_WAIT(wait);
294 int did_wait = 0;
295 unsigned int free_blocks;
300 296
301 if (gfs2_assert_warn(sdp, blks) || 297 if (gfs2_assert_warn(sdp, blks) ||
302 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) 298 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
303 return -EINVAL; 299 return -EINVAL;
304 300retry:
305 mutex_lock(&sdp->sd_log_reserve_mutex); 301 free_blocks = atomic_read(&sdp->sd_log_blks_free);
306 gfs2_log_lock(sdp); 302 if (unlikely(free_blocks <= wanted)) {
307 while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { 303 do {
308 gfs2_log_unlock(sdp); 304 prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
309 gfs2_ail1_empty(sdp, 0); 305 TASK_UNINTERRUPTIBLE);
310 gfs2_log_flush(sdp, NULL); 306 wake_up(&sdp->sd_logd_waitq);
311 307 did_wait = 1;
312 if (try++) 308 if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
313 gfs2_ail1_start(sdp, 0); 309 io_schedule();
314 gfs2_log_lock(sdp); 310 free_blocks = atomic_read(&sdp->sd_log_blks_free);
311 } while(free_blocks <= wanted);
312 finish_wait(&sdp->sd_log_waitq, &wait);
315 } 313 }
316 atomic_sub(blks, &sdp->sd_log_blks_free); 314 if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
315 free_blocks - blks) != free_blocks)
316 goto retry;
317 trace_gfs2_log_blocks(sdp, -blks); 317 trace_gfs2_log_blocks(sdp, -blks);
318 gfs2_log_unlock(sdp); 318
319 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 /*
320 * If we waited, then so might others, wake them up _after_ we get
321 * our share of the log.
322 */
323 if (unlikely(did_wait))
324 wake_up(&sdp->sd_log_waitq);
320 325
321 down_read(&sdp->sd_log_flush_lock); 326 down_read(&sdp->sd_log_flush_lock);
322 327
323 return 0; 328 return 0;
324} 329}
325 330
326/**
327 * gfs2_log_release - Release a given number of log blocks
328 * @sdp: The GFS2 superblock
329 * @blks: The number of blocks
330 *
331 */
332
333void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
334{
335
336 gfs2_log_lock(sdp);
337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
339 gfs2_assert_withdraw(sdp,
340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
341 gfs2_log_unlock(sdp);
342 up_read(&sdp->sd_log_flush_lock);
343}
344
345static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) 331static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
346{ 332{
347 struct gfs2_journal_extent *je; 333 struct gfs2_journal_extent *je;
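
The new gfs2_log_reserve() above replaces a mutex plus spinlock with a lock-free scheme: read the free-block count, commit the reservation with atomic_cmpxchg(), and retry from the top if another CPU moved the counter in between; waiters queue exclusively, and each successful reserver passes the baton by waking the next waiter. A runnable C11 sketch of the compare-and-swap retry loop, simplified to fail rather than sleep:

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic unsigned int log_blks_free;

    /* Commit a reservation with compare-and-swap, retrying if the counter
     * moved underneath us; the real code keeps a reserve margin and sleeps
     * on the exclusive waitqueue instead of returning false. */
    static bool log_reserve(unsigned int blks)
    {
            unsigned int free_blocks = atomic_load(&log_blks_free);

            for (;;) {
                    if (free_blocks <= blks)
                            return false;
                    /* on failure, free_blocks is refreshed automatically */
                    if (atomic_compare_exchange_weak(&log_blks_free,
                                                     &free_blocks,
                                                     free_blocks - blks))
                            return true;
            }
    }
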
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
559 545
560 ail2_empty(sdp, new_tail); 546 ail2_empty(sdp, new_tail);
561 547
562 gfs2_log_lock(sdp);
563 atomic_add(dist, &sdp->sd_log_blks_free); 548 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist); 549 trace_gfs2_log_blocks(sdp, dist);
565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 550 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
566 gfs2_log_unlock(sdp); 551 sdp->sd_jdesc->jd_blocks);
567 552
568 sdp->sd_log_tail = new_tail; 553 sdp->sd_log_tail = new_tail;
569} 554}
@@ -610,16 +595,17 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
610 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 595 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
611 goto skip_barrier; 596 goto skip_barrier;
612 get_bh(bh); 597 get_bh(bh);
613 submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); 598 submit_bh(WRITE_BARRIER | REQ_META, bh);
614 wait_on_buffer(bh); 599 wait_on_buffer(bh);
615 if (buffer_eopnotsupp(bh)) { 600 if (buffer_eopnotsupp(bh)) {
616 clear_buffer_eopnotsupp(bh); 601 clear_buffer_eopnotsupp(bh);
617 set_buffer_uptodate(bh); 602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
618 set_bit(SDF_NOBARRIERS, &sdp->sd_flags); 604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
619 lock_buffer(bh); 605 lock_buffer(bh);
620skip_barrier: 606skip_barrier:
621 get_bh(bh); 607 get_bh(bh);
622 submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh); 608 submit_bh(WRITE_SYNC | REQ_META, bh);
623 wait_on_buffer(bh); 609 wait_on_buffer(bh);
624 } 610 }
625 if (!buffer_uptodate(bh)) 611 if (!buffer_uptodate(bh))
@@ -710,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
710 * 696 *
711 */ 697 */
712 698
713void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 699void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
714{ 700{
715 struct gfs2_ail *ai; 701 struct gfs2_ail *ai;
716 702
@@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
822 * @sdp: the filesystem 808 * @sdp: the filesystem
823 * @tr: the transaction 809 * @tr: the transaction
824 * 810 *
811 * We wake up gfs2_logd if the number of pinned blocks exceeds thresh1
812 * or the total number of used blocks (pinned blocks plus AIL blocks)
813 * is greater than thresh2.
814 *
815 * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
816 * journal size.
817 *
825 * Returns: errno 818 * Returns: errno
826 */ 819 */
827 820
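
The wake test in gfs2_log_commit() is driven by two per-journal thresholds, which the init_journal() hunk later in this patch initialises to 2/5 and 4/5 of jd_blocks (the comment above speaks of thirds; the code uses fifths). An illustrative sketch of the arithmetic:

    #include <stdbool.h>

    /* Wake policy from gfs2_log_commit(), with thresholds as set by
     * init_journal() below. */
    static bool logd_wake_needed(unsigned jd_blocks, unsigned pinned,
                                 unsigned blks_free)
    {
            unsigned thresh1 = 2 * jd_blocks / 5;   /* pinned-block trigger */
            unsigned thresh2 = 4 * jd_blocks / 5;   /* total-usage trigger */
            unsigned used = jd_blocks - blks_free;  /* pinned + AIL blocks */

            return pinned > thresh1 || used > thresh2;
    }
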
@@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
832 825
833 up_read(&sdp->sd_log_flush_lock); 826 up_read(&sdp->sd_log_flush_lock);
834 827
835 gfs2_log_lock(sdp); 828 if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
836 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) 829 ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
837 wake_up_process(sdp->sd_logd_process); 830 atomic_read(&sdp->sd_log_thresh2)))
838 gfs2_log_unlock(sdp); 831 wake_up(&sdp->sd_logd_waitq);
839} 832}
840 833
841/** 834/**
@@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
882{ 875{
883 gfs2_log_flush(sdp, NULL); 876 gfs2_log_flush(sdp, NULL);
884 for (;;) { 877 for (;;) {
885 gfs2_ail1_start(sdp, DIO_ALL); 878 gfs2_ail1_start(sdp);
886 if (gfs2_ail1_empty(sdp, DIO_ALL)) 879 if (gfs2_ail1_empty(sdp, DIO_ALL))
887 break; 880 break;
888 msleep(10); 881 msleep(10);
889 } 882 }
890} 883}
891 884
885static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
886{
887 return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
888}
889
890static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
891{
892 unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
893 return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
894}
892 895
893/** 896/**
894 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks 897 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
901int gfs2_logd(void *data) 904int gfs2_logd(void *data)
902{ 905{
903 struct gfs2_sbd *sdp = data; 906 struct gfs2_sbd *sdp = data;
904 unsigned long t; 907 unsigned long t = 1;
905 int need_flush; 908 DEFINE_WAIT(wait);
909 unsigned preflush;
906 910
907 while (!kthread_should_stop()) { 911 while (!kthread_should_stop()) {
908 /* Advance the log tail */
909 912
910 t = sdp->sd_log_flush_time + 913 preflush = atomic_read(&sdp->sd_log_pinned);
911 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; 914 if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
915 gfs2_ail1_empty(sdp, DIO_ALL);
916 gfs2_log_flush(sdp, NULL);
917 gfs2_ail1_empty(sdp, DIO_ALL);
918 }
912 919
913 gfs2_ail1_empty(sdp, DIO_ALL); 920 if (gfs2_ail_flush_reqd(sdp)) {
914 gfs2_log_lock(sdp); 921 gfs2_ail1_start(sdp);
915 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); 922 io_schedule();
916 gfs2_log_unlock(sdp); 923 gfs2_ail1_empty(sdp, 0);
917 if (need_flush || time_after_eq(jiffies, t)) {
918 gfs2_log_flush(sdp, NULL); 924 gfs2_log_flush(sdp, NULL);
919 sdp->sd_log_flush_time = jiffies; 925 gfs2_ail1_empty(sdp, DIO_ALL);
920 } 926 }
921 927
928 wake_up(&sdp->sd_log_waitq);
922 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; 929 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
923 if (freezing(current)) 930 if (freezing(current))
924 refrigerator(); 931 refrigerator();
925 schedule_timeout_interruptible(t); 932
933 do {
934 prepare_to_wait(&sdp->sd_logd_waitq, &wait,
935 TASK_INTERRUPTIBLE);
936 if (!gfs2_ail_flush_reqd(sdp) &&
937 !gfs2_jrnl_flush_reqd(sdp) &&
938 !kthread_should_stop())
939 t = schedule_timeout(t);
940 } while(t && !gfs2_ail_flush_reqd(sdp) &&
941 !gfs2_jrnl_flush_reqd(sdp) &&
942 !kthread_should_stop());
943 finish_wait(&sdp->sd_logd_waitq, &wait);
926 } 944 }
927 945
928 return 0; 946 return 0;
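
The new gfs2_logd() main loop sleeps on sd_logd_waitq via prepare_to_wait()/finish_wait(), re-testing its two work predicates around the timed sleep so a wakeup racing with the check is never lost and a spurious wakeup is harmless. A hedged pthread analogue of the same check-sleep-recheck loop, all names invented:

    #include <pthread.h>
    #include <stdbool.h>
    #include <time.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
    static bool work_pending;

    /* Sleep until woken or the interval expires; the predicate is always
     * re-tested under the same lock the waker uses. */
    static void daemon_wait(unsigned interval_sec)
    {
            struct timespec deadline;

            clock_gettime(CLOCK_REALTIME, &deadline);
            deadline.tv_sec += interval_sec;

            pthread_mutex_lock(&lock);
            while (!work_pending) {
                    if (pthread_cond_timedwait(&wake, &lock, &deadline))
                            break;  /* timed out: do periodic work anyway */
            }
            pthread_mutex_unlock(&lock);
    }
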
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,29 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
47 sdp->sd_log_head = sdp->sd_log_tail = value; 47 sdp->sd_log_head = sdp->sd_log_tail = value;
48} 48}
49 49
50unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 50extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); 54extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp);
56 55
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 57extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real); 58 struct buffer_head *real);
60void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 59extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
61extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
61 62
62static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) 63extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
63{ 64extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
64 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) 65extern int gfs2_logd(void *data);
65 __gfs2_log_flush(sbd, gl);
66}
67
68void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
69void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
70
71void gfs2_log_shutdown(struct gfs2_sbd *sdp);
72void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
73int gfs2_logd(void *data);
74 66
75#endif /* __LOG_DOT_H__ */ 67#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260fbea90..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
54 if (bd->bd_ail) 54 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
56 get_bh(bh); 56 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned);
57 trace_gfs2_pin(bd, 1); 58 trace_gfs2_pin(bd, 1);
58} 59}
59 60
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
94 trace_gfs2_pin(bd, 0); 95 trace_gfs2_pin(bd, 0);
95 gfs2_log_unlock(sdp); 96 gfs2_log_unlock(sdp);
96 unlock_buffer(bh); 97 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned);
97} 99}
98 100
99 101
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc704bb..b1e9630eb46a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,7 +15,6 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <asm/atomic.h> 17#include <asm/atomic.h>
18#include <linux/slow-work.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
21#include "incore.h" 20#include "incore.h"
@@ -24,6 +23,7 @@
24#include "util.h" 23#include "util.h"
25#include "glock.h" 24#include "glock.h"
26#include "quota.h" 25#include "quota.h"
26#include "recovery.h"
27 27
28static struct shrinker qd_shrinker = { 28static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 29 .shrink = gfs2_shrink_qd_memory,
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
94 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
95 goto fail; 95 goto fail;
96 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", 97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
98 sizeof(struct gfs2_glock) + 98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space), 99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once); 100 0, 0, gfs2_init_gl_aspace_once);
@@ -138,9 +138,11 @@ static int __init init_gfs2_fs(void)
138 if (error) 138 if (error)
139 goto fail_unregister; 139 goto fail_unregister;
140 140
141 error = slow_work_register_user(THIS_MODULE); 141 error = -ENOMEM;
142 if (error) 142 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 goto fail_slow; 143 WQ_NON_REENTRANT | WQ_RESCUER, 0);
144 if (!gfs_recovery_wq)
145 goto fail_wq;
144 146
145 gfs2_register_debugfs(); 147 gfs2_register_debugfs();
146 148
@@ -148,7 +150,7 @@ static int __init init_gfs2_fs(void)
148 150
149 return 0; 151 return 0;
150 152
151fail_slow: 153fail_wq:
152 unregister_filesystem(&gfs2meta_fs_type); 154 unregister_filesystem(&gfs2meta_fs_type);
153fail_unregister: 155fail_unregister:
154 unregister_filesystem(&gfs2_fs_type); 156 unregister_filesystem(&gfs2_fs_type);
@@ -190,7 +192,7 @@ static void __exit exit_gfs2_fs(void)
190 gfs2_unregister_debugfs(); 192 gfs2_unregister_debugfs();
191 unregister_filesystem(&gfs2_fs_type); 193 unregister_filesystem(&gfs2_fs_type);
192 unregister_filesystem(&gfs2meta_fs_type); 194 unregister_filesystem(&gfs2meta_fs_type);
193 slow_work_unregister_user(THIS_MODULE); 195 destroy_workqueue(gfs_recovery_wq);
194 196
195 kmem_cache_destroy(gfs2_quotad_cachep); 197 kmem_cache_destroy(gfs2_quotad_cachep);
196 kmem_cache_destroy(gfs2_rgrpd_cachep); 198 kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c80937a..f3b071f921aa 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,11 +34,10 @@
34 34
35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
36{ 36{
37 int err;
38 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
39 int nr_underway = 0; 38 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? 39 int write_op = REQ_META |
41 WRITE_SYNC_PLUG : WRITE)); 40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE);
42 41
43 BUG_ON(!PageLocked(page)); 42 BUG_ON(!PageLocked(page));
44 BUG_ON(!page_has_buffers(page)); 43 BUG_ON(!page_has_buffers(page));
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
86 } while (bh != head); 85 } while (bh != head);
87 unlock_page(page); 86 unlock_page(page);
88 87
89 err = 0;
90 if (nr_underway == 0) 88 if (nr_underway == 0)
91 end_page_writeback(page); 89 end_page_writeback(page);
92 90
93 return err; 91 return 0;
94} 92}
95 93
96const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
@@ -227,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
227 } 225 }
228 bh->b_end_io = end_buffer_read_sync; 226 bh->b_end_io = end_buffer_read_sync;
229 get_bh(bh); 227 get_bh(bh);
230 submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); 228 submit_bh(READ_SYNC | REQ_META, bh);
231 if (!(flags & DIO_WAIT)) 229 if (!(flags & DIO_WAIT))
232 return 0; 230 return 0;
233 231
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
313 struct gfs2_bufdata *bd = bh->b_private; 311 struct gfs2_bufdata *bd = bh->b_private;
314 312
315 if (test_clear_buffer_pinned(bh)) { 313 if (test_clear_buffer_pinned(bh)) {
314 atomic_dec(&sdp->sd_log_pinned);
316 list_del_init(&bd->bd_le.le_list); 315 list_del_init(&bd->bd_le.le_list);
317 if (meta) { 316 if (meta) {
318 gfs2_assert_warn(sdp, sdp->sd_log_num_buf); 317 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
@@ -433,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
433 if (buffer_uptodate(first_bh)) 432 if (buffer_uptodate(first_bh))
434 goto out; 433 goto out;
435 if (!buffer_locked(first_bh)) 434 if (!buffer_locked(first_bh))
436 ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); 435 ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
437 436
438 dblock++; 437 dblock++;
439 extlen--; 438 extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1309ed1c496..4d4b1e8ac64c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h>
21#include <linux/quotaops.h> 20#include <linux/quotaops.h>
22 21
23#include "gfs2.h" 22#include "gfs2.h"
@@ -57,8 +56,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
57{ 56{
58 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
59 58
60 gt->gt_incore_log_blocks = 1024;
61 gt->gt_logd_secs = 1;
62 gt->gt_quota_simul_sync = 64; 59 gt->gt_quota_simul_sync = 64;
63 gt->gt_quota_warn_period = 10; 60 gt->gt_quota_warn_period = 10;
64 gt->gt_quota_scale_num = 1; 61 gt->gt_quota_scale_num = 1;
@@ -78,7 +75,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
78 75
79 sb->s_fs_info = sdp; 76 sb->s_fs_info = sdp;
80 sdp->sd_vfs = sb; 77 sdp->sd_vfs = sb;
81 78 set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
82 gfs2_tune_init(&sdp->sd_tune); 79 gfs2_tune_init(&sdp->sd_tune);
83 80
84 init_waitqueue_head(&sdp->sd_glock_wait); 81 init_waitqueue_head(&sdp->sd_glock_wait);
@@ -101,14 +98,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
101 spin_lock_init(&sdp->sd_trunc_lock); 98 spin_lock_init(&sdp->sd_trunc_lock);
102 99
103 spin_lock_init(&sdp->sd_log_lock); 100 spin_lock_init(&sdp->sd_log_lock);
104 101 atomic_set(&sdp->sd_log_pinned, 0);
105 INIT_LIST_HEAD(&sdp->sd_log_le_buf); 102 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
106 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 103 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
107 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 104 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
108 INIT_LIST_HEAD(&sdp->sd_log_le_databuf); 105 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
109 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 106 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
110 107
111 mutex_init(&sdp->sd_log_reserve_mutex); 108 init_waitqueue_head(&sdp->sd_log_waitq);
109 init_waitqueue_head(&sdp->sd_logd_waitq);
112 INIT_LIST_HEAD(&sdp->sd_ail1_list); 110 INIT_LIST_HEAD(&sdp->sd_ail1_list);
113 INIT_LIST_HEAD(&sdp->sd_ail2_list); 111 INIT_LIST_HEAD(&sdp->sd_ail2_list);
114 112
@@ -276,7 +274,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
276 274
277 bio->bi_end_io = end_bio_io_page; 275 bio->bi_end_io = end_bio_io_page;
278 bio->bi_private = page; 276 bio->bi_private = page;
279 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); 277 submit_bio(READ_SYNC | REQ_META, bio);
280 wait_on_page_locked(page); 278 wait_on_page_locked(page);
281 bio_put(bio); 279 bio_put(bio);
282 if (!PageUptodate(page)) { 280 if (!PageUptodate(page)) {
@@ -487,7 +485,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
487 struct dentry *dentry; 485 struct dentry *dentry;
488 struct inode *inode; 486 struct inode *inode;
489 487
490 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 488 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
491 if (IS_ERR(inode)) { 489 if (IS_ERR(inode)) {
 		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
 		return PTR_ERR(inode);
@@ -674,7 +672,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 			break;
 
 		INIT_LIST_HEAD(&jd->extent_list);
-		slow_work_init(&jd->jd_work, &gfs2_recover_ops);
+		INIT_WORK(&jd->jd_work, gfs2_recover_func);
 		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
 		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
 			if (!jd->jd_inode)
@@ -733,6 +731,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_args.ar_spectator) {
 		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 	} else {
 		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
 			fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +770,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			goto fail_jinode_gh;
 		}
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 
 		/* Map the extents for this journal's blocks */
 		map_journal_extents(sdp);
@@ -779,7 +781,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_lockstruct.ls_first) {
 		unsigned int x;
 		for (x = 0; x < sdp->sd_journals; x++) {
-			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
+			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
+						     true);
 			if (error) {
 				fs_err(sdp, "error recovering journal %u: %d\n",
 				       x, error);
@@ -789,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 
 		gfs2_others_may_mount(sdp);
 	} else if (!sdp->sd_args.ar_spectator) {
-		error = gfs2_recover_journal(sdp->sd_jdesc);
+		error = gfs2_recover_journal(sdp->sd_jdesc, true);
 		if (error) {
 			fs_err(sdp, "error recovering my journal: %d\n", error);
 			goto fail_jinode_gh;
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 	if (undo)
 		goto fail_quotad;
 
-	sdp->sd_log_flush_time = jiffies;
-
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 	error = IS_ERR(p);
 	if (error) {
@@ -1049,7 +1050,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 			ret = match_int(&tmp[0], &option);
 			if (ret || option < 0)
 				goto hostdata_error;
-			ls->ls_jid = option;
+			if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags))
+				ls->ls_jid = option;
 			break;
 		case Opt_id:
 			/* Obsolete, but left for backward compat purposes */
@@ -1101,6 +1103,24 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 		lm->lm_unmount(sdp);
 }
 
+static int gfs2_journalid_wait(void *word)
+{
+	if (signal_pending(current))
+		return -EINTR;
+	schedule();
+	return 0;
+}
+
+static int wait_on_journal(struct gfs2_sbd *sdp)
+{
+	if (sdp->sd_args.ar_spectator)
+		return 0;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		return 0;
+
+	return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE);
+}
+
 void gfs2_online_uevent(struct gfs2_sbd *sdp)
 {
 	struct super_block *sb = sdp->sd_vfs;
@@ -1160,7 +1180,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 		GFS2_BASIC_BLOCK_SHIFT;
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
 
-	sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+	sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
 	sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
 	if (sdp->sd_args.ar_statfs_quantum) {
 		sdp->sd_tune.gt_statfs_slow = 0;
@@ -1193,6 +1213,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	if (error)
 		goto fail_locking;
 
+	error = wait_on_journal(sdp);
+	if (error)
+		goto fail_sb;
+
 	error = init_inodes(sdp, DO);
 	if (error)
 		goto fail_sb;
@@ -1323,7 +1347,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
 	args.ar_data = GFS2_DATA_DEFAULT;
-	args.ar_commit = 60;
+	args.ar_commit = 30;
 	args.ar_statfs_quantum = 30;
 	args.ar_quota_quantum = 60;
 	args.ar_errors = GFS2_ERRORS_DEFAULT;
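The SDF_NOJOURNALID handshake threaded through these ops_fstype.c hunks (wait_on_journal() sleeping at mount time, jid_store() in fs/gfs2/sys.c further down clearing the bit and waking the mounter) is the stock kernel wait_on_bit()/wake_up_bit() idiom. A minimal sketch of just that idiom, with illustrative names (my_bit_wait, wait_for_flag and release_flag are not GFS2 symbols):

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>

static int my_bit_wait(void *word)
{
	if (signal_pending(current))
		return -EINTR;
	schedule();
	return 0;
}

/* waiter: blocks while bit nr is set in *flags */
static int wait_for_flag(unsigned long *flags, int nr)
{
	return wait_on_bit(flags, nr, my_bit_wait, TASK_INTERRUPTIBLE);
}

/* waker: clears the bit and wakes anyone sleeping in wait_for_flag() */
static void release_flag(unsigned long *flags, int nr)
{
	clear_bit(nr, flags);
	smp_mb__after_clear_bit();	/* order the clear before the wake */
	wake_up_bit(flags, nr);
}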
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..1009be2c9737 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
+/*
+ * XXX(truncate): the truncate_setsize calls should be moved to the end.
+ */
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,10 +1084,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
 		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 		if (error)
 			return error;
-		error = vmtruncate(inode, attr->ia_size);
+		truncate_setsize(inode, attr->ia_size);
 		gfs2_trans_end(sdp);
-		if (error)
-			return error;
 	}
 
 	error = gfs2_truncatei(ip, attr->ia_size);
@@ -1133,8 +1134,16 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (error)
 		goto out_end_trans;
 
-	error = inode_setattr(inode, attr);
-	gfs2_assert_warn(sdp, !error);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_assert_warn(sdp, !error);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
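For reference, these hunks follow the 2.6.36 VFS split of inode_setattr() into an explicit size change plus setattr_copy()/mark_inode_dirty(); the same shape appears verbatim as hfsplus_setattr() near the end of this diff, and the GFS2 variants above differ only in doing the size change inside a transaction. A generic sketch of the pattern (example_setattr is an illustrative name, not a symbol from this commit):

#include <linux/fs.h>
#include <linux/mm.h>

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);	/* permission/validity checks */
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}

	setattr_copy(inode, attr);	/* copy uid/gid/mode/times into inode */
	mark_inode_dirty(inode);
	return 0;
}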
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad6ab17..1bc6b5695e6d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
 
-int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	unsigned blocksize, iblock, pos;
 	struct buffer_head *bh, *dibh;
 	struct page *page;
-	void *kaddr;
-	struct gfs2_quota *qp;
-	s64 value;
-	int err = -EIO;
+	void *kaddr, *ptr;
+	struct gfs2_quota q, *qp;
+	int err, nbytes;
 	u64 size;
 
 	if (gfs2_is_stuffed(ip))
 		gfs2_unstuff_dinode(ip, NULL);
 
+	memset(&q, 0, sizeof(struct gfs2_quota));
+	err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
+	if (err < 0)
+		return err;
+
+	err = -EIO;
+	qp = &q;
+	qp->qu_value = be64_to_cpu(qp->qu_value);
+	qp->qu_value += change;
+	qp->qu_value = cpu_to_be64(qp->qu_value);
+	qd->qd_qb.qb_value = qp->qu_value;
+	if (fdq) {
+		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+			qd->qd_qb.qb_warn = qp->qu_warn;
+		}
+		if (fdq->d_fieldmask & FS_DQ_BHARD) {
+			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+			qd->qd_qb.qb_limit = qp->qu_limit;
+		}
+	}
+
+	/* Write the quota into the quota file on disk */
+	ptr = qp;
+	nbytes = sizeof(struct gfs2_quota);
+get_a_page:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return -ENOMEM;
@@ -667,7 +692,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	if (!buffer_mapped(bh)) {
 		gfs2_block_map(inode, iblock, bh, 1);
 		if (!buffer_mapped(bh))
-			goto unlock;
+			goto unlock_out;
+		/* If it's a newly allocated disk block for quota, zero it */
+		if (buffer_new(bh))
+			zero_user(page, pos - blocksize, bh->b_size);
 	}
 
 	if (PageUptodate(page))
@@ -677,32 +705,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 		ll_rw_block(READ_META, 1, &bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh))
-			goto unlock;
+			goto unlock_out;
 	}
 
 	gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	qp = kaddr + offset;
-	value = (s64)be64_to_cpu(qp->qu_value) + change;
-	qp->qu_value = cpu_to_be64(value);
-	qd->qd_qb.qb_value = qp->qu_value;
-	if (fdq) {
-		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
-			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
-			qd->qd_qb.qb_warn = qp->qu_warn;
-		}
-		if (fdq->d_fieldmask & FS_DQ_BHARD) {
-			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
-			qd->qd_qb.qb_limit = qp->qu_limit;
-		}
-	}
+	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
+		nbytes = PAGE_CACHE_SIZE - offset;
+	memcpy(kaddr + offset, ptr, nbytes);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
+	unlock_page(page);
+	page_cache_release(page);
+
+	/* If quota straddles page boundary, we need to update the rest of the
+	 * quota at the beginning of the next page */
+	if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
+		ptr = ptr + nbytes;
+		nbytes = sizeof(struct gfs2_quota) - nbytes;
+		offset = 0;
+		index++;
+		goto get_a_page;
+	}
 
+	/* Update the disk inode timestamp and size (if extended) */
 	err = gfs2_meta_inode_buffer(ip, &dibh);
 	if (err)
-		goto unlock;
+		goto out;
 
 	size = loc + sizeof(struct gfs2_quota);
 	if (size > inode->i_size) {
@@ -715,7 +745,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	brelse(dibh);
 	mark_inode_dirty(inode);
 
-unlock:
+out:
+	return err;
+unlock_out:
 	unlock_page(page);
 	page_cache_release(page);
 	return err;
@@ -755,15 +787,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		goto out;
 
 	for (x = 0; x < num_qd; x++) {
-		int alloc_required;
-
 		offset = qd2offset(qda[x]);
-		error = gfs2_write_alloc_required(ip, offset,
-						  sizeof(struct gfs2_quota),
-						  &alloc_required);
-		if (error)
-			goto out_gunlock;
-		if (alloc_required)
+		if (gfs2_write_alloc_required(ip, offset,
+					      sizeof(struct gfs2_quota)))
 			nalloc++;
 	}
 
@@ -779,8 +805,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	 * rgrp since it won't be allocated during the transaction
 	 */
 	al->al_requested = 1;
-	/* +1 in the end for block requested above for unstuffing */
-	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1;
+	/* +3 in the end for unstuffing block, inode size update block
+	 * and another block in case quota straddles page boundary and
+	 * two blocks need to be updated instead of 1 */
+	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
 
 	if (nalloc)
 		al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1446,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 
 	memset(fqs, 0, sizeof(struct fs_quota_stat));
 	fqs->qs_version = FS_QSTAT_VERSION;
-	if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
-		fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
-	else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
-		fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+
+	switch (sdp->sd_args.ar_quota) {
+	case GFS2_QUOTA_ON:
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
+		/*FALLTHRU*/
+	case GFS2_QUOTA_ACCOUNT:
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
+		break;
+	case GFS2_QUOTA_OFF:
+		break;
+	}
+
 	if (sdp->sd_quota_inode) {
 		fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
 		fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
@@ -1432,8 +1468,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 	return 0;
 }
 
-static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
-			   struct fs_disk_quota *fdq)
+static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
+			  struct fs_disk_quota *fdq)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_quota_lvb *qlvb;
@@ -1462,7 +1498,7 @@ static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
 
 	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
 	fdq->d_version = FS_DQUOT_VERSION;
-	fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+	fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
 	fdq->d_id = id;
 	fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
 	fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
@@ -1477,8 +1513,8 @@ out:
 /* GFS2 only supports a subset of the XFS fields */
 #define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
 
-static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
-			   struct fs_disk_quota *fdq)
+static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
+			  struct fs_disk_quota *fdq)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1497,12 +1533,12 @@ static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
 	switch(type) {
 	case USRQUOTA:
 		type = QUOTA_USER;
-		if (fdq->d_flags != XFS_USER_QUOTA)
+		if (fdq->d_flags != FS_USER_QUOTA)
 			return -EINVAL;
 		break;
 	case GRPQUOTA:
 		type = QUOTA_GROUP;
-		if (fdq->d_flags != XFS_GROUP_QUOTA)
+		if (fdq->d_flags != FS_GROUP_QUOTA)
 			return -EINVAL;
 		break;
 	default:
@@ -1542,10 +1578,7 @@ static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
 		goto out_i;
 
 	offset = qd2offset(qd);
-	error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
-					  &alloc_required);
-	if (error)
-		goto out_i;
+	alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
 	if (alloc_required) {
 		al = gfs2_alloc_get(ip);
 		if (al == NULL)
@@ -1585,7 +1618,7 @@ out_put:
 const struct quotactl_ops gfs2_quotactl_ops = {
 	.quota_sync	= gfs2_quota_sync,
 	.get_xstate	= gfs2_quota_get_xstate,
-	.get_xquota	= gfs2_xquota_get,
-	.set_xquota	= gfs2_xquota_set,
+	.get_dqblk	= gfs2_get_dqblk,
+	.set_dqblk	= gfs2_set_dqblk,
 };
 
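The rewritten gfs2_adjust_quota() now handles an on-disk quota record that may cross a PAGE_CACHE_SIZE boundary: it copies nbytes into the first page, then loops back to get_a_page with offset 0 on the next index, which is also why do_sync() above reserves +3 blocks instead of +1. A standalone sketch of just that split-copy arithmetic (illustrative userspace C, not GFS2 code):

#include <string.h>

/* Copy rec_len bytes into fixed-size pages starting at (index, offset),
 * splitting the copy when the record straddles a page boundary -- the
 * same arithmetic as the get_a_page loop above. */
static void copy_across_pages(char **pages, size_t page_size,
			      const char *rec, size_t rec_len,
			      size_t index, size_t offset)
{
	size_t nbytes;

	while (rec_len) {
		nbytes = rec_len;
		if (offset + nbytes > page_size)
			nbytes = page_size - offset;	/* only what fits */
		memcpy(pages[index] + offset, rec, nbytes);
		rec += nbytes;
		rec_len -= nbytes;
		offset = 0;	/* remainder starts at the top of... */
		index++;	/* ...the next page */
	}
}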
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 195f60c8bd14..e7d236ca48bd 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 
 #endif /* __QUOTA_DOT_H__ */
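The gfs2_shrink_qd_memory() signature change here and in quota.c tracks the 2.6.35 shrinker API, which added a struct shrinker * context argument to the callback. A minimal registration sketch under that API (the example_* names are illustrative, not from this commit):

#include <linux/mm.h>

static int example_cached;	/* objects currently on our LRU */

static int example_shrink(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
	if (nr) {
		/* try to free up to nr cached objects here */
	}
	return example_cached;	/* report how many remain */
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&example_shrinker) at init,
 * unregister_shrinker(&example_shrinker) at exit. */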
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 4b9bece3d437..f7f89a94a5a4 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,7 +14,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -28,6 +27,8 @@
 #include "util.h"
 #include "dir.h"
 
+struct workqueue_struct *gfs_recovery_wq;
+
 int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
 			   struct buffer_head **bh)
 {
@@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
 	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
-static int gfs2_recover_get_ref(struct slow_work *work)
-{
-	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
-	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
-		return -EBUSY;
-	return 0;
-}
-
-static void gfs2_recover_put_ref(struct slow_work *work)
-{
-	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
-	clear_bit(JDF_RECOVERY, &jd->jd_flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
-}
-
-static void gfs2_recover_work(struct slow_work *work)
+void gfs2_recover_func(struct work_struct *work)
 {
 	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -578,7 +563,7 @@ static void gfs2_recover_work(struct slow_work *work)
 	gfs2_glock_dq_uninit(&j_gh);
 
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
-	return;
+	goto done;
 
 fail_gunlock_tr:
 	gfs2_glock_dq_uninit(&t_gh);
@@ -590,32 +575,35 @@ fail_gunlock_j:
 	}
 
 	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
-
 fail:
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+done:
+	clear_bit(JDF_RECOVERY, &jd->jd_flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
 }
 
-struct slow_work_ops gfs2_recover_ops = {
-	.owner	 = THIS_MODULE,
-	.get_ref = gfs2_recover_get_ref,
-	.put_ref = gfs2_recover_put_ref,
-	.execute = gfs2_recover_work,
-};
-
-
 static int gfs2_recovery_wait(void *word)
 {
 	schedule();
 	return 0;
 }
 
-int gfs2_recover_journal(struct gfs2_jdesc *jd)
+int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
 {
 	int rv;
-	rv = slow_work_enqueue(&jd->jd_work);
-	if (rv)
-		return rv;
-	wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
+
+	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
+		return -EBUSY;
+
+	/* we have JDF_RECOVERY, queue should always succeed */
+	rv = queue_work(gfs_recovery_wq, &jd->jd_work);
+	BUG_ON(!rv);
+
+	if (wait)
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
+			    TASK_UNINTERRUPTIBLE);
+
 	return 0;
 }
 
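With slow_work gone, recovery uses a plain workqueue plus the JDF_RECOVERY bit as a single-instance guard: whoever wins test_and_set_bit() owns the queueing, and the work function clears the bit and wakes synchronous callers. A sketch of that pattern in isolation (my_job and the other names are illustrative, not GFS2 symbols):

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

struct my_job {
	unsigned long flags;
#define JOB_RUNNING 0
	struct work_struct work;	/* INIT_WORK(&job->work, my_work_func) */
};

static int my_wait(void *word)
{
	schedule();
	return 0;
}

static void my_work_func(struct work_struct *work)
{
	struct my_job *job = container_of(work, struct my_job, work);

	/* ... do the actual work ... */

	clear_bit(JOB_RUNNING, &job->flags);	/* done: allow requeueing */
	smp_mb__after_clear_bit();
	wake_up_bit(&job->flags, JOB_RUNNING);
}

static int my_job_kick(struct workqueue_struct *wq, struct my_job *job,
		       bool wait)
{
	if (test_and_set_bit(JOB_RUNNING, &job->flags))
		return -EBUSY;		/* already queued or running */

	queue_work(wq, &job->work);	/* cannot fail: we own the bit */

	if (wait)
		wait_on_bit(&job->flags, JOB_RUNNING, my_wait,
			    TASK_UNINTERRUPTIBLE);
	return 0;
}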
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 1616ac22569a..2226136c7647 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -12,6 +12,8 @@
 
 #include "incore.h"
 
+extern struct workqueue_struct *gfs_recovery_wq;
+
 static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
 {
 	if (++*blk == sdp->sd_jdesc->jd_blocks)
@@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 
 extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
 			   struct gfs2_log_header_host *head);
-extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-extern struct slow_work_ops gfs2_recover_ops;
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
+extern void gfs2_recover_func(struct work_struct *work);
 
 #endif /* __RECOVERY_DOT_H__ */
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 	if ((start + nr_sects) != blk) {
 		rv = blkdev_issue_discard(bdev, start,
 					  nr_sects, GFP_NOFS,
-					  DISCARD_FL_BARRIER);
+					  BLKDEV_IFL_WAIT |
+					  BLKDEV_IFL_BARRIER);
 		if (rv)
 			goto fail;
 		nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
 	}
 	if (nr_sects) {
 		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					  DISCARD_FL_BARRIER);
+					  BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (rv)
 			goto fail;
 	}
@@ -948,13 +949,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
  * @rgd: The rgrp
  *
- * Returns: The inode, if one has been found
+ * Returns: 0 if no error
+ *          The inode, if one has been found, in inode.
  */
 
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
-				     u64 skip)
+static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+			   u64 skip)
 {
-	struct inode *inode;
 	u32 goal = 0, block;
 	u64 no_addr;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +980,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
 		if (no_addr == skip)
 			continue;
 		*last_unlinked = no_addr;
-		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
-					  no_addr, -1, 1);
-		if (!IS_ERR(inode))
-			return inode;
+		return no_addr;
 	}
 
 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
-	return NULL;
+	return 0;
 }
 
 /**
@@ -1067,11 +1065,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
  * Try to acquire rgrp in way which avoids contending with others.
 *
 * Returns: errno
+ *          unlinked: the block address of an unlinked block to be reclaimed
 */
 
-static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
+			  u64 *last_unlinked)
 {
-	struct inode *inode = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1079,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	int loops = 0;
 	int error, rg_locked;
 
+	*unlinked = 0;
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
 
 	while (rgd) {
@@ -1096,19 +1096,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+			/* If the rg came in already locked, there's no
+			   way we can recover from a failed try_rgrp_unlink
+			   because that would require an iput which can only
+			   happen after the rgrp is unlocked. */
+			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
+							    ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (inode)
-				return inode;
+			if (*unlinked)
+				return -EAGAIN;
 			/* fall through */
 		case GLR_TRYFAILED:
 			rgd = recent_rgrp_next(rgd);
 			break;
 
 		default:
-			return ERR_PTR(error);
+			return error;
 		}
 	}
 
@@ -1130,12 +1135,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
+							    ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (inode)
-				return inode;
+			if (*unlinked)
+				return -EAGAIN;
 			break;
 
 		case GLR_TRYFAILED:
@@ -1143,7 +1149,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 			break;
 
 		default:
-			return ERR_PTR(error);
+			return error;
 		}
 
 		rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1158,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 
 		if (rgd == begin) {
 			if (++loops >= 3)
-				return ERR_PTR(-ENOSPC);
+				return -ENOSPC;
 			if (!skipped)
 				loops++;
 			flags = 0;
@@ -1172,7 +1178,7 @@ out:
 		forward_rgrp_set(sdp, rgd);
 	}
 
-	return NULL;
+	return 0;
 }
 
 /**
@@ -1186,9 +1192,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct inode *inode;
 	int error = 0;
-	u64 last_unlinked = NO_BLOCK;
+	u64 last_unlinked = NO_BLOCK, unlinked;
 
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
@@ -1204,17 +1209,27 @@ try_again:
 	if (error)
 		return error;
 
-	inode = get_local_rgrp(ip, &last_unlinked);
-	if (inode) {
+	/* Find an rgrp suitable for allocation.  If it encounters any unlinked
+	   dinodes along the way, error will equal -EAGAIN and unlinked will
+	   contain its block address. We then need to look up that inode and
+	   try to free it, and try the allocation again. */
+	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
+	if (error) {
 		if (ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
-		if (IS_ERR(inode))
-			return PTR_ERR(inode);
-		iput(inode);
+		if (error != -EAGAIN)
+			return error;
+
+		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
+		/* regardless of whether or not gfs2_process_unlinked_inode
+		   was successful, we don't want to repeat it again. */
+		last_unlinked = unlinked;
 		gfs2_log_flush(sdp, NULL);
+		error = 0;
+
 		goto try_again;
 	}
-
+	/* no error, so we have the rgrp set in the inode's allocation. */
 	al->al_file = file;
 	al->al_line = line;
 
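These rgrp hunks retire the ERR_PTR-encoded return in favour of a plain errno plus an out parameter, so no inode reference has to be dropped while the rgrp glock is still held; the caller now retries on -EAGAIN instead of iput()ing. For contrast, a sketch of the ERR_PTR convention the old code used (this is the stock <linux/err.h> API; old_style and caller are illustrative names):

#include <linux/err.h>
#include <linux/fs.h>

static struct inode *old_style(void)
{
	return ERR_PTR(-ENOSPC);	/* error encoded in the pointer */
}

static int caller(void)
{
	struct inode *inode = old_style();

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* decode back to an errno */
	/* ... use inode, then iput(inode) ... */
	return 0;
}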
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac606b990..77cb9f830ee4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -342,8 +342,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
-	int ar;
-	int error;
 
 	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
 	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
@@ -352,13 +350,12 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 	}
 	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
-	if (!error && ar) {
+	if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
 		gfs2_consist_inode(ip);
-		error = -EIO;
+		return -EIO;
 	}
 
-	return error;
+	return 0;
 }
 
 /**
@@ -1113,7 +1110,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	int error;
 
 	spin_lock(&gt->gt_spin);
-	args.ar_commit = gt->gt_log_flush_secs;
+	args.ar_commit = gt->gt_logd_secs;
 	args.ar_quota_quantum = gt->gt_quota_quantum;
 	if (gt->gt_statfs_slow)
 		args.ar_statfs_quantum = 0;
@@ -1160,7 +1157,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	else
 		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 	spin_lock(&gt->gt_spin);
-	gt->gt_log_flush_secs = args.ar_commit;
+	gt->gt_logd_secs = args.ar_commit;
 	gt->gt_quota_quantum = args.ar_quota_quantum;
 	if (args.ar_statfs_quantum) {
 		gt->gt_statfs_slow = 0;
@@ -1191,7 +1188,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 * node for later deallocation.
 */
 
-static void gfs2_drop_inode(struct inode *inode)
+static int gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
@@ -1200,26 +1197,7 @@ static void gfs2_drop_inode(struct inode *inode)
 		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
 			clear_nlink(inode);
 	}
-	generic_drop_inode(inode);
-}
-
-/**
- * gfs2_clear_inode - Deallocate an inode when VFS is done with it
- * @inode: The VFS inode
- *
- */
-
-static void gfs2_clear_inode(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	ip->i_gl->gl_object = NULL;
-	gfs2_glock_put(ip->i_gl);
-	ip->i_gl = NULL;
-	if (ip->i_iopen_gh.gh_gl) {
-		ip->i_iopen_gh.gh_gl->gl_object = NULL;
-		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-	}
+	return generic_drop_inode(inode);
 }
 
 static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
@@ -1305,8 +1283,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (args->ar_discard)
 		seq_printf(s, ",discard");
-	val = sdp->sd_tune.gt_log_flush_secs;
-	if (val != 60)
+	val = sdp->sd_tune.gt_logd_secs;
+	if (val != 30)
 		seq_printf(s, ",commit=%d", val);
 	val = sdp->sd_tune.gt_statfs_quantum;
 	if (val != 30)
@@ -1334,7 +1312,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		seq_printf(s, ",nobarrier");
-
+	if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
+		seq_printf(s, ",demote_interface_used");
 	return 0;
 }
 
@@ -1346,13 +1325,16 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 * is safe, just less efficient.
 */
 
-static void gfs2_delete_inode(struct inode *inode)
+static void gfs2_evict_inode(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int error;
 
+	if (inode->i_nlink)
+		goto out;
+
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (unlikely(error)) {
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
@@ -1406,10 +1388,18 @@ out_unlock:
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
 	if (error && error != GLR_TRYFAILED && error != -EROFS)
-		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+		fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
 out:
 	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	end_writeback(inode);
+
+	ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+	ip->i_gl = NULL;
+	if (ip->i_iopen_gh.gh_gl) {
+		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+	}
 }
 
 static struct inode *gfs2_alloc_inode(struct super_block *sb)
@@ -1433,14 +1423,13 @@ const struct super_operations gfs2_super_ops = {
 	.alloc_inode		= gfs2_alloc_inode,
 	.destroy_inode		= gfs2_destroy_inode,
 	.write_inode		= gfs2_write_inode,
-	.delete_inode		= gfs2_delete_inode,
+	.evict_inode		= gfs2_evict_inode,
 	.put_super		= gfs2_put_super,
 	.sync_fs		= gfs2_sync_fs,
 	.freeze_fs		= gfs2_freeze,
 	.unfreeze_fs		= gfs2_unfreeze,
 	.statfs			= gfs2_statfs,
 	.remount_fs		= gfs2_remount_fs,
-	.clear_inode		= gfs2_clear_inode,
 	.drop_inode		= gfs2_drop_inode,
 	.show_options		= gfs2_show_options,
 };
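This is the 2.6.36 VFS consolidation: the separate ->delete_inode (unlinked inodes) and ->clear_inode (in-memory teardown) methods merge into a single ->evict_inode, which checks i_nlink itself and calls end_writeback() where the old code called clear_inode(). A bare skeleton of the new contract (illustrative, not GFS2 code):

#include <linux/fs.h>
#include <linux/mm.h>

static void example_evict_inode(struct inode *inode)
{
	if (inode->i_nlink == 0) {
		/* last link gone: free on-disk resources here
		 * (the old ->delete_inode path) */
	}

	truncate_inode_pages(&inode->i_data, 0);
	end_writeback(inode);		/* replaces clear_inode() */

	/* release fs-private in-memory state here
	 * (the old ->clear_inode path) */
}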
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
 extern const struct dentry_operations gfs2_dops;
-extern struct xattr_handler *gfs2_xattr_handlers[];
+extern const struct xattr_handler *gfs2_xattr_handlers[];
 
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 54fd98425991..ccacffd2faaa 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -25,6 +25,7 @@
 #include "quota.h"
 #include "util.h"
 #include "glops.h"
+#include "recovery.h"
 
 struct gfs2_attr {
 	struct attribute attr;
@@ -232,6 +233,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 	glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
 		return -EINVAL;
+	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
+		fs_info(sdp, "demote interface used\n");
 	rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
 	if (rv)
 		return rv;
@@ -323,6 +326,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_first);
 }
 
+static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned first;
+	int rv;
+
+	rv = sscanf(buf, "%u", &first);
+	if (rv != 1 || first > 1)
+		return -EINVAL;
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EBUSY;
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+		goto out;
+	rv = -EINVAL;
+	if (sdp->sd_args.ar_spectator)
+		goto out;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto out;
+	sdp->sd_lockstruct.ls_first = first;
+	rv = 0;
+out:
+	spin_unlock(&sdp->sd_jindex_spin);
+	return rv ? rv : len;
+}
+
 static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -350,7 +377,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		if (jd->jd_jid != jid)
 			continue;
-		rv = slow_work_enqueue(&jd->jd_work);
+		rv = gfs2_recover_journal(jd, false);
 		break;
 	}
 out:
@@ -375,14 +402,41 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
 }
 
+static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned jid;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EINVAL;
+	if (sdp->sd_args.ar_spectator)
+		goto out;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto out;
+	rv = -EBUSY;
+	if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+		goto out;
+	sdp->sd_lockstruct.ls_jid = jid;
+	smp_mb__after_clear_bit();
+	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+	rv = 0;
+out:
+	spin_unlock(&sdp->sd_jindex_spin);
+	return rv ? rv : len;
+}
+
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
 GDLM_ATTR(proto_name,		0444, proto_name_show,	NULL);
 GDLM_ATTR(block,		0644, block_show,	block_store);
 GDLM_ATTR(withdraw,		0644, withdraw_show,	withdraw_store);
-GDLM_ATTR(jid,			0444, jid_show,		NULL);
-GDLM_ATTR(first,		0444, lkfirst_show,	NULL);
+GDLM_ATTR(jid,			0644, jid_show,		jid_store);
+GDLM_ATTR(first,		0644, lkfirst_show,	lkfirst_store);
 GDLM_ATTR(first_done,		0444, first_done_show,	NULL);
 GDLM_ATTR(recover,		0600, NULL,		recover_store);
 GDLM_ATTR(recover_done,		0444, recover_done_show, NULL);
@@ -468,8 +522,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 } \
 TUNE_ATTR_2(name, name##_store)
 
-TUNE_ATTR(incore_log_blocks, 0);
-TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
 TUNE_ATTR(quota_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
@@ -481,8 +533,6 @@ TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
-	&tune_attr_incore_log_blocks.attr,
-	&tune_attr_log_flush_secs.attr,
 	&tune_attr_quota_warn_period.attr,
 	&tune_attr_quota_quantum.attr,
 	&tune_attr_max_readahead.attr,
@@ -566,7 +616,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
-	if (!sdp->sd_args.ar_spectator)
+	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
 		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid))
 		add_uevent_var(env, "UUID=%pUB", uuid);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
 #include "meta_io.h"
 #include "trans.h"
 #include "util.h"
+#include "trace_gfs2.h"
 
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
 	return error;
 }
 
+/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+	atomic_add(blks, &sdp->sd_log_blks_free);
+	trace_gfs2_log_blocks(sdp, blks);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+				  sdp->sd_jdesc->jd_blocks);
+	up_read(&sdp->sd_log_flush_lock);
+}
+
 void gfs2_trans_end(struct gfs2_sbd *sdp)
 {
 	struct gfs2_trans *tr = current->journal_info;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..776af6eb4bcb 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,6 +1296,7 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
+	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_location el;
 	struct buffer_head *dibh;
@@ -1321,14 +1322,25 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 		return error;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		error = inode_setattr(&ip->i_inode, attr);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
+	if (error)
+		goto out_trans_end;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_assert_warn(GFS2_SB(inode), !error);
 	}
 
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+
+out_trans_end:
 	gfs2_trans_end(sdp);
 	return error;
 }
@@ -1535,21 +1547,21 @@ out_alloc:
 	return error;
 }
 
-static struct xattr_handler gfs2_xattr_user_handler = {
+static const struct xattr_handler gfs2_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.flags  = GFS2_EATYPE_USR,
 	.get    = gfs2_xattr_get,
 	.set    = gfs2_xattr_set,
 };
 
-static struct xattr_handler gfs2_xattr_security_handler = {
+static const struct xattr_handler gfs2_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.flags  = GFS2_EATYPE_SECURITY,
 	.get    = gfs2_xattr_get,
 	.set    = gfs2_xattr_set,
 };
 
-struct xattr_handler *gfs2_xattr_handlers[] = {
+const struct xattr_handler *gfs2_xattr_handlers[] = {
 	&gfs2_xattr_user_handler,
 	&gfs2_xattr_security_handler,
 	&gfs2_xattr_system_handler,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index fe35e3b626c4..4f55651aaa51 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -193,7 +193,7 @@ extern int hfs_inode_setattr(struct dentry *, struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
 		__be32 log_size, __be32 phys_size, u32 clump_size);
 extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *);
-extern void hfs_clear_inode(struct inode *);
+extern void hfs_evict_inode(struct inode *);
 extern void hfs_delete_inode(struct inode *);
 
 /* attr.c */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 14f5cb1b9fdc..397b7adc7ce6 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -39,10 +39,19 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hfs_get_block,
 				&HFS_I(mapping->host)->phys_size);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
@@ -112,9 +121,24 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, hfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 static int hfs_writepages(struct address_space *mapping,
@@ -507,8 +531,10 @@ out:
 	return NULL;
 }
 
-void hfs_clear_inode(struct inode *inode)
+void hfs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
 		HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
 		iput(HFS_I(inode)->rsrc_inode);
@@ -588,13 +614,43 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 		attr->ia_mode = inode->i_mode & ~S_IWUGO;
 		attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
 	}
-	error = inode_setattr(inode, attr);
-	if (error)
-		return error;
 
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 	return 0;
 }
 
+static int hfs_file_fsync(struct file *filp, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block * sb;
+	int ret, err;
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	if (sb->s_dirt) {
+		lock_super(sb);
+		sb->s_dirt = 0;
+		if (!(sb->s_flags & MS_RDONLY))
+			hfs_mdb_commit(sb);
+		unlock_super(sb);
+	}
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
 
 static const struct file_operations hfs_file_operations = {
 	.llseek		= generic_file_llseek,
@@ -604,7 +660,7 @@ static const struct file_operations hfs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.splice_read	= generic_file_splice_read,
-	.fsync		= file_fsync,
+	.fsync		= hfs_file_fsync,
 	.open		= hfs_file_open,
 	.release	= hfs_file_release,
 };
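hfs (and hfsplus below) grow a private ->fsync here because the method dropped its dentry argument in this kernel window and the old generic file_fsync() helper was being retired; each filesystem now syncs the inode, its superblock, and the block device itself. The minimal shape, stripped of the hfs-specific superblock commit (illustrative, assuming only core VFS helpers):

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/writeback.h>

static int example_fsync(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;	/* no dentry arg now */
	int ret, err;

	ret = write_inode_now(inode, 0);		/* inode -> buffers */
	err = sync_blockdev(inode->i_sb->s_bdev);	/* buffers -> disk */
	if (!ret)
		ret = err;	/* report the first failure */
	return ret;
}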
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 0a81eb7111f3..34235d4bf08b 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -181,7 +181,7 @@ static const struct super_operations hfs_super_operations = {
 	.alloc_inode	= hfs_alloc_inode,
 	.destroy_inode	= hfs_destroy_inode,
 	.write_inode	= hfs_write_inode,
-	.clear_inode	= hfs_clear_inode,
+	.evict_inode	= hfs_evict_inode,
 	.put_super	= hfs_put_super,
 	.write_super	= hfs_write_super,
 	.sync_fs	= hfs_sync_fs,
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 const struct file_operations hfsplus_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= hfsplus_readdir,
-	.ioctl		= hfsplus_ioctl,
+	.unlocked_ioctl = hfsplus_ioctl,
 	.llseek		= generic_file_llseek,
 	.release	= hfsplus_dir_release,
 };
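This is the BKL-removal conversion: the locked ->ioctl method (inode plus file, returning int) becomes ->unlocked_ioctl (file only, returning long), which must take whatever locks it needs itself; the matching prototype change is in hfsplus_fs.h just below. A minimal sketch of the new shape (example_ioctl is an illustrative name; real handlers derive the inode from filp->f_path.dentry->d_inode):

#include <linux/fs.h>

static long example_ioctl(struct file *filp, unsigned int cmd,
			  unsigned long arg)
{
	/* no inode argument and no BKL: lock explicitly per command */
	switch (cmd) {
	default:
		return -ENOTTY;	/* not a command we handle */
	}
}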
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..dc856be3c2b0 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
 
 /* ioctl.c */
-int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		  unsigned long arg);
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 int hfsplus_setxattr(struct dentry *dentry, const char *name,
 		     const void *value, size_t size, int flags);
 ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
@@ -352,6 +351,7 @@ int hfsplus_show_options(struct seq_file *, struct vfsmount *);
 
 /* super.c */
 struct inode *hfsplus_iget(struct super_block *, unsigned long);
+int hfsplus_sync_fs(struct super_block *sb, int wait);
 
 /* tables.c */
 extern u16 hfsplus_case_fold_table[];
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..c5a979d62c65 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -31,10 +31,19 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
31 loff_t pos, unsigned len, unsigned flags, 31 loff_t pos, unsigned len, unsigned flags,
32 struct page **pagep, void **fsdata) 32 struct page **pagep, void **fsdata)
33{ 33{
34 int ret;
35
34 *pagep = NULL; 36 *pagep = NULL;
35 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
36 hfsplus_get_block, 38 hfsplus_get_block,
37 &HFSPLUS_I(mapping->host).phys_size); 39 &HFSPLUS_I(mapping->host).phys_size);
40 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize)
43 vmtruncate(mapping->host, isize);
44 }
45
46 return ret;
38} 47}
39 48
40static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) 49static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
@@ -105,9 +114,24 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
105{ 114{
106 struct file *file = iocb->ki_filp; 115 struct file *file = iocb->ki_filp;
107 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 116 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
117 ssize_t ret;
108 118
109 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 119 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
110 offset, nr_segs, hfsplus_get_block, NULL); 120 offset, nr_segs, hfsplus_get_block, NULL);
121
122 /*
123 * In case of error extending write may have instantiated a few
124 * blocks outside i_size. Trim these off again.
125 */
126 if (unlikely((rw & WRITE) && ret < 0)) {
127 loff_t isize = i_size_read(inode);
128 loff_t end = offset + iov_length(iov, nr_segs);
129
130 if (end > isize)
131 vmtruncate(inode, isize);
132 }
133
134 return ret;
111} 135}
112 136
113static int hfsplus_writepages(struct address_space *mapping, 137static int hfsplus_writepages(struct address_space *mapping,
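
The two hunks above (and the matching hpfs one near the end of this diff)
apply the same recovery idiom: when an extending write fails partway,
cont_write_begin() or blockdev_direct_IO() may already have instantiated
blocks beyond the old i_size, and those must be trimmed away again or they
leak. The idiom condensed into one hypothetical helper:

/* Sketch of the trim-on-error idiom used above: if a write meant to
 * extend the file failed, cut the file back to the size the VFS
 * still believes in, dropping any blocks allocated past it. */
static void example_write_failed(struct address_space *mapping,
                                 loff_t pos, unsigned len)
{
        struct inode *inode = mapping->host;
        loff_t isize = i_size_read(inode);

        if (pos + len > isize)
                vmtruncate(inode, isize);
}
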
@@ -266,9 +290,56 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
266 return 0; 290 return 0;
267} 291}
268 292
293static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
294{
295 struct inode *inode = dentry->d_inode;
296 int error;
297
298 error = inode_change_ok(inode, attr);
299 if (error)
300 return error;
301
302 if ((attr->ia_valid & ATTR_SIZE) &&
303 attr->ia_size != i_size_read(inode)) {
304 error = vmtruncate(inode, attr->ia_size);
305 if (error)
306 return error;
307 }
308
309 setattr_copy(inode, attr);
310 mark_inode_dirty(inode);
311 return 0;
312}
313
314static int hfsplus_file_fsync(struct file *filp, int datasync)
315{
316 struct inode *inode = filp->f_mapping->host;
317 struct super_block * sb;
318 int ret, err;
319
320 /* sync the inode to buffers */
321 ret = write_inode_now(inode, 0);
322
323 /* sync the superblock to buffers */
324 sb = inode->i_sb;
325 if (sb->s_dirt) {
326 if (!(sb->s_flags & MS_RDONLY))
327 hfsplus_sync_fs(sb, 1);
328 else
329 sb->s_dirt = 0;
330 }
331
332 /* .. finally sync the buffers to disk */
333 err = sync_blockdev(sb->s_bdev);
334 if (!ret)
335 ret = err;
336 return ret;
337}
338
269static const struct inode_operations hfsplus_file_inode_operations = { 339static const struct inode_operations hfsplus_file_inode_operations = {
270 .lookup = hfsplus_file_lookup, 340 .lookup = hfsplus_file_lookup,
271 .truncate = hfsplus_file_truncate, 341 .truncate = hfsplus_file_truncate,
342 .setattr = hfsplus_setattr,
272 .setxattr = hfsplus_setxattr, 343 .setxattr = hfsplus_setxattr,
273 .getxattr = hfsplus_getxattr, 344 .getxattr = hfsplus_getxattr,
274 .listxattr = hfsplus_listxattr, 345 .listxattr = hfsplus_listxattr,
@@ -282,10 +353,10 @@ static const struct file_operations hfsplus_file_operations = {
282 .aio_write = generic_file_aio_write, 353 .aio_write = generic_file_aio_write,
283 .mmap = generic_file_mmap, 354 .mmap = generic_file_mmap,
284 .splice_read = generic_file_splice_read, 355 .splice_read = generic_file_splice_read,
285 .fsync = file_fsync, 356 .fsync = hfsplus_file_fsync,
286 .open = hfsplus_file_open, 357 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 358 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 359 .unlocked_ioctl = hfsplus_ioctl,
289}; 360};
290 361
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 362struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
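
hfsplus_setattr() above follows the conversion applied throughout this merge
(hostfs, hpfs and hugetlbfs below receive the same treatment): the one-shot
inode_setattr() is replaced by an explicit vmtruncate() for size changes,
then setattr_copy() plus mark_inode_dirty() for the remaining attributes.
The generic shape, stripped of per-filesystem locking:

/* Generic shape of the inode_setattr() replacement; real versions
 * differ only in locking and in how the truncate is carried out. */
static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = inode_change_ok(inode, attr);   /* permission checks */
        if (error)
                return error;

        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
                error = vmtruncate(inode, attr->ia_size);
                if (error)
                        return error;
        }

        setattr_copy(inode, attr);      /* uid/gid/mode/times -> inode */
        mark_inode_dirty(inode);
        return 0;
}
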
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
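
The ioctl.c hunk above is the standard .ioctl to .unlocked_ioctl migration:
the handler loses its inode argument (now recovered from the file), returns
long, and the big kernel lock, previously taken by the VFS around ->ioctl,
is pushed down into the handler, which is why every return path gains an
unlock_kernel(). In outline (do_something() is a placeholder, not a real
kernel function):

/* Outline of the BKL push-down; do_something() stands in for the
 * real command dispatch. */
long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
                            unsigned long arg)
{
        struct inode *inode = filp->f_path.dentry->d_inode;
        long ret;

        lock_kernel();          /* the VFS used to take this for us */
        ret = do_something(inode, cmd, arg);
        unlock_kernel();        /* every exit path must drop it */
        return ret;
}
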
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 74b473a8ef92..3b55c050c742 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -145,16 +145,18 @@ static int hfsplus_write_inode(struct inode *inode,
145 return ret; 145 return ret;
146} 146}
147 147
148static void hfsplus_clear_inode(struct inode *inode) 148static void hfsplus_evict_inode(struct inode *inode)
149{ 149{
150 dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino); 150 dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
151 truncate_inode_pages(&inode->i_data, 0);
152 end_writeback(inode);
151 if (HFSPLUS_IS_RSRC(inode)) { 153 if (HFSPLUS_IS_RSRC(inode)) {
152 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; 154 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
153 iput(HFSPLUS_I(inode).rsrc_inode); 155 iput(HFSPLUS_I(inode).rsrc_inode);
154 } 156 }
155} 157}
156 158
157static int hfsplus_sync_fs(struct super_block *sb, int wait) 159int hfsplus_sync_fs(struct super_block *sb, int wait)
158{ 160{
159 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 161 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
160 162
@@ -293,7 +295,7 @@ static const struct super_operations hfsplus_sops = {
293 .alloc_inode = hfsplus_alloc_inode, 295 .alloc_inode = hfsplus_alloc_inode,
294 .destroy_inode = hfsplus_destroy_inode, 296 .destroy_inode = hfsplus_destroy_inode,
295 .write_inode = hfsplus_write_inode, 297 .write_inode = hfsplus_write_inode,
296 .clear_inode = hfsplus_clear_inode, 298 .evict_inode = hfsplus_evict_inode,
297 .put_super = hfsplus_put_super, 299 .put_super = hfsplus_put_super,
298 .write_super = hfsplus_write_super, 300 .write_super = hfsplus_write_super,
299 .sync_fs = hfsplus_sync_fs, 301 .sync_fs = hfsplus_sync_fs,
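
The super.c hunk above is the canonical ->clear_inode to ->evict_inode
conversion, repeated below for hostfs, hpfs, hppfs and hugetlbfs: the new
hook runs as the inode is torn down, so the filesystem itself must flush the
page cache and call end_writeback() before its private cleanup, work the
VFS used to do around ->clear_inode. The minimal shape:

/* Minimal ->evict_inode: the two leading calls are now the
 * filesystem's responsibility rather than the VFS's. */
static void example_evict_inode(struct inode *inode)
{
        truncate_inode_pages(&inode->i_data, 0); /* drop the page cache */
        end_writeback(inode);   /* leaves the inode I_FREEING | I_CLEAR */
        /* fs-specific teardown that used to live in ->clear_inode() */
}
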
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 2f34f8f2134b..6bbd75c5589b 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -53,18 +53,28 @@ struct hostfs_iattr {
53 struct timespec ia_ctime; 53 struct timespec ia_ctime;
54}; 54};
55 55
56extern int stat_file(const char *path, unsigned long long *inode_out, 56struct hostfs_stat {
57 int *mode_out, int *nlink_out, int *uid_out, int *gid_out, 57 unsigned long long ino;
58 unsigned long long *size_out, struct timespec *atime_out, 58 unsigned int mode;
59 struct timespec *mtime_out, struct timespec *ctime_out, 59 unsigned int nlink;
60 int *blksize_out, unsigned long long *blocks_out, int fd); 60 unsigned int uid;
61 unsigned int gid;
62 unsigned long long size;
63 struct timespec atime, mtime, ctime;
64 unsigned int blksize;
65 unsigned long long blocks;
66 unsigned int maj;
67 unsigned int min;
68};
69
70extern int stat_file(const char *path, struct hostfs_stat *p, int fd);
61extern int access_file(char *path, int r, int w, int x); 71extern int access_file(char *path, int r, int w, int x);
62extern int open_file(char *path, int r, int w, int append); 72extern int open_file(char *path, int r, int w, int append);
63extern int file_type(const char *path, int *maj, int *min);
64extern void *open_dir(char *path, int *err_out); 73extern void *open_dir(char *path, int *err_out);
65extern char *read_dir(void *stream, unsigned long long *pos, 74extern char *read_dir(void *stream, unsigned long long *pos,
66 unsigned long long *ino_out, int *len_out); 75 unsigned long long *ino_out, int *len_out);
67extern void close_file(void *stream); 76extern void close_file(void *stream);
77extern int replace_file(int oldfd, int fd);
68extern void close_dir(void *stream); 78extern void close_dir(void *stream);
69extern int read_file(int fd, unsigned long long *offset, char *buf, int len); 79extern int read_file(int fd, unsigned long long *offset, char *buf, int len);
70extern int write_file(int fd, unsigned long long *offset, const char *buf, 80extern int write_file(int fd, unsigned long long *offset, const char *buf,
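
The header change above collapses stat_file()'s twelve output pointers,
most of which callers passed as NULL, into a single struct hostfs_stat,
which also carries the device numbers that previously required a separate
file_type() call. A hypothetical caller on the userspace side of hostfs,
just to show the narrowed interface:

#include <stdio.h>

/* Hypothetical caller; the field names match the struct above. */
static int print_host_size(const char *path)
{
        struct hostfs_stat st;
        int err = stat_file(path, &st, -1);     /* -1: stat by path */

        if (err)
                return err;
        printf("%s: %llu bytes, %u links\n", path, st.size, st.nlink);
        return 0;
}
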
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8f4cf1..f7dc9b5f9ef8 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -14,12 +14,12 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h>
17#include "hostfs.h" 18#include "hostfs.h"
18#include "init.h" 19#include "init.h"
19#include "kern.h" 20#include "kern.h"
20 21
21struct hostfs_inode_info { 22struct hostfs_inode_info {
22 char *host_filename;
23 int fd; 23 int fd;
24 fmode_t mode; 24 fmode_t mode;
25 struct inode vfs_inode; 25 struct inode vfs_inode;
@@ -49,7 +49,7 @@ static int append = 0;
49 49
50static const struct inode_operations hostfs_iops; 50static const struct inode_operations hostfs_iops;
51static const struct inode_operations hostfs_dir_iops; 51static const struct inode_operations hostfs_dir_iops;
52static const struct address_space_operations hostfs_link_aops; 52static const struct inode_operations hostfs_link_iops;
53 53
54#ifndef MODULE 54#ifndef MODULE
55static int __init hostfs_args(char *options, int *add) 55static int __init hostfs_args(char *options, int *add)
@@ -90,71 +90,58 @@ __uml_setup("hostfs=", hostfs_args,
90); 90);
91#endif 91#endif
92 92
93static char *dentry_name(struct dentry *dentry, int extra) 93static char *__dentry_name(struct dentry *dentry, char *name)
94{ 94{
95 struct dentry *parent; 95 char *p = __dentry_path(dentry, name, PATH_MAX);
96 char *root, *name; 96 char *root;
97 int len; 97 size_t len;
98
99 len = 0;
100 parent = dentry;
101 while (parent->d_parent != parent) {
102 len += parent->d_name.len + 1;
103 parent = parent->d_parent;
104 }
105 98
106 root = HOSTFS_I(parent->d_inode)->host_filename; 99 spin_unlock(&dcache_lock);
107 len += strlen(root);
108 name = kmalloc(len + extra + 1, GFP_KERNEL);
109 if (name == NULL)
110 return NULL;
111 100
112 name[len] = '\0'; 101 root = dentry->d_sb->s_fs_info;
113 parent = dentry; 102 len = strlen(root);
114 while (parent->d_parent != parent) { 103 if (IS_ERR(p)) {
115 len -= parent->d_name.len + 1; 104 __putname(name);
116 name[len] = '/'; 105 return NULL;
117 strncpy(&name[len + 1], parent->d_name.name, 106 }
118 parent->d_name.len); 107 strlcpy(name, root, PATH_MAX);
119 parent = parent->d_parent; 108 if (len > p - name) {
109 __putname(name);
110 return NULL;
111 }
112 if (p > name + len) {
113 char *s = name + len;
114 while ((*s++ = *p++) != '\0')
115 ;
120 } 116 }
121 strncpy(name, root, strlen(root));
122 return name; 117 return name;
123} 118}
124 119
125static char *inode_name(struct inode *ino, int extra) 120static char *dentry_name(struct dentry *dentry)
126{ 121{
127 struct dentry *dentry; 122 char *name = __getname();
123 if (!name)
124 return NULL;
128 125
129 dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); 126 spin_lock(&dcache_lock);
130 return dentry_name(dentry, extra); 127 return __dentry_name(dentry, name); /* will unlock */
131} 128}
132 129
133static int read_name(struct inode *ino, char *name) 130static char *inode_name(struct inode *ino)
134{ 131{
135 /* 132 struct dentry *dentry;
136 * The non-int inode fields are copied into ints by stat_file and 133 char *name = __getname();
137 * then copied into the inode because passing the actual pointers 134 if (!name)
138 * in and having them treated as int * breaks on big-endian machines 135 return NULL;
139 */
140 int err;
141 int i_mode, i_nlink, i_blksize;
142 unsigned long long i_size;
143 unsigned long long i_ino;
144 unsigned long long i_blocks;
145
146 err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid,
147 &ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime,
148 &ino->i_ctime, &i_blksize, &i_blocks, -1);
149 if (err)
150 return err;
151 136
152 ino->i_ino = i_ino; 137 spin_lock(&dcache_lock);
153 ino->i_mode = i_mode; 138 if (list_empty(&ino->i_dentry)) {
154 ino->i_nlink = i_nlink; 139 spin_unlock(&dcache_lock);
155 ino->i_size = i_size; 140 __putname(name);
156 ino->i_blocks = i_blocks; 141 return NULL;
157 return 0; 142 }
143 dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
144 return __dentry_name(dentry, name); /* will unlock */
158} 145}
159 146
160static char *follow_link(char *link) 147static char *follow_link(char *link)
@@ -205,53 +192,11 @@ static char *follow_link(char *link)
205 return ERR_PTR(n); 192 return ERR_PTR(n);
206} 193}
207 194
208static int hostfs_read_inode(struct inode *ino)
209{
210 char *name;
211 int err = 0;
212
213 /*
214 * Unfortunately, we are called from iget() when we don't have a dentry
215 * allocated yet.
216 */
217 if (list_empty(&ino->i_dentry))
218 goto out;
219
220 err = -ENOMEM;
221 name = inode_name(ino, 0);
222 if (name == NULL)
223 goto out;
224
225 if (file_type(name, NULL, NULL) == OS_TYPE_SYMLINK) {
226 name = follow_link(name);
227 if (IS_ERR(name)) {
228 err = PTR_ERR(name);
229 goto out;
230 }
231 }
232
233 err = read_name(ino, name);
234 kfree(name);
235 out:
236 return err;
237}
238
239static struct inode *hostfs_iget(struct super_block *sb) 195static struct inode *hostfs_iget(struct super_block *sb)
240{ 196{
241 struct inode *inode; 197 struct inode *inode = new_inode(sb);
242 long ret;
243
244 inode = iget_locked(sb, 0);
245 if (!inode) 198 if (!inode)
246 return ERR_PTR(-ENOMEM); 199 return ERR_PTR(-ENOMEM);
247 if (inode->i_state & I_NEW) {
248 ret = hostfs_read_inode(inode);
249 if (ret < 0) {
250 iget_failed(inode);
251 return ERR_PTR(ret);
252 }
253 unlock_new_inode(inode);
254 }
255 return inode; 200 return inode;
256} 201}
257 202
@@ -269,7 +214,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
269 long long f_files; 214 long long f_files;
270 long long f_ffree; 215 long long f_ffree;
271 216
272 err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename, 217 err = do_statfs(dentry->d_sb->s_fs_info,
273 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, 218 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
274 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), 219 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
275 &sf->f_namelen, sf->f_spare); 220 &sf->f_namelen, sf->f_spare);
@@ -288,47 +233,32 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
288{ 233{
289 struct hostfs_inode_info *hi; 234 struct hostfs_inode_info *hi;
290 235
291 hi = kmalloc(sizeof(*hi), GFP_KERNEL); 236 hi = kzalloc(sizeof(*hi), GFP_KERNEL);
292 if (hi == NULL) 237 if (hi == NULL)
293 return NULL; 238 return NULL;
294 239 hi->fd = -1;
295 *hi = ((struct hostfs_inode_info) { .host_filename = NULL,
296 .fd = -1,
297 .mode = 0 });
298 inode_init_once(&hi->vfs_inode); 240 inode_init_once(&hi->vfs_inode);
299 return &hi->vfs_inode; 241 return &hi->vfs_inode;
300} 242}
301 243
302static void hostfs_delete_inode(struct inode *inode) 244static void hostfs_evict_inode(struct inode *inode)
303{ 245{
304 truncate_inode_pages(&inode->i_data, 0); 246 truncate_inode_pages(&inode->i_data, 0);
247 end_writeback(inode);
305 if (HOSTFS_I(inode)->fd != -1) { 248 if (HOSTFS_I(inode)->fd != -1) {
306 close_file(&HOSTFS_I(inode)->fd); 249 close_file(&HOSTFS_I(inode)->fd);
307 HOSTFS_I(inode)->fd = -1; 250 HOSTFS_I(inode)->fd = -1;
308 } 251 }
309 clear_inode(inode);
310} 252}
311 253
312static void hostfs_destroy_inode(struct inode *inode) 254static void hostfs_destroy_inode(struct inode *inode)
313{ 255{
314 kfree(HOSTFS_I(inode)->host_filename);
315
316 /*
317 * XXX: This should not happen, probably. The check is here for
318 * additional safety.
319 */
320 if (HOSTFS_I(inode)->fd != -1) {
321 close_file(&HOSTFS_I(inode)->fd);
322 printk(KERN_DEBUG "Closing host fd in .destroy_inode\n");
323 }
324
325 kfree(HOSTFS_I(inode)); 256 kfree(HOSTFS_I(inode));
326} 257}
327 258
328static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 259static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
329{ 260{
330 struct inode *root = vfs->mnt_sb->s_root->d_inode; 261 const char *root_path = vfs->mnt_sb->s_fs_info;
331 const char *root_path = HOSTFS_I(root)->host_filename;
332 size_t offset = strlen(root_ino) + 1; 262 size_t offset = strlen(root_ino) + 1;
333 263
334 if (strlen(root_path) > offset) 264 if (strlen(root_path) > offset)
@@ -339,9 +269,8 @@ static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
339 269
340static const struct super_operations hostfs_sbops = { 270static const struct super_operations hostfs_sbops = {
341 .alloc_inode = hostfs_alloc_inode, 271 .alloc_inode = hostfs_alloc_inode,
342 .drop_inode = generic_delete_inode,
343 .delete_inode = hostfs_delete_inode,
344 .destroy_inode = hostfs_destroy_inode, 272 .destroy_inode = hostfs_destroy_inode,
273 .evict_inode = hostfs_evict_inode,
345 .statfs = hostfs_statfs, 274 .statfs = hostfs_statfs,
346 .show_options = hostfs_show_options, 275 .show_options = hostfs_show_options,
347}; 276};
@@ -353,11 +282,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
353 unsigned long long next, ino; 282 unsigned long long next, ino;
354 int error, len; 283 int error, len;
355 284
356 name = dentry_name(file->f_path.dentry, 0); 285 name = dentry_name(file->f_path.dentry);
357 if (name == NULL) 286 if (name == NULL)
358 return -ENOMEM; 287 return -ENOMEM;
359 dir = open_dir(name, &error); 288 dir = open_dir(name, &error);
360 kfree(name); 289 __putname(name);
361 if (dir == NULL) 290 if (dir == NULL)
362 return -error; 291 return -error;
363 next = file->f_pos; 292 next = file->f_pos;
@@ -373,47 +302,66 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
373 302
374int hostfs_file_open(struct inode *ino, struct file *file) 303int hostfs_file_open(struct inode *ino, struct file *file)
375{ 304{
305 static DEFINE_MUTEX(open_mutex);
376 char *name; 306 char *name;
377 fmode_t mode = 0; 307 fmode_t mode = 0;
308 int err;
378 int r = 0, w = 0, fd; 309 int r = 0, w = 0, fd;
379 310
380 mode = file->f_mode & (FMODE_READ | FMODE_WRITE); 311 mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
381 if ((mode & HOSTFS_I(ino)->mode) == mode) 312 if ((mode & HOSTFS_I(ino)->mode) == mode)
382 return 0; 313 return 0;
383 314
384 /* 315 mode |= HOSTFS_I(ino)->mode;
385 * The file may already have been opened, but with the wrong access,
386 * so this resets things and reopens the file with the new access.
387 */
388 if (HOSTFS_I(ino)->fd != -1) {
389 close_file(&HOSTFS_I(ino)->fd);
390 HOSTFS_I(ino)->fd = -1;
391 }
392 316
393 HOSTFS_I(ino)->mode |= mode; 317retry:
394 if (HOSTFS_I(ino)->mode & FMODE_READ) 318 if (mode & FMODE_READ)
395 r = 1; 319 r = 1;
396 if (HOSTFS_I(ino)->mode & FMODE_WRITE) 320 if (mode & FMODE_WRITE)
397 w = 1; 321 w = 1;
398 if (w) 322 if (w)
399 r = 1; 323 r = 1;
400 324
401 name = dentry_name(file->f_path.dentry, 0); 325 name = dentry_name(file->f_path.dentry);
402 if (name == NULL) 326 if (name == NULL)
403 return -ENOMEM; 327 return -ENOMEM;
404 328
405 fd = open_file(name, r, w, append); 329 fd = open_file(name, r, w, append);
406 kfree(name); 330 __putname(name);
407 if (fd < 0) 331 if (fd < 0)
408 return fd; 332 return fd;
409 FILE_HOSTFS_I(file)->fd = fd; 333
334 mutex_lock(&open_mutex);
335 /* somebody else had handled it first? */
336 if ((mode & HOSTFS_I(ino)->mode) == mode) {
337 mutex_unlock(&open_mutex);
338 return 0;
339 }
340 if ((mode | HOSTFS_I(ino)->mode) != mode) {
341 mode |= HOSTFS_I(ino)->mode;
342 mutex_unlock(&open_mutex);
343 close_file(&fd);
344 goto retry;
345 }
346 if (HOSTFS_I(ino)->fd == -1) {
347 HOSTFS_I(ino)->fd = fd;
348 } else {
349 err = replace_file(fd, HOSTFS_I(ino)->fd);
350 close_file(&fd);
351 if (err < 0) {
352 mutex_unlock(&open_mutex);
353 return err;
354 }
355 }
356 HOSTFS_I(ino)->mode = mode;
357 mutex_unlock(&open_mutex);
410 358
411 return 0; 359 return 0;
412} 360}
413 361
414int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) 362int hostfs_fsync(struct file *file, int datasync)
415{ 363{
416 return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); 364 return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
417} 365}
418 366
419static const struct file_operations hostfs_file_fops = { 367static const struct file_operations hostfs_file_fops = {
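
The hostfs_file_open() rewrite above handles a file first opened read-only
and later opened for writing: rather than closing and reopening the shared
host fd (which would yank it out from under concurrent readers), it opens a
second fd with the widened mode and splices it in with replace_file(),
retrying under open_mutex if another opener widened the mode first.
replace_file() is simply dup2() on the host side, and dup2()'s atomic
replacement is the whole point; a userspace illustration:

#include <unistd.h>

/* dup2() atomically makes `fd` refer to the same open file
 * description as `wider_fd`, so code already holding `fd` switches
 * to the wider-mode descriptor without ever seeing it closed. */
static int replace_fd(int wider_fd, int fd)
{
        return dup2(wider_fd, fd);
}
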
@@ -544,54 +492,50 @@ static const struct address_space_operations hostfs_aops = {
544 .write_end = hostfs_write_end, 492 .write_end = hostfs_write_end,
545}; 493};
546 494
547static int init_inode(struct inode *inode, struct dentry *dentry) 495static int read_name(struct inode *ino, char *name)
548{ 496{
549 char *name; 497 dev_t rdev;
550 int type, err = -ENOMEM; 498 struct hostfs_stat st;
551 int maj, min; 499 int err = stat_file(name, &st, -1);
552 dev_t rdev = 0; 500 if (err)
501 return err;
553 502
554 if (dentry) { 503 /* Reencode maj and min with the kernel encoding.*/
555 name = dentry_name(dentry, 0); 504 rdev = MKDEV(st.maj, st.min);
556 if (name == NULL)
557 goto out;
558 type = file_type(name, &maj, &min);
559 /* Reencode maj and min with the kernel encoding.*/
560 rdev = MKDEV(maj, min);
561 kfree(name);
562 }
563 else type = OS_TYPE_DIR;
564 505
565 err = 0; 506 switch (st.mode & S_IFMT) {
566 if (type == OS_TYPE_SYMLINK) 507 case S_IFLNK:
567 inode->i_op = &page_symlink_inode_operations; 508 ino->i_op = &hostfs_link_iops;
568 else if (type == OS_TYPE_DIR)
569 inode->i_op = &hostfs_dir_iops;
570 else inode->i_op = &hostfs_iops;
571
572 if (type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops;
573 else inode->i_fop = &hostfs_file_fops;
574
575 if (type == OS_TYPE_SYMLINK)
576 inode->i_mapping->a_ops = &hostfs_link_aops;
577 else inode->i_mapping->a_ops = &hostfs_aops;
578
579 switch (type) {
580 case OS_TYPE_CHARDEV:
581 init_special_inode(inode, S_IFCHR, rdev);
582 break; 509 break;
583 case OS_TYPE_BLOCKDEV: 510 case S_IFDIR:
584 init_special_inode(inode, S_IFBLK, rdev); 511 ino->i_op = &hostfs_dir_iops;
512 ino->i_fop = &hostfs_dir_fops;
585 break; 513 break;
586 case OS_TYPE_FIFO: 514 case S_IFCHR:
587 init_special_inode(inode, S_IFIFO, 0); 515 case S_IFBLK:
516 case S_IFIFO:
517 case S_IFSOCK:
518 init_special_inode(ino, st.mode & S_IFMT, rdev);
519 ino->i_op = &hostfs_iops;
588 break; 520 break;
589 case OS_TYPE_SOCK: 521
590 init_special_inode(inode, S_IFSOCK, 0); 522 default:
591 break; 523 ino->i_op = &hostfs_iops;
592 } 524 ino->i_fop = &hostfs_file_fops;
593 out: 525 ino->i_mapping->a_ops = &hostfs_aops;
594 return err; 526 }
527
528 ino->i_ino = st.ino;
529 ino->i_mode = st.mode;
530 ino->i_nlink = st.nlink;
531 ino->i_uid = st.uid;
532 ino->i_gid = st.gid;
533 ino->i_atime = st.atime;
534 ino->i_mtime = st.mtime;
535 ino->i_ctime = st.ctime;
536 ino->i_size = st.size;
537 ino->i_blocks = st.blocks;
538 return 0;
595} 539}
596 540
597int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, 541int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
@@ -607,12 +551,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
607 goto out; 551 goto out;
608 } 552 }
609 553
610 error = init_inode(inode, dentry);
611 if (error)
612 goto out_put;
613
614 error = -ENOMEM; 554 error = -ENOMEM;
615 name = dentry_name(dentry, 0); 555 name = dentry_name(dentry);
616 if (name == NULL) 556 if (name == NULL)
617 goto out_put; 557 goto out_put;
618 558
@@ -622,9 +562,10 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
622 mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); 562 mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH);
623 if (fd < 0) 563 if (fd < 0)
624 error = fd; 564 error = fd;
625 else error = read_name(inode, name); 565 else
566 error = read_name(inode, name);
626 567
627 kfree(name); 568 __putname(name);
628 if (error) 569 if (error)
629 goto out_put; 570 goto out_put;
630 571
@@ -652,17 +593,14 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
652 goto out; 593 goto out;
653 } 594 }
654 595
655 err = init_inode(inode, dentry);
656 if (err)
657 goto out_put;
658
659 err = -ENOMEM; 596 err = -ENOMEM;
660 name = dentry_name(dentry, 0); 597 name = dentry_name(dentry);
661 if (name == NULL) 598 if (name == NULL)
662 goto out_put; 599 goto out_put;
663 600
664 err = read_name(inode, name); 601 err = read_name(inode, name);
665 kfree(name); 602
603 __putname(name);
666 if (err == -ENOENT) { 604 if (err == -ENOENT) {
667 iput(inode); 605 iput(inode);
668 inode = NULL; 606 inode = NULL;
@@ -680,36 +618,21 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
680 return ERR_PTR(err); 618 return ERR_PTR(err);
681} 619}
682 620
683static char *inode_dentry_name(struct inode *ino, struct dentry *dentry)
684{
685 char *file;
686 int len;
687
688 file = inode_name(ino, dentry->d_name.len + 1);
689 if (file == NULL)
690 return NULL;
691 strcat(file, "/");
692 len = strlen(file);
693 strncat(file, dentry->d_name.name, dentry->d_name.len);
694 file[len + dentry->d_name.len] = '\0';
695 return file;
696}
697
698int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) 621int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
699{ 622{
700 char *from_name, *to_name; 623 char *from_name, *to_name;
701 int err; 624 int err;
702 625
703 if ((from_name = inode_dentry_name(ino, from)) == NULL) 626 if ((from_name = dentry_name(from)) == NULL)
704 return -ENOMEM; 627 return -ENOMEM;
705 to_name = dentry_name(to, 0); 628 to_name = dentry_name(to);
706 if (to_name == NULL) { 629 if (to_name == NULL) {
707 kfree(from_name); 630 __putname(from_name);
708 return -ENOMEM; 631 return -ENOMEM;
709 } 632 }
710 err = link_file(to_name, from_name); 633 err = link_file(to_name, from_name);
711 kfree(from_name); 634 __putname(from_name);
712 kfree(to_name); 635 __putname(to_name);
713 return err; 636 return err;
714} 637}
715 638
@@ -718,13 +641,14 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry)
718 char *file; 641 char *file;
719 int err; 642 int err;
720 643
721 if ((file = inode_dentry_name(ino, dentry)) == NULL)
722 return -ENOMEM;
723 if (append) 644 if (append)
724 return -EPERM; 645 return -EPERM;
725 646
647 if ((file = dentry_name(dentry)) == NULL)
648 return -ENOMEM;
649
726 err = unlink_file(file); 650 err = unlink_file(file);
727 kfree(file); 651 __putname(file);
728 return err; 652 return err;
729} 653}
730 654
@@ -733,10 +657,10 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
733 char *file; 657 char *file;
734 int err; 658 int err;
735 659
736 if ((file = inode_dentry_name(ino, dentry)) == NULL) 660 if ((file = dentry_name(dentry)) == NULL)
737 return -ENOMEM; 661 return -ENOMEM;
738 err = make_symlink(file, to); 662 err = make_symlink(file, to);
739 kfree(file); 663 __putname(file);
740 return err; 664 return err;
741} 665}
742 666
@@ -745,10 +669,10 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
745 char *file; 669 char *file;
746 int err; 670 int err;
747 671
748 if ((file = inode_dentry_name(ino, dentry)) == NULL) 672 if ((file = dentry_name(dentry)) == NULL)
749 return -ENOMEM; 673 return -ENOMEM;
750 err = do_mkdir(file, mode); 674 err = do_mkdir(file, mode);
751 kfree(file); 675 __putname(file);
752 return err; 676 return err;
753} 677}
754 678
@@ -757,10 +681,10 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
757 char *file; 681 char *file;
758 int err; 682 int err;
759 683
760 if ((file = inode_dentry_name(ino, dentry)) == NULL) 684 if ((file = dentry_name(dentry)) == NULL)
761 return -ENOMEM; 685 return -ENOMEM;
762 err = do_rmdir(file); 686 err = do_rmdir(file);
763 kfree(file); 687 __putname(file);
764 return err; 688 return err;
765} 689}
766 690
@@ -776,22 +700,20 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
776 goto out; 700 goto out;
777 } 701 }
778 702
779 err = init_inode(inode, dentry);
780 if (err)
781 goto out_put;
782
783 err = -ENOMEM; 703 err = -ENOMEM;
784 name = dentry_name(dentry, 0); 704 name = dentry_name(dentry);
785 if (name == NULL) 705 if (name == NULL)
786 goto out_put; 706 goto out_put;
787 707
788 init_special_inode(inode, mode, dev); 708 init_special_inode(inode, mode, dev);
789 err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); 709 err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
790 if (err) 710 if (err)
791 goto out_free; 711 goto out_free;
792 712
793 err = read_name(inode, name); 713 err = read_name(inode, name);
794 kfree(name); 714 __putname(name);
795 if (err) 717 if (err)
796 goto out_put; 718 goto out_put;
797 719
@@ -799,7 +721,7 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
799 return 0; 721 return 0;
800 722
801 out_free: 723 out_free:
802 kfree(name); 724 __putname(name);
803 out_put: 725 out_put:
804 iput(inode); 726 iput(inode);
805 out: 727 out:
@@ -812,15 +734,15 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
812 char *from_name, *to_name; 734 char *from_name, *to_name;
813 int err; 735 int err;
814 736
815 if ((from_name = inode_dentry_name(from_ino, from)) == NULL) 737 if ((from_name = dentry_name(from)) == NULL)
816 return -ENOMEM; 738 return -ENOMEM;
817 if ((to_name = inode_dentry_name(to_ino, to)) == NULL) { 739 if ((to_name = dentry_name(to)) == NULL) {
818 kfree(from_name); 740 __putname(from_name);
819 return -ENOMEM; 741 return -ENOMEM;
820 } 742 }
821 err = rename_file(from_name, to_name); 743 err = rename_file(from_name, to_name);
822 kfree(from_name); 744 __putname(from_name);
823 kfree(to_name); 745 __putname(to_name);
824 return err; 746 return err;
825} 747}
826 748
@@ -832,7 +754,7 @@ int hostfs_permission(struct inode *ino, int desired)
832 if (desired & MAY_READ) r = 1; 754 if (desired & MAY_READ) r = 1;
833 if (desired & MAY_WRITE) w = 1; 755 if (desired & MAY_WRITE) w = 1;
834 if (desired & MAY_EXEC) x = 1; 756 if (desired & MAY_EXEC) x = 1;
835 name = inode_name(ino, 0); 757 name = inode_name(ino);
836 if (name == NULL) 758 if (name == NULL)
837 return -ENOMEM; 759 return -ENOMEM;
838 760
@@ -841,7 +763,7 @@ int hostfs_permission(struct inode *ino, int desired)
841 err = 0; 763 err = 0;
842 else 764 else
843 err = access_file(name, r, w, x); 765 err = access_file(name, r, w, x);
844 kfree(name); 766 __putname(name);
845 if (!err) 767 if (!err)
846 err = generic_permission(ino, desired, NULL); 768 err = generic_permission(ino, desired, NULL);
847 return err; 769 return err;
@@ -849,13 +771,14 @@ int hostfs_permission(struct inode *ino, int desired)
849 771
850int hostfs_setattr(struct dentry *dentry, struct iattr *attr) 772int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
851{ 773{
774 struct inode *inode = dentry->d_inode;
852 struct hostfs_iattr attrs; 775 struct hostfs_iattr attrs;
853 char *name; 776 char *name;
854 int err; 777 int err;
855 778
856 int fd = HOSTFS_I(dentry->d_inode)->fd; 779 int fd = HOSTFS_I(inode)->fd;
857 780
858 err = inode_change_ok(dentry->d_inode, attr); 781 err = inode_change_ok(inode, attr);
859 if (err) 782 if (err)
860 return err; 783 return err;
861 784
@@ -897,15 +820,26 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
897 if (attr->ia_valid & ATTR_MTIME_SET) { 820 if (attr->ia_valid & ATTR_MTIME_SET) {
898 attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; 821 attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
899 } 822 }
900 name = dentry_name(dentry, 0); 823 name = dentry_name(dentry);
901 if (name == NULL) 824 if (name == NULL)
902 return -ENOMEM; 825 return -ENOMEM;
903 err = set_attr(name, &attrs, fd); 826 err = set_attr(name, &attrs, fd);
904 kfree(name); 827 __putname(name);
905 if (err) 828 if (err)
906 return err; 829 return err;
907 830
908 return inode_setattr(dentry->d_inode, attr); 831 if ((attr->ia_valid & ATTR_SIZE) &&
832 attr->ia_size != i_size_read(inode)) {
833 int error;
834
835 error = vmtruncate(inode, attr->ia_size);
836 if (error)
837 return error;
838 }
839
840 setattr_copy(inode, attr);
841 mark_inode_dirty(inode);
842 return 0;
909} 843}
910 844
911static const struct inode_operations hostfs_iops = { 845static const struct inode_operations hostfs_iops = {
@@ -935,32 +869,41 @@ static const struct inode_operations hostfs_dir_iops = {
935 .setattr = hostfs_setattr, 869 .setattr = hostfs_setattr,
936}; 870};
937 871
938int hostfs_link_readpage(struct file *file, struct page *page) 872static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
939{ 873{
940 char *buffer, *name; 874 char *link = __getname();
941 int err; 875 if (link) {
942 876 char *path = dentry_name(dentry);
943 buffer = kmap(page); 877 int err = -ENOMEM;
944 name = inode_name(page->mapping->host, 0); 878 if (path) {
945 if (name == NULL) 879 err = hostfs_do_readlink(path, link, PATH_MAX);
946 return -ENOMEM; 880 if (err == PATH_MAX)
947 err = hostfs_do_readlink(name, buffer, PAGE_CACHE_SIZE); 881 err = -E2BIG;
948 kfree(name); 882 __putname(path);
949 if (err == PAGE_CACHE_SIZE) 883 }
950 err = -E2BIG; 884 if (err < 0) {
951 else if (err > 0) { 885 __putname(link);
952 flush_dcache_page(page); 886 link = ERR_PTR(err);
953 SetPageUptodate(page); 887 }
954 if (PageError(page)) ClearPageError(page); 888 } else {
955 err = 0; 889 link = ERR_PTR(-ENOMEM);
956 } 890 }
957 kunmap(page); 891
958 unlock_page(page); 892 nd_set_link(nd, link);
959 return err; 893 return NULL;
960} 894}
961 895
962static const struct address_space_operations hostfs_link_aops = { 896static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
963 .readpage = hostfs_link_readpage, 897{
898 char *s = nd_get_link(nd);
899 if (!IS_ERR(s))
900 __putname(s);
901}
902
903static const struct inode_operations hostfs_link_iops = {
904 .readlink = generic_readlink,
905 .follow_link = hostfs_follow_link,
906 .put_link = hostfs_put_link,
964}; 907};
965 908
966static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) 909static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
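
The hunk above retires hostfs's page-cache-based readlink in favour of the
->follow_link/->put_link pair: follow_link allocates a PATH_MAX buffer,
fills it (or turns it into an ERR_PTR), and publishes it to the VFS with
nd_set_link(); put_link then frees whatever follow_link left behind. The
contract reduced to its skeleton:

/* Skeleton of the follow_link/put_link contract used above. */
static void *example_follow_link(struct dentry *dentry,
                                 struct nameidata *nd)
{
        char *link = __getname();

        if (!link)
                link = ERR_PTR(-ENOMEM);
        /* ... otherwise fill link, or replace it with ERR_PTR(err) ... */
        nd_set_link(nd, link);  /* the VFS walks this string next */
        return NULL;            /* cookie handed back to put_link */
}

static void example_put_link(struct dentry *dentry, struct nameidata *nd,
                             void *cookie)
{
        char *s = nd_get_link(nd);

        if (!IS_ERR(s))
                __putname(s);   /* release what follow_link allocated */
}
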
@@ -980,49 +923,41 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
980 req_root = ""; 923 req_root = "";
981 924
982 err = -ENOMEM; 925 err = -ENOMEM;
983 host_root_path = kmalloc(strlen(root_ino) + 1 926 sb->s_fs_info = host_root_path =
984 + strlen(req_root) + 1, GFP_KERNEL); 927 kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
985 if (host_root_path == NULL) 928 if (host_root_path == NULL)
986 goto out; 929 goto out;
987 930
988 sprintf(host_root_path, "%s/%s", root_ino, req_root); 931 sprintf(host_root_path, "%s/%s", root_ino, req_root);
989 932
990 root_inode = hostfs_iget(sb); 933 root_inode = new_inode(sb);
991 if (IS_ERR(root_inode)) { 934 if (!root_inode)
992 err = PTR_ERR(root_inode); 935 goto out;
993 goto out_free;
994 }
995 936
996 err = init_inode(root_inode, NULL); 937 err = read_name(root_inode, host_root_path);
997 if (err) 938 if (err)
998 goto out_put; 939 goto out_put;
999 940
1000 HOSTFS_I(root_inode)->host_filename = host_root_path; 941 if (S_ISLNK(root_inode->i_mode)) {
1001 /* 942 char *name = follow_link(host_root_path);
1002 * Avoid that in the error path, iput(root_inode) frees again 943 if (IS_ERR(name))
1003 * host_root_path through hostfs_destroy_inode! 944 err = PTR_ERR(name);
1004 */ 945 else
1005 host_root_path = NULL; 946 err = read_name(root_inode, name);
947 kfree(name);
948 if (err)
949 goto out_put;
950 }
1006 951
1007 err = -ENOMEM; 952 err = -ENOMEM;
1008 sb->s_root = d_alloc_root(root_inode); 953 sb->s_root = d_alloc_root(root_inode);
1009 if (sb->s_root == NULL) 954 if (sb->s_root == NULL)
1010 goto out_put; 955 goto out_put;
1011 956
1012 err = hostfs_read_inode(root_inode);
1013 if (err) {
1014 /* No iput in this case because the dput does that for us */
1015 dput(sb->s_root);
1016 sb->s_root = NULL;
1017 goto out;
1018 }
1019
1020 return 0; 957 return 0;
1021 958
1022out_put: 959out_put:
1023 iput(root_inode); 960 iput(root_inode);
1024out_free:
1025 kfree(host_root_path);
1026out: 961out:
1027 return err; 962 return err;
1028} 963}
@@ -1034,11 +969,17 @@ static int hostfs_read_sb(struct file_system_type *type,
1034 return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); 969 return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
1035} 970}
1036 971
972static void hostfs_kill_sb(struct super_block *s)
973{
974 kill_anon_super(s);
975 kfree(s->s_fs_info);
976}
977
1037static struct file_system_type hostfs_type = { 978static struct file_system_type hostfs_type = {
1038 .owner = THIS_MODULE, 979 .owner = THIS_MODULE,
1039 .name = "hostfs", 980 .name = "hostfs",
1040 .get_sb = hostfs_read_sb, 981 .get_sb = hostfs_read_sb,
1041 .kill_sb = kill_anon_super, 982 .kill_sb = hostfs_kill_sb,
1042 .fs_flags = 0, 983 .fs_flags = 0,
1043}; 984};
1044 985
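
With the host root path now hung off sb->s_fs_info instead of the root
inode, nothing in the generic teardown frees it, so the final hunk above
adds a ->kill_sb that releases it only after kill_anon_super() has run;
freeing it earlier would leave ->show_options and statfs reading freed
memory during unmount. The pattern for any per-superblock allocation:

/* Per-sb private data must outlive the generic teardown: free it
 * after kill_anon_super() has disposed of everything that might
 * still reference it. */
static void example_kill_sb(struct super_block *s)
{
        kill_anon_super(s);
        kfree(s->s_fs_info);
}
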
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index b79424f93282..6777aa06ce2c 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -19,11 +19,27 @@
19#include "user.h" 19#include "user.h"
20#include <utime.h> 20#include <utime.h>
21 21
22int stat_file(const char *path, unsigned long long *inode_out, int *mode_out, 22static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
23 int *nlink_out, int *uid_out, int *gid_out, 23{
24 unsigned long long *size_out, struct timespec *atime_out, 24 p->ino = buf->st_ino;
25 struct timespec *mtime_out, struct timespec *ctime_out, 25 p->mode = buf->st_mode;
26 int *blksize_out, unsigned long long *blocks_out, int fd) 26 p->nlink = buf->st_nlink;
27 p->uid = buf->st_uid;
28 p->gid = buf->st_gid;
29 p->size = buf->st_size;
30 p->atime.tv_sec = buf->st_atime;
31 p->atime.tv_nsec = 0;
32 p->ctime.tv_sec = buf->st_ctime;
33 p->ctime.tv_nsec = 0;
34 p->mtime.tv_sec = buf->st_mtime;
35 p->mtime.tv_nsec = 0;
36 p->blksize = buf->st_blksize;
37 p->blocks = buf->st_blocks;
38 p->maj = os_major(buf->st_rdev);
39 p->min = os_minor(buf->st_rdev);
40}
41
42int stat_file(const char *path, struct hostfs_stat *p, int fd)
27{ 43{
28 struct stat64 buf; 44 struct stat64 buf;
29 45
@@ -33,68 +49,10 @@ int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
33 } else if (lstat64(path, &buf) < 0) { 49 } else if (lstat64(path, &buf) < 0) {
34 return -errno; 50 return -errno;
35 } 51 }
36 52 stat64_to_hostfs(&buf, p);
37 if (inode_out != NULL)
38 *inode_out = buf.st_ino;
39 if (mode_out != NULL)
40 *mode_out = buf.st_mode;
41 if (nlink_out != NULL)
42 *nlink_out = buf.st_nlink;
43 if (uid_out != NULL)
44 *uid_out = buf.st_uid;
45 if (gid_out != NULL)
46 *gid_out = buf.st_gid;
47 if (size_out != NULL)
48 *size_out = buf.st_size;
49 if (atime_out != NULL) {
50 atime_out->tv_sec = buf.st_atime;
51 atime_out->tv_nsec = 0;
52 }
53 if (mtime_out != NULL) {
54 mtime_out->tv_sec = buf.st_mtime;
55 mtime_out->tv_nsec = 0;
56 }
57 if (ctime_out != NULL) {
58 ctime_out->tv_sec = buf.st_ctime;
59 ctime_out->tv_nsec = 0;
60 }
61 if (blksize_out != NULL)
62 *blksize_out = buf.st_blksize;
63 if (blocks_out != NULL)
64 *blocks_out = buf.st_blocks;
65 return 0; 53 return 0;
66} 54}
67 55
68int file_type(const char *path, int *maj, int *min)
69{
70 struct stat64 buf;
71
72 if (lstat64(path, &buf) < 0)
73 return -errno;
74 /*
75 * We cannot pass rdev as is because glibc and the kernel disagree
76 * about its definition.
77 */
78 if (maj != NULL)
79 *maj = major(buf.st_rdev);
80 if (min != NULL)
81 *min = minor(buf.st_rdev);
82
83 if (S_ISDIR(buf.st_mode))
84 return OS_TYPE_DIR;
85 else if (S_ISLNK(buf.st_mode))
86 return OS_TYPE_SYMLINK;
87 else if (S_ISCHR(buf.st_mode))
88 return OS_TYPE_CHARDEV;
89 else if (S_ISBLK(buf.st_mode))
90 return OS_TYPE_BLOCKDEV;
91 else if (S_ISFIFO(buf.st_mode))
92 return OS_TYPE_FIFO;
93 else if (S_ISSOCK(buf.st_mode))
94 return OS_TYPE_SOCK;
95 else return OS_TYPE_FILE;
96}
97
98int access_file(char *path, int r, int w, int x) 56int access_file(char *path, int r, int w, int x)
99{ 57{
100 int mode = 0; 58 int mode = 0;
@@ -202,6 +160,11 @@ int fsync_file(int fd, int datasync)
202 return 0; 160 return 0;
203} 161}
204 162
163int replace_file(int oldfd, int fd)
164{
165 return dup2(oldfd, fd);
166}
167
205void close_file(void *stream) 168void close_file(void *stream)
206{ 169{
207 close(*((int *) stream)); 170 close(*((int *) stream));
@@ -235,8 +198,8 @@ int file_create(char *name, int ur, int uw, int ux, int gr,
235 198
236int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) 199int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
237{ 200{
201 struct hostfs_stat st;
238 struct timeval times[2]; 202 struct timeval times[2];
239 struct timespec atime_ts, mtime_ts;
240 int err, ma; 203 int err, ma;
241 204
242 if (attrs->ia_valid & HOSTFS_ATTR_MODE) { 205 if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
@@ -279,15 +242,14 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
279 */ 242 */
280 ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET); 243 ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET);
281 if (attrs->ia_valid & ma) { 244 if (attrs->ia_valid & ma) {
282 err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, 245 err = stat_file(file, &st, fd);
283 &atime_ts, &mtime_ts, NULL, NULL, NULL, fd);
284 if (err != 0) 246 if (err != 0)
285 return err; 247 return err;
286 248
287 times[0].tv_sec = atime_ts.tv_sec; 249 times[0].tv_sec = st.atime.tv_sec;
288 times[0].tv_usec = atime_ts.tv_nsec / 1000; 250 times[0].tv_usec = st.atime.tv_nsec / 1000;
289 times[1].tv_sec = mtime_ts.tv_sec; 251 times[1].tv_sec = st.mtime.tv_sec;
290 times[1].tv_usec = mtime_ts.tv_nsec / 1000; 252 times[1].tv_usec = st.mtime.tv_nsec / 1000;
291 253
292 if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) { 254 if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) {
293 times[0].tv_sec = attrs->ia_atime.tv_sec; 255 times[0].tv_sec = attrs->ia_atime.tv_sec;
@@ -308,9 +270,9 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
308 270
309 /* Note: ctime is not handled */ 271 /* Note: ctime is not handled */
310 if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) { 272 if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) {
311 err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, 273 err = stat_file(file, &st, fd);
312 &attrs->ia_atime, &attrs->ia_mtime, NULL, 274 attrs->ia_atime = st.atime;
313 NULL, NULL, fd); 275 attrs->ia_mtime = st.mtime;
314 if (err != 0) 276 if (err != 0)
315 return err; 277 return err;
316 } 278 }
@@ -361,7 +323,7 @@ int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor)
361{ 323{
362 int err; 324 int err;
363 325
364 err = mknod(file, mode, makedev(major, minor)); 326 err = mknod(file, mode, os_makedev(major, minor));
365 if (err) 327 if (err)
366 return -errno; 328 return -errno;
367 return 0; 329 return 0;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..c0340887c7ea 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
19 return 0; 19 return 0;
20} 20}
21 21
22int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 22int hpfs_file_fsync(struct file *file, int datasync)
23{ 23{
24 /*return file_fsync(file, dentry);*/ 24 /*return file_fsync(file, datasync);*/
25 return 0; /* Don't fsync :-) */ 25 return 0; /* Don't fsync :-) */
26} 26}
27 27
@@ -97,10 +97,19 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
97 loff_t pos, unsigned len, unsigned flags, 97 loff_t pos, unsigned len, unsigned flags,
98 struct page **pagep, void **fsdata) 98 struct page **pagep, void **fsdata)
99{ 99{
100 int ret;
101
100 *pagep = NULL; 102 *pagep = NULL;
101 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 103 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
102 hpfs_get_block, 104 hpfs_get_block,
103 &hpfs_i(mapping->host)->mmu_private); 105 &hpfs_i(mapping->host)->mmu_private);
106 if (unlikely(ret)) {
107 loff_t isize = mapping->host->i_size;
108 if (pos + len > isize)
109 vmtruncate(mapping->host, isize);
110 }
111
112 return ret;
104} 113}
105 114
106static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) 115static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..b59eac0232a0 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
268 268
269/* file.c */ 269/* file.c */
270 270
271int hpfs_file_fsync(struct file *, struct dentry *, int); 271int hpfs_file_fsync(struct file *, int);
272extern const struct file_operations hpfs_file_ops; 272extern const struct file_operations hpfs_file_ops;
273extern const struct inode_operations hpfs_file_iops; 273extern const struct inode_operations hpfs_file_iops;
274extern const struct address_space_operations hpfs_aops; 274extern const struct address_space_operations hpfs_aops;
@@ -281,7 +281,7 @@ void hpfs_write_inode(struct inode *);
281void hpfs_write_inode_nolock(struct inode *); 281void hpfs_write_inode_nolock(struct inode *);
282int hpfs_setattr(struct dentry *, struct iattr *); 282int hpfs_setattr(struct dentry *, struct iattr *);
283void hpfs_write_if_changed(struct inode *); 283void hpfs_write_if_changed(struct inode *);
284void hpfs_delete_inode(struct inode *); 284void hpfs_evict_inode(struct inode *);
285 285
286/* map.c */ 286/* map.c */
287 287
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1042a9bc97f3..56f0da1cfd10 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,15 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
277 if (error) 277 if (error)
278 goto out_unlock; 278 goto out_unlock;
279 279
280 error = inode_setattr(inode, attr); 280 if ((attr->ia_valid & ATTR_SIZE) &&
281 if (error) 281 attr->ia_size != i_size_read(inode)) {
282 goto out_unlock; 282 error = vmtruncate(inode, attr->ia_size);
283 if (error)
284 goto out_unlock;
285 }
286
287 setattr_copy(inode, attr);
288 mark_inode_dirty(inode);
283 289
284 hpfs_write_inode(inode); 290 hpfs_write_inode(inode);
285 291
@@ -296,11 +302,13 @@ void hpfs_write_if_changed(struct inode *inode)
296 hpfs_write_inode(inode); 302 hpfs_write_inode(inode);
297} 303}
298 304
299void hpfs_delete_inode(struct inode *inode) 305void hpfs_evict_inode(struct inode *inode)
300{ 306{
301 truncate_inode_pages(&inode->i_data, 0); 307 truncate_inode_pages(&inode->i_data, 0);
302 lock_kernel(); 308 end_writeback(inode);
303 hpfs_remove_fnode(inode->i_sb, inode->i_ino); 309 if (!inode->i_nlink) {
304 unlock_kernel(); 310 lock_kernel();
305 clear_inode(inode); 311 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
312 unlock_kernel();
313 }
306} 314}
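
hpfs_evict_inode() above shows the second half of the conversion:
->evict_inode also absorbs ->delete_inode, so it now runs for every inode
being dropped, not only unlinked ones, and the filesystem must check
i_nlink itself before destroying on-disk state:

/* ->evict_inode runs on every final iput(); only free on-disk
 * metadata once the last link is actually gone. */
static void example_evict_and_delete(struct inode *inode)
{
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
        if (!inode->i_nlink) {
                /* unlinked: release the on-disk inode here */
        }
}
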
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index aa53842c599c..2607010be2fe 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -450,7 +450,7 @@ static const struct super_operations hpfs_sops =
450{ 450{
451 .alloc_inode = hpfs_alloc_inode, 451 .alloc_inode = hpfs_alloc_inode,
452 .destroy_inode = hpfs_destroy_inode, 452 .destroy_inode = hpfs_destroy_inode,
453 .delete_inode = hpfs_delete_inode, 453 .evict_inode = hpfs_evict_inode,
454 .put_super = hpfs_put_super, 454 .put_super = hpfs_put_super,
455 .statfs = hpfs_statfs, 455 .statfs = hpfs_statfs,
456 .remount_fs = hpfs_remount_fs, 456 .remount_fs = hpfs_remount_fs,
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..7b027720d820 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/statfs.h> 16#include <linux/statfs.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/pid_namespace.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include "os.h" 20#include "os.h"
20 21
@@ -587,7 +588,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
587 return err; 588 return err;
588} 589}
589 590
590static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) 591static int hppfs_fsync(struct file *file, int datasync)
591{ 592{
592 return 0; 593 return 0;
593} 594}
@@ -623,12 +624,11 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb)
623 return &hi->vfs_inode; 624 return &hi->vfs_inode;
624} 625}
625 626
626void hppfs_delete_inode(struct inode *ino) 627void hppfs_evict_inode(struct inode *ino)
627{ 628{
629 end_writeback(ino);
628 dput(HPPFS_I(ino)->proc_dentry); 630 dput(HPPFS_I(ino)->proc_dentry);
629 mntput(ino->i_sb->s_fs_info); 631 mntput(ino->i_sb->s_fs_info);
630
631 clear_inode(ino);
632} 632}
633 633
634static void hppfs_destroy_inode(struct inode *inode) 634static void hppfs_destroy_inode(struct inode *inode)
@@ -639,7 +639,7 @@ static void hppfs_destroy_inode(struct inode *inode)
639static const struct super_operations hppfs_sbops = { 639static const struct super_operations hppfs_sbops = {
640 .alloc_inode = hppfs_alloc_inode, 640 .alloc_inode = hppfs_alloc_inode,
641 .destroy_inode = hppfs_destroy_inode, 641 .destroy_inode = hppfs_destroy_inode,
642 .delete_inode = hppfs_delete_inode, 642 .evict_inode = hppfs_evict_inode,
643 .statfs = hppfs_statfs, 643 .statfs = hppfs_statfs,
644}; 644};
645 645
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..6e5bd42f3860 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -371,27 +371,10 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
371 hugetlb_unreserve_pages(inode, start, freed); 371 hugetlb_unreserve_pages(inode, start, freed);
372} 372}
373 373
374static void hugetlbfs_delete_inode(struct inode *inode) 374static void hugetlbfs_evict_inode(struct inode *inode)
375{ 375{
376 truncate_hugepages(inode, 0); 376 truncate_hugepages(inode, 0);
377 clear_inode(inode); 377 end_writeback(inode);
378}
379
380static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
381{
382 if (generic_detach_inode(inode)) {
383 truncate_hugepages(inode, 0);
384 clear_inode(inode);
385 destroy_inode(inode);
386 }
387}
388
389static void hugetlbfs_drop_inode(struct inode *inode)
390{
391 if (!inode->i_nlink)
392 generic_delete_inode(inode);
393 else
394 hugetlbfs_forget_inode(inode);
395} 378}
396 379
397static inline void 380static inline void
@@ -448,19 +431,20 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
448 431
449 error = inode_change_ok(inode, attr); 432 error = inode_change_ok(inode, attr);
450 if (error) 433 if (error)
451 goto out; 434 return error;
452 435
453 if (ia_valid & ATTR_SIZE) { 436 if (ia_valid & ATTR_SIZE) {
454 error = -EINVAL; 437 error = -EINVAL;
455 if (!(attr->ia_size & ~huge_page_mask(h))) 438 if (attr->ia_size & ~huge_page_mask(h))
456 error = hugetlb_vmtruncate(inode, attr->ia_size); 439 return -EINVAL;
440 error = hugetlb_vmtruncate(inode, attr->ia_size);
457 if (error) 441 if (error)
458 goto out; 442 return error;
459 attr->ia_valid &= ~ATTR_SIZE;
460 } 443 }
461 error = inode_setattr(inode, attr); 444
462out: 445 setattr_copy(inode, attr);
463 return error; 446 mark_inode_dirty(inode);
447 return 0;
464} 448}
465 449
466static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 450static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
@@ -688,7 +672,7 @@ static void init_once(void *foo)
688const struct file_operations hugetlbfs_file_operations = { 672const struct file_operations hugetlbfs_file_operations = {
689 .read = hugetlbfs_read, 673 .read = hugetlbfs_read,
690 .mmap = hugetlbfs_file_mmap, 674 .mmap = hugetlbfs_file_mmap,
691 .fsync = simple_sync_file, 675 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 676 .get_unmapped_area = hugetlb_get_unmapped_area,
693}; 677};
694 678
@@ -712,9 +696,8 @@ static const struct inode_operations hugetlbfs_inode_operations = {
712static const struct super_operations hugetlbfs_ops = { 696static const struct super_operations hugetlbfs_ops = {
713 .alloc_inode = hugetlbfs_alloc_inode, 697 .alloc_inode = hugetlbfs_alloc_inode,
714 .destroy_inode = hugetlbfs_destroy_inode, 698 .destroy_inode = hugetlbfs_destroy_inode,
699 .evict_inode = hugetlbfs_evict_inode,
715 .statfs = hugetlbfs_statfs, 700 .statfs = hugetlbfs_statfs,
716 .delete_inode = hugetlbfs_delete_inode,
717 .drop_inode = hugetlbfs_drop_inode,
718 .put_super = hugetlbfs_put_super, 701 .put_super = hugetlbfs_put_super,
719 .show_options = generic_show_options, 702 .show_options = generic_show_options,
720}; 703};
diff --git a/fs/inode.c b/fs/inode.c
index aaaaf096aa8e..d4fe9c031864 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,7 +20,6 @@
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/cdev.h> 21#include <linux/cdev.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/inotify.h>
24#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
25#include <linux/mount.h> 24#include <linux/mount.h>
26#include <linux/async.h> 25#include <linux/async.h>
@@ -264,12 +263,8 @@ void inode_init_once(struct inode *inode)
264 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); 263 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
265 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); 264 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
266 i_size_ordered_init(inode); 265 i_size_ordered_init(inode);
267#ifdef CONFIG_INOTIFY
268 INIT_LIST_HEAD(&inode->inotify_watches);
269 mutex_init(&inode->inotify_mutex);
270#endif
271#ifdef CONFIG_FSNOTIFY 266#ifdef CONFIG_FSNOTIFY
272 INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries); 267 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
273#endif 268#endif
274 INIT_LIST_HEAD(&inode->i_obj_list); 269 INIT_LIST_HEAD(&inode->i_obj_list);
275 mutex_init(&inode->i_obj_mutex); 270 mutex_init(&inode->i_obj_mutex);
@@ -288,42 +283,42 @@ static void init_once(void *foo)
288 */ 283 */
289void __iget(struct inode *inode) 284void __iget(struct inode *inode)
290{ 285{
291 if (atomic_read(&inode->i_count)) { 286 if (atomic_inc_return(&inode->i_count) != 1)
292 atomic_inc(&inode->i_count);
293 return; 287 return;
294 } 288
295 atomic_inc(&inode->i_count);
296 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 289 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
297 list_move(&inode->i_list, &inode_in_use); 290 list_move(&inode->i_list, &inode_in_use);
298 inodes_stat.nr_unused--; 291 inodes_stat.nr_unused--;
299} 292}
300 293
301/** 294void end_writeback(struct inode *inode)
302 * clear_inode - clear an inode
303 * @inode: inode to clear
304 *
305 * This is called by the filesystem to tell us
306 * that the inode is no longer useful. We just
307 * terminate it with extreme prejudice.
308 */
309void clear_inode(struct inode *inode)
310{ 295{
311 might_sleep(); 296 might_sleep();
312 invalidate_inode_buffers(inode);
313
314 BUG_ON(inode->i_data.nrpages); 297 BUG_ON(inode->i_data.nrpages);
298 BUG_ON(!list_empty(&inode->i_data.private_list));
315 BUG_ON(!(inode->i_state & I_FREEING)); 299 BUG_ON(!(inode->i_state & I_FREEING));
316 BUG_ON(inode->i_state & I_CLEAR); 300 BUG_ON(inode->i_state & I_CLEAR);
317 inode_sync_wait(inode); 301 inode_sync_wait(inode);
318 if (inode->i_sb->s_op->clear_inode) 302 inode->i_state = I_FREEING | I_CLEAR;
319 inode->i_sb->s_op->clear_inode(inode); 303}
304EXPORT_SYMBOL(end_writeback);
305
306static void evict(struct inode *inode)
307{
308 const struct super_operations *op = inode->i_sb->s_op;
309
310 if (op->evict_inode) {
311 op->evict_inode(inode);
312 } else {
313 if (inode->i_data.nrpages)
314 truncate_inode_pages(&inode->i_data, 0);
315 end_writeback(inode);
316 }
320 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 317 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
321 bd_forget(inode); 318 bd_forget(inode);
322 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 319 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
323 cd_forget(inode); 320 cd_forget(inode);
324 inode->i_state = I_CLEAR;
325} 321}
326EXPORT_SYMBOL(clear_inode);
327 322
328/* 323/*
329 * dispose_list - dispose of the contents of a local list 324 * dispose_list - dispose of the contents of a local list
@@ -342,9 +337,7 @@ static void dispose_list(struct list_head *head)
342 inode = list_first_entry(head, struct inode, i_list); 337 inode = list_first_entry(head, struct inode, i_list);
343 list_del(&inode->i_list); 338 list_del(&inode->i_list);
344 339
345 if (inode->i_data.nrpages) 340 evict(inode);
346 truncate_inode_pages(&inode->i_data, 0);
347 clear_inode(inode);
348 341
349 spin_lock(&inode_lock); 342 spin_lock(&inode_lock);
350 hlist_del_init(&inode->i_hash); 343 hlist_del_init(&inode->i_hash);
@@ -417,7 +410,6 @@ int invalidate_inodes(struct super_block *sb)
417 410
418 down_write(&iprune_sem); 411 down_write(&iprune_sem);
419 spin_lock(&inode_lock); 412 spin_lock(&inode_lock);
420 inotify_unmount_inodes(&sb->s_inodes);
421 fsnotify_unmount_inodes(&sb->s_inodes); 413 fsnotify_unmount_inodes(&sb->s_inodes);
422 busy = invalidate_list(&sb->s_inodes, &throw_away); 414 busy = invalidate_list(&sb->s_inodes, &throw_away);
423 spin_unlock(&inode_lock); 415 spin_unlock(&inode_lock);
@@ -516,7 +508,7 @@ static void prune_icache(int nr_to_scan)
516 * This function is passed the number of inodes to scan, and it returns the 508 * This function is passed the number of inodes to scan, and it returns the
517 * total number of remaining possibly-reclaimable inodes. 509 * total number of remaining possibly-reclaimable inodes.
518 */ 510 */
519static int shrink_icache_memory(int nr, gfp_t gfp_mask) 511static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
520{ 512{
521 if (nr) { 513 if (nr) {
522 /* 514 /*
@@ -557,7 +549,7 @@ repeat:
557 continue; 549 continue;
558 if (!test(inode, data)) 550 if (!test(inode, data))
559 continue; 551 continue;
560 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { 552 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
561 __wait_on_freeing_inode(inode); 553 __wait_on_freeing_inode(inode);
562 goto repeat; 554 goto repeat;
563 } 555 }
@@ -582,7 +574,7 @@ repeat:
582 continue; 574 continue;
583 if (inode->i_sb != sb) 575 if (inode->i_sb != sb)
584 continue; 576 continue;
585 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { 577 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
586 __wait_on_freeing_inode(inode); 578 __wait_on_freeing_inode(inode);
587 goto repeat; 579 goto repeat;
588 } 580 }
@@ -844,7 +836,7 @@ EXPORT_SYMBOL(iunique);
844struct inode *igrab(struct inode *inode) 836struct inode *igrab(struct inode *inode)
845{ 837{
846 spin_lock(&inode_lock); 838 spin_lock(&inode_lock);
847 if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) 839 if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
848 __iget(inode); 840 __iget(inode);
849 else 841 else
850 /* 842 /*
@@ -1093,7 +1085,7 @@ int insert_inode_locked(struct inode *inode)
1093 continue; 1085 continue;
1094 if (old->i_sb != sb) 1086 if (old->i_sb != sb)
1095 continue; 1087 continue;
1096 if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) 1088 if (old->i_state & (I_FREEING|I_WILL_FREE))
1097 continue; 1089 continue;
1098 break; 1090 break;
1099 } 1091 }
@@ -1132,7 +1124,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1132 continue; 1124 continue;
1133 if (!test(old, data)) 1125 if (!test(old, data))
1134 continue; 1126 continue;
1135 if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) 1127 if (old->i_state & (I_FREEING|I_WILL_FREE))
1136 continue; 1128 continue;
1137 break; 1129 break;
1138 } 1130 }
@@ -1184,71 +1176,51 @@ void remove_inode_hash(struct inode *inode)
1184} 1176}
1185EXPORT_SYMBOL(remove_inode_hash); 1177EXPORT_SYMBOL(remove_inode_hash);
1186 1178
1179int generic_delete_inode(struct inode *inode)
1180{
1181 return 1;
1182}
1183EXPORT_SYMBOL(generic_delete_inode);
1184
1187/* 1185/*
1188 * Tell the filesystem that this inode is no longer of any interest and should 1186 * Normal UNIX filesystem behaviour: delete the
1189 * be completely destroyed. 1187 * inode when the usage count drops to zero, and
1190 * 1188 * i_nlink is zero.
1191 * We leave the inode in the inode hash table until *after* the filesystem's
1192 * ->delete_inode completes. This ensures that an iget (such as nfsd might
1193 * instigate) will always find up-to-date information either in the hash or on
1194 * disk.
1195 *
1196 * I_FREEING is set so that no-one will take a new reference to the inode while
1197 * it is being deleted.
1198 */ 1189 */
1199void generic_delete_inode(struct inode *inode) 1190int generic_drop_inode(struct inode *inode)
1200{ 1191{
1201 const struct super_operations *op = inode->i_sb->s_op; 1192 return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
1202
1203 list_del_init(&inode->i_list);
1204 list_del_init(&inode->i_sb_list);
1205 WARN_ON(inode->i_state & I_NEW);
1206 inode->i_state |= I_FREEING;
1207 inodes_stat.nr_inodes--;
1208 spin_unlock(&inode_lock);
1209
1210 security_inode_delete(inode);
1211
1212 if (op->delete_inode) {
1213 void (*delete)(struct inode *) = op->delete_inode;
1214 /* Filesystems implementing their own
1215 * s_op->delete_inode are required to call
1216 * truncate_inode_pages and clear_inode()
1217 * internally */
1218 delete(inode);
1219 } else {
1220 truncate_inode_pages(&inode->i_data, 0);
1221 clear_inode(inode);
1222 }
1223 spin_lock(&inode_lock);
1224 hlist_del_init(&inode->i_hash);
1225 spin_unlock(&inode_lock);
1226 wake_up_inode(inode);
1227 BUG_ON(inode->i_state != I_CLEAR);
1228 destroy_inode(inode);
1229} 1193}
1230EXPORT_SYMBOL(generic_delete_inode); 1194EXPORT_SYMBOL_GPL(generic_drop_inode);
1231 1195
1232/** 1196/*
1233 * generic_detach_inode - remove inode from inode lists 1197 * Called when we're dropping the last reference
1234 * @inode: inode to remove 1198 * to an inode.
1235 *
1236 * Remove inode from inode lists, write it if it's dirty. This is just an
1237 * internal VFS helper exported for hugetlbfs. Do not use!
1238 * 1199 *
1239 * Returns 1 if inode should be completely destroyed. 1200 * Call the FS "drop_inode()" function, defaulting to
1201 * the legacy UNIX filesystem behaviour. If it tells
1202 * us to evict inode, do so. Otherwise, retain inode
1203 * in cache if fs is alive, sync and evict if fs is
1204 * shutting down.
1240 */ 1205 */
1241int generic_detach_inode(struct inode *inode) 1206static void iput_final(struct inode *inode)
1242{ 1207{
1243 struct super_block *sb = inode->i_sb; 1208 struct super_block *sb = inode->i_sb;
1209 const struct super_operations *op = inode->i_sb->s_op;
1210 int drop;
1244 1211
1245 if (!hlist_unhashed(&inode->i_hash)) { 1212 if (op && op->drop_inode)
1213 drop = op->drop_inode(inode);
1214 else
1215 drop = generic_drop_inode(inode);
1216
1217 if (!drop) {
1246 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1218 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1247 list_move(&inode->i_list, &inode_unused); 1219 list_move(&inode->i_list, &inode_unused);
1248 inodes_stat.nr_unused++; 1220 inodes_stat.nr_unused++;
1249 if (sb->s_flags & MS_ACTIVE) { 1221 if (sb->s_flags & MS_ACTIVE) {
1250 spin_unlock(&inode_lock); 1222 spin_unlock(&inode_lock);
1251 return 0; 1223 return;
1252 } 1224 }
1253 WARN_ON(inode->i_state & I_NEW); 1225 WARN_ON(inode->i_state & I_NEW);
1254 inode->i_state |= I_WILL_FREE; 1226 inode->i_state |= I_WILL_FREE;
@@ -1266,56 +1238,15 @@ int generic_detach_inode(struct inode *inode)
1266 inode->i_state |= I_FREEING; 1238 inode->i_state |= I_FREEING;
1267 inodes_stat.nr_inodes--; 1239 inodes_stat.nr_inodes--;
1268 spin_unlock(&inode_lock); 1240 spin_unlock(&inode_lock);
1269 return 1; 1241 evict(inode);
1270} 1242 spin_lock(&inode_lock);
1271EXPORT_SYMBOL_GPL(generic_detach_inode); 1243 hlist_del_init(&inode->i_hash);
1272 1244 spin_unlock(&inode_lock);
1273static void generic_forget_inode(struct inode *inode)
1274{
1275 if (!generic_detach_inode(inode))
1276 return;
1277 if (inode->i_data.nrpages)
1278 truncate_inode_pages(&inode->i_data, 0);
1279 clear_inode(inode);
1280 wake_up_inode(inode); 1245 wake_up_inode(inode);
1246 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1281 destroy_inode(inode); 1247 destroy_inode(inode);
1282} 1248}
1283 1249
1284/*
1285 * Normal UNIX filesystem behaviour: delete the
1286 * inode when the usage count drops to zero, and
1287 * i_nlink is zero.
1288 */
1289void generic_drop_inode(struct inode *inode)
1290{
1291 if (!inode->i_nlink)
1292 generic_delete_inode(inode);
1293 else
1294 generic_forget_inode(inode);
1295}
1296EXPORT_SYMBOL_GPL(generic_drop_inode);
1297
1298/*
1299 * Called when we're dropping the last reference
1300 * to an inode.
1301 *
1302 * Call the FS "drop()" function, defaulting to
1303 * the legacy UNIX filesystem behaviour..
1304 *
1305 * NOTE! NOTE! NOTE! We're called with the inode lock
1306 * held, and the drop function is supposed to release
1307 * the lock!
1308 */
1309static inline void iput_final(struct inode *inode)
1310{
1311 const struct super_operations *op = inode->i_sb->s_op;
1312 void (*drop)(struct inode *) = generic_drop_inode;
1313
1314 if (op && op->drop_inode)
1315 drop = op->drop_inode;
1316 drop(inode);
1317}
1318
1319/** 1250/**
1320 * iput - put an inode 1251 * iput - put an inode
1321 * @inode: inode to put 1252 * @inode: inode to put
@@ -1328,7 +1259,7 @@ static inline void iput_final(struct inode *inode)
1328void iput(struct inode *inode) 1259void iput(struct inode *inode)
1329{ 1260{
1330 if (inode) { 1261 if (inode) {
1331 BUG_ON(inode->i_state == I_CLEAR); 1262 BUG_ON(inode->i_state & I_CLEAR);
1332 1263
1333 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1264 if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
1334 iput_final(inode); 1265 iput_final(inode);
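
The restructuring above changes what ->drop_inode means: it no longer performs the teardown, it only decides, returning non-zero to evict immediately and zero to keep the inode cached, while iput_final() owns the common list handling, the evict() call, and unhashing. generic_delete_inode() accordingly shrinks to a "return 1" stub, and because end_writeback() now sets I_FREEING and I_CLEAR together, the various I_FREEING|I_CLEAR|I_WILL_FREE tests drop the I_CLEAR bit and iput()'s BUG_ON becomes a bitwise check. A hypothetical filesystem that never wants unreferenced inodes cached now needs only:

#include <linux/fs.h>

/* Hypothetical ->drop_inode: always evict on the last iput(), even if
 * i_nlink is non-zero; equivalent to pointing at generic_delete_inode. */
static int example_drop_inode(struct inode *inode)
{
	return 1;
}
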
@@ -1612,3 +1543,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1612 inode->i_ino); 1543 inode->i_ino);
1613} 1544}
1614EXPORT_SYMBOL(init_special_inode); 1545EXPORT_SYMBOL(init_special_inode);
1546
1547/**
1548 * Init uid,gid,mode for new inode according to posix standards
1549 * @inode: New inode
1550 * @dir: Directory inode
1551 * @mode: mode of the new inode
1552 */
1553void inode_init_owner(struct inode *inode, const struct inode *dir,
1554 mode_t mode)
1555{
1556 inode->i_uid = current_fsuid();
1557 if (dir && dir->i_mode & S_ISGID) {
1558 inode->i_gid = dir->i_gid;
1559 if (S_ISDIR(mode))
1560 mode |= S_ISGID;
1561 } else
1562 inode->i_gid = current_fsgid();
1563 inode->i_mode = mode;
1564}
1565EXPORT_SYMBOL(inode_init_owner);
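
The new inode_init_owner() helper hoists the POSIX ownership rules out of individual filesystems: the inode takes the creating task's fsuid, and a setgid parent directory propagates its gid (plus S_ISGID for new subdirectories). A converted ->create path shrinks to roughly the following (names hypothetical, error handling elided):

#include <linux/fs.h>

/* Sketch: ->create using the new helper (2.6.36-era method signature). */
static int example_create(struct inode *dir, struct dentry *dentry,
			  int mode, struct nameidata *nd)
{
	struct inode *inode = new_inode(dir->i_sb);

	if (!inode)
		return -ENOMEM;
	inode_init_owner(inode, dir, mode);	/* uid, gid, mode in one call */
	/* ... fs-specific setup, then d_instantiate(dentry, inode) ... */
	return 0;
}
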
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bdf..a6910e91cee8 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/lglock.h>
13
12struct super_block; 14struct super_block;
13struct linux_binprm; 15struct linux_binprm;
14struct path; 16struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
70 72
71extern void __init mnt_init(void); 73extern void __init mnt_init(void);
72 74
73extern spinlock_t vfsmount_lock; 75DECLARE_BRLOCK(vfsmount_lock);
76
74 77
75/* 78/*
76 * fs_struct.c 79 * fs_struct.c
@@ -80,6 +83,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
80/* 83/*
81 * file_table.c 84 * file_table.c
82 */ 85 */
86extern void file_sb_list_add(struct file *f, struct super_block *sb);
87extern void file_sb_list_del(struct file *f);
83extern void mark_files_ro(struct super_block *); 88extern void mark_files_ro(struct super_block *);
84extern struct file *get_empty_filp(void); 89extern struct file *get_empty_filp(void);
85 90
@@ -87,6 +92,8 @@ extern struct file *get_empty_filp(void);
87 * super.c 92 * super.c
88 */ 93 */
89extern int do_remount_sb(struct super_block *, int, void *, int); 94extern int do_remount_sb(struct super_block *, int, void *, int);
95extern void __put_super(struct super_block *sb);
96extern void put_super(struct super_block *sb);
90 97
91/* 98/*
92 * open.c 99 * open.c
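
vfsmount_lock changes character here: DECLARE_BRLOCK makes it a "big reader" lock from the new <linux/lglock.h>, so hot path-walking readers touch only a per-CPU lock while mount/umount pays the cost of locking every CPU. The lglock macros take the lock name as a token rather than a pointer; usage is roughly:

#include <linux/lglock.h>

DEFINE_BRLOCK(vfsmount_lock);		/* the definition lives in fs/namespace.c */

static void example_reader(void)
{
	br_read_lock(vfsmount_lock);	/* cheap: this CPU's slot only */
	/* ... follow mount points ... */
	br_read_unlock(vfsmount_lock);
}

static void example_writer(void)
{
	br_write_lock(vfsmount_lock);	/* slow: takes every CPU's slot */
	/* ... hash or unhash a vfsmount ... */
	br_write_unlock(vfsmount_lock);
}
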
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7faefb4da939..f855ea4fc888 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -29,7 +29,6 @@
29 * @arg: command-specific argument for ioctl 29 * @arg: command-specific argument for ioctl
30 * 30 *
31 * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise 31 * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise
32 * invokes filesystem specific ->ioctl method. If neither method exists,
33 * returns -ENOTTY. 32 * returns -ENOTTY.
34 * 33 *
35 * Returns 0 on success, -errno on error. 34 * Returns 0 on success, -errno on error.
@@ -39,21 +38,12 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
39{ 38{
40 int error = -ENOTTY; 39 int error = -ENOTTY;
41 40
42 if (!filp->f_op) 41 if (!filp->f_op || !filp->f_op->unlocked_ioctl)
43 goto out; 42 goto out;
44 43
45 if (filp->f_op->unlocked_ioctl) { 44 error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
46 error = filp->f_op->unlocked_ioctl(filp, cmd, arg); 45 if (error == -ENOIOCTLCMD)
47 if (error == -ENOIOCTLCMD) 46 error = -EINVAL;
48 error = -EINVAL;
49 goto out;
50 } else if (filp->f_op->ioctl) {
51 lock_kernel();
52 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
53 filp, cmd, arg);
54 unlock_kernel();
55 }
56
57 out: 47 out:
58 return error; 48 return error;
59} 49}
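
With the BKL-protected ->ioctl path removed, vfs_ioctl() is a straight call into ->unlocked_ioctl: drivers do their own locking, and -ENOIOCTLCMD from the handler is translated to -EINVAL for userspace. A minimal handler now looks like this (the command number is hypothetical):

#include <linux/fs.h>
#include <linux/ioctl.h>

#define EXAMPLE_IOC_RESET	_IO('x', 1)	/* hypothetical command */

/* Minimal ->unlocked_ioctl: the VFS no longer takes the BKL for us. */
static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	switch (cmd) {
	case EXAMPLE_IOC_RESET:
		/* ... take a driver-private lock and do the work ... */
		return 0;
	default:
		return -ENOIOCTLCMD;	/* vfs_ioctl() maps this to -EINVAL */
	}
}
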
@@ -525,15 +515,8 @@ static int ioctl_fsfreeze(struct file *filp)
525 if (sb->s_op->freeze_fs == NULL) 515 if (sb->s_op->freeze_fs == NULL)
526 return -EOPNOTSUPP; 516 return -EOPNOTSUPP;
527 517
528 /* If a blockdevice-backed filesystem isn't specified, return. */
529 if (sb->s_bdev == NULL)
530 return -EINVAL;
531
532 /* Freeze */ 518 /* Freeze */
533 sb = freeze_bdev(sb->s_bdev); 519 return freeze_super(sb);
534 if (IS_ERR(sb))
535 return PTR_ERR(sb);
536 return 0;
537} 520}
538 521
539static int ioctl_fsthaw(struct file *filp) 522static int ioctl_fsthaw(struct file *filp)
@@ -543,12 +526,8 @@ static int ioctl_fsthaw(struct file *filp)
543 if (!capable(CAP_SYS_ADMIN)) 526 if (!capable(CAP_SYS_ADMIN))
544 return -EPERM; 527 return -EPERM;
545 528
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 /* Thaw */ 529 /* Thaw */
551 return thaw_bdev(sb->s_bdev, sb); 530 return thaw_super(sb);
552} 531}
553 532
554/* 533/*
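
ioctl_fsfreeze()/ioctl_fsthaw() now call the new superblock-level freeze_super()/thaw_super() instead of going through the block device, which is why the s_bdev == NULL checks can go: filesystems without a backing block device become freezable too. The userspace interface is unchanged; a tool in the style of fsfreeze(8) still does roughly:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2)
		return fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]), 1;
	fd = open(argv[1], O_RDONLY);	/* any object on the filesystem */
	if (fd < 0 || ioctl(fd, FIFREEZE, 0) < 0)
		return perror("freeze"), 1;
	/* ... take the snapshot here ... */
	if (ioctl(fd, FITHAW, 0) < 0)
		return perror("thaw"), 1;
	return close(fd), 0;
}
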
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
272 272
273const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
274{ 274{
275 .llseek = generic_file_llseek,
275 .read = generic_read_dir, 276 .read = generic_read_dir,
276 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
277}; 278};
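
The explicit .llseek here is part of the tree-wide BKL retirement in 2.6.36: file_operations that used to fall back to default_llseek (which took the BKL) now name a lock-free implementation explicitly. For a directory whose position is a plain byte offset, generic_file_llseek fits; fops that must not seek would pick no_llseek instead. The resulting shape, with a stand-in readdir:

#include <linux/fs.h>

static int example_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	return 0;	/* stand-in; a real fs emits entries via filldir() */
}

/* Sketch of the pattern applied across the tree. */
static const struct file_operations example_dir_operations = {
	.llseek		= generic_file_llseek,	/* explicit, BKL-free */
	.read		= generic_read_dir,
	.readdir	= example_readdir,
};
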
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6b4dcd4f2943..5a44811b5027 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -722,7 +722,12 @@ root_found:
722 } 722 }
723 723
724 s->s_magic = ISOFS_SUPER_MAGIC; 724 s->s_magic = ISOFS_SUPER_MAGIC;
725 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */ 725
726 /*
727 * With multi-extent files, file size is only limited by the maximum
728 * size of a file system, which is 8 TB.
729 */
730 s->s_maxbytes = 0x80000000000LL;
726 731
727 /* 732 /*
728 * The CDROM is read-only, has no nodes (devices) on it, and since 733 * The CDROM is read-only, has no nodes (devices) on it, and since
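
The new limit is worth decoding: 0x80000000000 = 2^43 bytes = 8 TiB, the ISO 9660 file-system size ceiling (2^32 addressable logical blocks of 2^11 = 2048 bytes each), which a multi-extent file can in principle fill. The old cap of 0xffffffff (just under 4 GiB) was the single-extent limit imposed by the 32-bit data-length field in a directory record.
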
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654d..05a38b9c4c0e 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
254{ 254{
255 int i; 255 int i;
256 256
257 ll_rw_block(SWRITE, *batch_count, bhs); 257 for (i = 0; i < *batch_count; i++)
258 write_dirty_buffer(bhs[i], WRITE);
259
258 for (i = 0; i < *batch_count; i++) { 260 for (i = 0; i < *batch_count; i++) {
259 struct buffer_head *bh = bhs[i]; 261 struct buffer_head *bh = bhs[i];
260 clear_buffer_jwrite(bh); 262 clear_buffer_jwrite(bh);
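
ll_rw_block(SWRITE, ...) disappears in 2.6.36: the SWRITE variants ("lock the buffer unconditionally, then write it") are folded into the new write_dirty_buffer() helper, and array callers such as __flush_batch() now loop explicitly. The helper's logic is approximately the following (abridged sketch of the fs/buffer.c code, renamed to mark it as such):

#include <linux/buffer_head.h>

/* Approximate logic of write_dirty_buffer() (abridged sketch). */
static void example_write_dirty_buffer(struct buffer_head *bh, int rw)
{
	lock_buffer(bh);			/* may sleep */
	if (!test_clear_buffer_dirty(bh)) {
		unlock_buffer(bh);		/* someone else wrote it already */
		return;
	}
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);				/* dropped on I/O completion */
	submit_bh(rw, bh);
}
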
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index ecb44c94ba8d..95d8c11c929e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
119 struct buffer_head *bh; 119 struct buffer_head *bh;
120 journal_header_t *header; 120 journal_header_t *header;
121 int ret; 121 int ret;
122 int barrier_done = 0;
123 122
124 if (is_journal_aborted(journal)) 123 if (is_journal_aborted(journal))
125 return 0; 124 return 0;
@@ -137,34 +136,36 @@ static int journal_write_commit_record(journal_t *journal,
137 136
138 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
139 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
139
140 if (journal->j_flags & JFS_BARRIER) { 140 if (journal->j_flags & JFS_BARRIER) {
141 set_buffer_ordered(bh); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
142 barrier_done = 1;
143 }
144 ret = sync_dirty_buffer(bh);
145 if (barrier_done)
146 clear_buffer_ordered(bh);
147 /* is it possible for another commit to fail at roughly
148 * the same time as this one? If so, we don't want to
149 * trust the barrier flag in the super, but instead want
150 * to remember if we sent a barrier request
151 */
152 if (ret == -EOPNOTSUPP && barrier_done) {
153 char b[BDEVNAME_SIZE];
154 142
155 printk(KERN_WARNING 143 /*
156 "JBD: barrier-based sync failed on %s - " 144 * Is it possible for another commit to fail at roughly
157 "disabling barriers\n", 145 * the same time as this one? If so, we don't want to
158 bdevname(journal->j_dev, b)); 146 * trust the barrier flag in the super, but instead want
159 spin_lock(&journal->j_state_lock); 147 * to remember if we sent a barrier request
160 journal->j_flags &= ~JFS_BARRIER; 148 */
161 spin_unlock(&journal->j_state_lock); 149 if (ret == -EOPNOTSUPP) {
150 char b[BDEVNAME_SIZE];
162 151
163 /* And try again, without the barrier */ 152 printk(KERN_WARNING
164 set_buffer_uptodate(bh); 153 "JBD: barrier-based sync failed on %s - "
165 set_buffer_dirty(bh); 154 "disabling barriers\n",
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JFS_BARRIER;
158 spin_unlock(&journal->j_state_lock);
159
160 /* And try again, without the barrier */
161 set_buffer_uptodate(bh);
162 set_buffer_dirty(bh);
163 ret = sync_dirty_buffer(bh);
164 }
165 } else {
166 ret = sync_dirty_buffer(bh); 166 ret = sync_dirty_buffer(bh);
167 } 167 }
168
168 put_bh(bh); /* One for getblk() */ 169 put_bh(bh); /* One for getblk() */
169 journal_put_journal_head(descriptor); 170 journal_put_journal_head(descriptor);
170 171
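
set_buffer_ordered()/clear_buffer_ordered() are gone; the barrier request is now expressed at submission time through the new __sync_dirty_buffer(), which is sync_dirty_buffer() with caller-supplied write flags. Roughly:

#include <linux/buffer_head.h>

/* Sketch: writing the commit block, with or without a barrier. */
static int example_write_commit_block(struct buffer_head *bh, int use_barrier)
{
	if (use_barrier)
		/* -EOPNOTSUPP here means the device lacks barriers; the
		 * caller clears JFS_BARRIER and retries with a plain write. */
		return __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
	return sync_dirty_buffer(bh);
}
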
@@ -786,6 +787,12 @@ wait_for_iobuf:
786 787
787 jbd_debug(3, "JBD: commit phase 6\n"); 788 jbd_debug(3, "JBD: commit phase 6\n");
788 789
790 /* All metadata is written, now write commit record and do cleanup */
791 spin_lock(&journal->j_state_lock);
792 J_ASSERT(commit_transaction->t_state == T_COMMIT);
793 commit_transaction->t_state = T_COMMIT_RECORD;
794 spin_unlock(&journal->j_state_lock);
795
789 if (journal_write_commit_record(journal, commit_transaction)) 796 if (journal_write_commit_record(journal, commit_transaction))
790 err = -EIO; 797 err = -EIO;
791 798
@@ -923,7 +930,7 @@ restart_loop:
923 930
924 jbd_debug(3, "JBD: commit phase 8\n"); 931 jbd_debug(3, "JBD: commit phase 8\n");
925 932
926 J_ASSERT(commit_transaction->t_state == T_COMMIT); 933 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
927 934
928 commit_transaction->t_state = T_FINISHED; 935 commit_transaction->t_state = T_FINISHED;
929 J_ASSERT(commit_transaction == journal->j_committing_transaction); 936 J_ASSERT(commit_transaction == journal->j_committing_transaction);
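
jbd gains a transaction state here: once all metadata is logged, the commit thread moves the transaction from T_COMMIT to the new T_COMMIT_RECORD before writing the commit block, so other code (notably journal_trans_will_send_data_barrier(), added below in fs/jbd/journal.c) can tell whether the commit barrier is still pending. A simplified view of the sequence (the full enum in include/linux/jbd.h carries a couple of additional historical states):

/*
 * T_RUNNING -> T_LOCKED -> T_FLUSH -> T_COMMIT
 *           -> T_COMMIT_RECORD   (new: metadata on disk, commit record
 *                                 and its barrier not yet issued)
 *           -> T_FINISHED
 */
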
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224eec9b07..2c4b1f109da9 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid)
565} 565}
566 566
567/* 567/*
568 * Return 1 if a given transaction has not yet sent barrier request
569 * connected with a transaction commit. If 0 is returned, transaction
570 * may or may not have sent the barrier. Used to avoid sending barrier
571 * twice in common cases.
572 */
573int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
574{
575 int ret = 0;
576 transaction_t *commit_trans;
577
578 if (!(journal->j_flags & JFS_BARRIER))
579 return 0;
580 spin_lock(&journal->j_state_lock);
581 /* Transaction already committed? */
582 if (tid_geq(journal->j_commit_sequence, tid))
583 goto out;
584 /*
585 * Transaction is being committed and we already proceeded to
586 * writing commit record?
587 */
588 commit_trans = journal->j_committing_transaction;
589 if (commit_trans && commit_trans->t_tid == tid &&
590 commit_trans->t_state >= T_COMMIT_RECORD)
591 goto out;
592 ret = 1;
593out:
594 spin_unlock(&journal->j_state_lock);
595 return ret;
596}
597EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
598
599/*
568 * Log buffer allocation routines: 600 * Log buffer allocation routines:
569 */ 601 */
570 602
@@ -992,7 +1024,7 @@ void journal_update_superblock(journal_t *journal, int wait)
992 if (wait) 1024 if (wait)
993 sync_dirty_buffer(bh); 1025 sync_dirty_buffer(bh);
994 else 1026 else
995 ll_rw_block(SWRITE, 1, &bh); 1027 write_dirty_buffer(bh, WRITE);
996 1028
997out: 1029out:
998 /* If we have just flushed the log (by marking s_start==0), then 1030 /* If we have just flushed the log (by marking s_start==0), then
@@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal)
1157{ 1189{
1158 int err = 0; 1190 int err = 0;
1159 1191
1192
1160 /* Wait for the commit thread to wake up and die. */ 1193 /* Wait for the commit thread to wake up and die. */
1161 journal_kill_thread(journal); 1194 journal_kill_thread(journal);
1162 1195
@@ -1248,13 +1281,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
1248int journal_check_available_features (journal_t *journal, unsigned long compat, 1281int journal_check_available_features (journal_t *journal, unsigned long compat,
1249 unsigned long ro, unsigned long incompat) 1282 unsigned long ro, unsigned long incompat)
1250{ 1283{
1251 journal_superblock_t *sb;
1252
1253 if (!compat && !ro && !incompat) 1284 if (!compat && !ro && !incompat)
1254 return 1; 1285 return 1;
1255 1286
1256 sb = journal->j_superblock;
1257
1258 /* We can support any known requested features iff the 1287 /* We can support any known requested features iff the
1259 * superblock is in version 2. Otherwise we fail to support any 1288 * superblock is in version 2. Otherwise we fail to support any
1260 * extended sb features. */ 1289 * extended sb features. */
@@ -1448,7 +1477,6 @@ int journal_flush(journal_t *journal)
1448 1477
1449int journal_wipe(journal_t *journal, int write) 1478int journal_wipe(journal_t *journal, int write)
1450{ 1479{
1451 journal_superblock_t *sb;
1452 int err = 0; 1480 int err = 0;
1453 1481
1454 J_ASSERT (!(journal->j_flags & JFS_LOADED)); 1482 J_ASSERT (!(journal->j_flags & JFS_LOADED));
@@ -1457,8 +1485,6 @@ int journal_wipe(journal_t *journal, int write)
1457 if (err) 1485 if (err)
1458 return err; 1486 return err;
1459 1487
1460 sb = journal->j_superblock;
1461
1462 if (!journal->j_tail) 1488 if (!journal->j_tail)
1463 goto no_recovery; 1489 goto no_recovery;
1464 1490
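
The other addition to this file, journal_trans_will_send_data_barrier(), lets a caller (ext3's fsync path is the intended user) skip a redundant cache flush: a return of 1 guarantees the commit code will still issue the barrier for that transaction, while 0 means it may already have been sent, so the caller must flush itself. A hypothetical caller, using only helpers visible in this diff:

#include <linux/jbd.h>
#include <linux/blkdev.h>

/* Hypothetical fsync-style path avoiding a duplicate barrier. */
static int example_fsync_commit(journal_t *journal, tid_t commit_tid,
				struct block_device *bdev)
{
	int needs_flush = !journal_trans_will_send_data_barrier(journal,
								commit_tid);

	log_wait_commit(journal, commit_tid);	/* wait for the commit block */
	if (needs_flush)
		blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
	return 0;
}
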
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 54c9bc9e1b17..81051dafebf5 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -283,12 +283,9 @@ int journal_recover(journal_t *journal)
283int journal_skip_recovery(journal_t *journal) 283int journal_skip_recovery(journal_t *journal)
284{ 284{
285 int err; 285 int err;
286 journal_superblock_t * sb;
287
288 struct recovery_info info; 286 struct recovery_info info;
289 287
290 memset (&info, 0, sizeof(info)); 288 memset (&info, 0, sizeof(info));
291 sb = journal->j_superblock;
292 289
293 err = do_one_pass(journal, &info, PASS_SCAN); 290 err = do_one_pass(journal, &info, PASS_SCAN);
294 291
@@ -297,7 +294,8 @@ int journal_skip_recovery(journal_t *journal)
297 ++journal->j_transaction_sequence; 294 ++journal->j_transaction_sequence;
298 } else { 295 } else {
299#ifdef CONFIG_JBD_DEBUG 296#ifdef CONFIG_JBD_DEBUG
300 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 297 int dropped = info.end_transaction -
298 be32_to_cpu(journal->j_superblock->s_sequence);
301#endif 299#endif
302 jbd_debug(1, 300 jbd_debug(1,
303 "JBD: ignoring %d transaction%s from the journal.\n", 301 "JBD: ignoring %d transaction%s from the journal.\n",
@@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal,
321 unsigned int sequence; 319 unsigned int sequence;
322 int blocktype; 320 int blocktype;
323 321
324 /* Precompute the maximum metadata descriptors in a descriptor block */
325 int MAX_BLOCKS_PER_DESC;
326 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
327 / sizeof(journal_block_tag_t));
328
329 /* 322 /*
330 * First thing is to establish what we expect to find in the log 323 * First thing is to establish what we expect to find in the log
331 * (in terms of transaction IDs), and where (in terms of log 324 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343a..d29018307e2e 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
617 set_buffer_jwrite(bh); 617 set_buffer_jwrite(bh);
618 BUFFER_TRACE(bh, "write"); 618 BUFFER_TRACE(bh, "write");
619 set_buffer_dirty(bh); 619 set_buffer_dirty(bh);
620 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); 620 write_dirty_buffer(bh, write_op);
621} 621}
622#endif 622#endif
623 623
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..5247e7ffdcb4 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
118void __jbd2_log_wait_for_space(journal_t *journal) 118void __jbd2_log_wait_for_space(journal_t *journal)
119{ 119{
120 int nblocks, space_left; 120 int nblocks, space_left;
121 assert_spin_locked(&journal->j_state_lock); 121 /* assert_spin_locked(&journal->j_state_lock); */
122 122
123 nblocks = jbd_space_needed(journal); 123 nblocks = jbd_space_needed(journal);
124 while (__jbd2_log_space_left(journal) < nblocks) { 124 while (__jbd2_log_space_left(journal) < nblocks) {
125 if (journal->j_flags & JBD2_ABORT) 125 if (journal->j_flags & JBD2_ABORT)
126 return; 126 return;
127 spin_unlock(&journal->j_state_lock); 127 write_unlock(&journal->j_state_lock);
128 mutex_lock(&journal->j_checkpoint_mutex); 128 mutex_lock(&journal->j_checkpoint_mutex);
129 129
130 /* 130 /*
@@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
138 * filesystem, so abort the journal and leave a stack 138 * filesystem, so abort the journal and leave a stack
139 * trace for forensic evidence. 139 * trace for forensic evidence.
140 */ 140 */
141 spin_lock(&journal->j_state_lock); 141 write_lock(&journal->j_state_lock);
142 spin_lock(&journal->j_list_lock); 142 spin_lock(&journal->j_list_lock);
143 nblocks = jbd_space_needed(journal); 143 nblocks = jbd_space_needed(journal);
144 space_left = __jbd2_log_space_left(journal); 144 space_left = __jbd2_log_space_left(journal);
@@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
149 if (journal->j_committing_transaction) 149 if (journal->j_committing_transaction)
150 tid = journal->j_committing_transaction->t_tid; 150 tid = journal->j_committing_transaction->t_tid;
151 spin_unlock(&journal->j_list_lock); 151 spin_unlock(&journal->j_list_lock);
152 spin_unlock(&journal->j_state_lock); 152 write_unlock(&journal->j_state_lock);
153 if (chkpt) { 153 if (chkpt) {
154 jbd2_log_do_checkpoint(journal); 154 jbd2_log_do_checkpoint(journal);
155 } else if (jbd2_cleanup_journal_tail(journal) == 0) { 155 } else if (jbd2_cleanup_journal_tail(journal) == 0) {
@@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
167 WARN_ON(1); 167 WARN_ON(1);
168 jbd2_journal_abort(journal, 0); 168 jbd2_journal_abort(journal, 0);
169 } 169 }
170 spin_lock(&journal->j_state_lock); 170 write_lock(&journal->j_state_lock);
171 } else { 171 } else {
172 spin_unlock(&journal->j_list_lock); 172 spin_unlock(&journal->j_list_lock);
173 } 173 }
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
255{ 255{
256 int i; 256 int i;
257 257
258 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); 258 for (i = 0; i < *batch_count; i++)
259 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
260
259 for (i = 0; i < *batch_count; i++) { 261 for (i = 0; i < *batch_count; i++) {
260 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 262 struct buffer_head *bh = journal->j_chkpt_bhs[i];
261 clear_buffer_jwrite(bh); 263 clear_buffer_jwrite(bh);
@@ -474,7 +476,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
474 * next transaction ID we will write, and where it will 476 * next transaction ID we will write, and where it will
475 * start. */ 477 * start. */
476 478
477 spin_lock(&journal->j_state_lock); 479 write_lock(&journal->j_state_lock);
478 spin_lock(&journal->j_list_lock); 480 spin_lock(&journal->j_list_lock);
479 transaction = journal->j_checkpoint_transactions; 481 transaction = journal->j_checkpoint_transactions;
480 if (transaction) { 482 if (transaction) {
@@ -496,7 +498,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
496 /* If the oldest pinned transaction is at the tail of the log 498 /* If the oldest pinned transaction is at the tail of the log
497 already then there's not much we can do right now. */ 499 already then there's not much we can do right now. */
498 if (journal->j_tail_sequence == first_tid) { 500 if (journal->j_tail_sequence == first_tid) {
499 spin_unlock(&journal->j_state_lock); 501 write_unlock(&journal->j_state_lock);
500 return 1; 502 return 1;
501 } 503 }
502 504
@@ -516,7 +518,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
516 journal->j_free += freed; 518 journal->j_free += freed;
517 journal->j_tail_sequence = first_tid; 519 journal->j_tail_sequence = first_tid;
518 journal->j_tail = blocknr; 520 journal->j_tail = blocknr;
519 spin_unlock(&journal->j_state_lock); 521 write_unlock(&journal->j_state_lock);
520 522
521 /* 523 /*
522 * If there is an external journal, we need to make sure that 524 * If there is an external journal, we need to make sure that
@@ -530,7 +532,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 532 */
531 if ((journal->j_fs_dev != journal->j_dev) && 533 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 534 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
536 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 537 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 538 jbd2_journal_update_superblock(journal, 1);
536 return 0; 539 return 0;
@@ -774,7 +777,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
774 J_ASSERT(transaction->t_log_list == NULL); 777 J_ASSERT(transaction->t_log_list == NULL);
775 J_ASSERT(transaction->t_checkpoint_list == NULL); 778 J_ASSERT(transaction->t_checkpoint_list == NULL);
776 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 779 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
777 J_ASSERT(transaction->t_updates == 0); 780 J_ASSERT(atomic_read(&transaction->t_updates) == 0);
778 J_ASSERT(journal->j_committing_transaction != transaction); 781 J_ASSERT(journal->j_committing_transaction != transaction);
779 J_ASSERT(journal->j_running_transaction != transaction); 782 J_ASSERT(journal->j_running_transaction != transaction);
780 783
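
The J_ASSERT change is the visible edge of a wider jbd2 conversion: t_updates, t_outstanding_credits and t_handle_count become atomic_t so that handle start/stop can account under a shared read_lock of j_state_lock rather than the old exclusive spinlock (the rest of the conversion appears in the commit.c and journal.c diffs below). The handle-side pattern becomes roughly:

#include <linux/jbd2.h>

/* Sketch of handle accounting once the counters are atomic. */
static void example_account_handle(journal_t *journal,
				   transaction_t *transaction, int nblocks)
{
	read_lock(&journal->j_state_lock);	/* shared; many CPUs at once */
	atomic_inc(&transaction->t_updates);
	atomic_add(nblocks, &transaction->t_outstanding_credits);
	atomic_inc(&transaction->t_handle_count);
	read_unlock(&journal->j_state_lock);
}
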
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..7c068c189d80 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -101,7 +101,6 @@ static int journal_submit_commit_record(journal_t *journal,
101 struct commit_header *tmp; 101 struct commit_header *tmp;
102 struct buffer_head *bh; 102 struct buffer_head *bh;
103 int ret; 103 int ret;
104 int barrier_done = 0;
105 struct timespec now = current_kernel_time(); 104 struct timespec now = current_kernel_time();
106 105
107 if (is_journal_aborted(journal)) 106 if (is_journal_aborted(journal))
@@ -136,30 +135,22 @@ static int journal_submit_commit_record(journal_t *journal,
136 if (journal->j_flags & JBD2_BARRIER && 135 if (journal->j_flags & JBD2_BARRIER &&
137 !JBD2_HAS_INCOMPAT_FEATURE(journal, 136 !JBD2_HAS_INCOMPAT_FEATURE(journal,
138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
139 set_buffer_ordered(bh); 138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
140 barrier_done = 1; 139 if (ret == -EOPNOTSUPP) {
141 } 140 printk(KERN_WARNING
142 ret = submit_bh(WRITE_SYNC_PLUG, bh); 141 "JBD2: Disabling barriers on %s, "
143 if (barrier_done) 142 "not supported by device\n", journal->j_devname);
144 clear_buffer_ordered(bh); 143 write_lock(&journal->j_state_lock);
145 144 journal->j_flags &= ~JBD2_BARRIER;
146 /* is it possible for another commit to fail at roughly 145 write_unlock(&journal->j_state_lock);
147 * the same time as this one? If so, we don't want to 146
148 * trust the barrier flag in the super, but instead want 147 /* And try again, without the barrier */
149 * to remember if we sent a barrier request 148 lock_buffer(bh);
150 */ 149 set_buffer_uptodate(bh);
151 if (ret == -EOPNOTSUPP && barrier_done) { 150 clear_buffer_dirty(bh);
152 printk(KERN_WARNING 151 ret = submit_bh(WRITE_SYNC_PLUG, bh);
153 "JBD: barrier-based sync failed on %s - " 152 }
154 "disabling barriers\n", journal->j_devname); 153 } else {
155 spin_lock(&journal->j_state_lock);
156 journal->j_flags &= ~JBD2_BARRIER;
157 spin_unlock(&journal->j_state_lock);
158
159 /* And try again, without the barrier */
160 lock_buffer(bh);
161 set_buffer_uptodate(bh);
162 clear_buffer_dirty(bh);
163 ret = submit_bh(WRITE_SYNC_PLUG, bh); 154 ret = submit_bh(WRITE_SYNC_PLUG, bh);
164 } 155 }
165 *cbh = bh; 156 *cbh = bh;
@@ -180,11 +171,11 @@ retry:
180 wait_on_buffer(bh); 171 wait_on_buffer(bh);
181 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { 172 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
182 printk(KERN_WARNING 173 printk(KERN_WARNING
183 "JBD2: wait_on_commit_record: sync failed on %s - " 174 "JBD2: %s: disabling barries on %s - not supported "
184 "disabling barriers\n", journal->j_devname); 175 "by device\n", __func__, journal->j_devname);
185 spin_lock(&journal->j_state_lock); 176 write_lock(&journal->j_state_lock);
186 journal->j_flags &= ~JBD2_BARRIER; 177 journal->j_flags &= ~JBD2_BARRIER;
187 spin_unlock(&journal->j_state_lock); 178 write_unlock(&journal->j_state_lock);
188 179
189 lock_buffer(bh); 180 lock_buffer(bh);
190 clear_buffer_dirty(bh); 181 clear_buffer_dirty(bh);
@@ -400,7 +391,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
400 jbd_debug(1, "JBD: starting commit of transaction %d\n", 391 jbd_debug(1, "JBD: starting commit of transaction %d\n",
401 commit_transaction->t_tid); 392 commit_transaction->t_tid);
402 393
403 spin_lock(&journal->j_state_lock); 394 write_lock(&journal->j_state_lock);
404 commit_transaction->t_state = T_LOCKED; 395 commit_transaction->t_state = T_LOCKED;
405 396
406 /* 397 /*
@@ -417,23 +408,23 @@ void jbd2_journal_commit_transaction(journal_t *journal)
417 stats.run.rs_locked); 408 stats.run.rs_locked);
418 409
419 spin_lock(&commit_transaction->t_handle_lock); 410 spin_lock(&commit_transaction->t_handle_lock);
420 while (commit_transaction->t_updates) { 411 while (atomic_read(&commit_transaction->t_updates)) {
421 DEFINE_WAIT(wait); 412 DEFINE_WAIT(wait);
422 413
423 prepare_to_wait(&journal->j_wait_updates, &wait, 414 prepare_to_wait(&journal->j_wait_updates, &wait,
424 TASK_UNINTERRUPTIBLE); 415 TASK_UNINTERRUPTIBLE);
425 if (commit_transaction->t_updates) { 416 if (atomic_read(&commit_transaction->t_updates)) {
426 spin_unlock(&commit_transaction->t_handle_lock); 417 spin_unlock(&commit_transaction->t_handle_lock);
427 spin_unlock(&journal->j_state_lock); 418 write_unlock(&journal->j_state_lock);
428 schedule(); 419 schedule();
429 spin_lock(&journal->j_state_lock); 420 write_lock(&journal->j_state_lock);
430 spin_lock(&commit_transaction->t_handle_lock); 421 spin_lock(&commit_transaction->t_handle_lock);
431 } 422 }
432 finish_wait(&journal->j_wait_updates, &wait); 423 finish_wait(&journal->j_wait_updates, &wait);
433 } 424 }
434 spin_unlock(&commit_transaction->t_handle_lock); 425 spin_unlock(&commit_transaction->t_handle_lock);
435 426
436 J_ASSERT (commit_transaction->t_outstanding_credits <= 427 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
437 journal->j_max_transaction_buffers); 428 journal->j_max_transaction_buffers);
438 429
439 /* 430 /*
@@ -497,7 +488,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 start_time = ktime_get(); 488 start_time = ktime_get();
498 commit_transaction->t_log_start = journal->j_head; 489 commit_transaction->t_log_start = journal->j_head;
499 wake_up(&journal->j_wait_transaction_locked); 490 wake_up(&journal->j_wait_transaction_locked);
500 spin_unlock(&journal->j_state_lock); 491 write_unlock(&journal->j_state_lock);
501 492
502 jbd_debug (3, "JBD: commit phase 2\n"); 493 jbd_debug (3, "JBD: commit phase 2\n");
503 494
@@ -519,19 +510,20 @@ void jbd2_journal_commit_transaction(journal_t *journal)
519 * transaction! Now comes the tricky part: we need to write out 510 * transaction! Now comes the tricky part: we need to write out
520 * metadata. Loop over the transaction's entire buffer list: 511 * metadata. Loop over the transaction's entire buffer list:
521 */ 512 */
522 spin_lock(&journal->j_state_lock); 513 write_lock(&journal->j_state_lock);
523 commit_transaction->t_state = T_COMMIT; 514 commit_transaction->t_state = T_COMMIT;
524 spin_unlock(&journal->j_state_lock); 515 write_unlock(&journal->j_state_lock);
525 516
526 trace_jbd2_commit_logging(journal, commit_transaction); 517 trace_jbd2_commit_logging(journal, commit_transaction);
527 stats.run.rs_logging = jiffies; 518 stats.run.rs_logging = jiffies;
528 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, 519 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
529 stats.run.rs_logging); 520 stats.run.rs_logging);
530 stats.run.rs_blocks = commit_transaction->t_outstanding_credits; 521 stats.run.rs_blocks =
522 atomic_read(&commit_transaction->t_outstanding_credits);
531 stats.run.rs_blocks_logged = 0; 523 stats.run.rs_blocks_logged = 0;
532 524
533 J_ASSERT(commit_transaction->t_nr_buffers <= 525 J_ASSERT(commit_transaction->t_nr_buffers <=
534 commit_transaction->t_outstanding_credits); 526 atomic_read(&commit_transaction->t_outstanding_credits));
535 527
536 err = 0; 528 err = 0;
537 descriptor = NULL; 529 descriptor = NULL;
@@ -616,7 +608,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 * the free space in the log, but this counter is changed 608 * the free space in the log, but this counter is changed
617 * by jbd2_journal_next_log_block() also. 609 * by jbd2_journal_next_log_block() also.
618 */ 610 */
619 commit_transaction->t_outstanding_credits--; 611 atomic_dec(&commit_transaction->t_outstanding_credits);
620 612
621 /* Bump b_count to prevent truncate from stumbling over 613 /* Bump b_count to prevent truncate from stumbling over
622 the shadowed buffer! @@@ This can go if we ever get 614 the shadowed buffer! @@@ This can go if we ever get
@@ -717,7 +709,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 709 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 710 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 711 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 712 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
713 BLKDEV_IFL_WAIT);
721 714
722 /* Done it all: now write the commit record asynchronously. */ 715 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 716 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +720,8 @@ start_journal_io:
727 if (err) 720 if (err)
728 __jbd2_journal_abort_hard(journal); 721 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 722 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 723 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
724 BLKDEV_IFL_WAIT);
731 } 725 }
732 726
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 727 err = journal_finish_inode_data_buffers(journal, commit_transaction);
@@ -975,7 +969,7 @@ restart_loop:
975 * __jbd2_journal_drop_transaction(). Otherwise we could race with 969 * __jbd2_journal_drop_transaction(). Otherwise we could race with
976 * other checkpointing code processing the transaction... 970 * other checkpointing code processing the transaction...
977 */ 971 */
978 spin_lock(&journal->j_state_lock); 972 write_lock(&journal->j_state_lock);
979 spin_lock(&journal->j_list_lock); 973 spin_lock(&journal->j_list_lock);
980 /* 974 /*
981 * Now recheck if some buffers did not get attached to the transaction 975 * Now recheck if some buffers did not get attached to the transaction
@@ -983,7 +977,7 @@ restart_loop:
983 */ 977 */
984 if (commit_transaction->t_forget) { 978 if (commit_transaction->t_forget) {
985 spin_unlock(&journal->j_list_lock); 979 spin_unlock(&journal->j_list_lock);
986 spin_unlock(&journal->j_state_lock); 980 write_unlock(&journal->j_state_lock);
987 goto restart_loop; 981 goto restart_loop;
988 } 982 }
989 983
@@ -1001,7 +995,8 @@ restart_loop:
1001 * File the transaction statistics 995 * File the transaction statistics
1002 */ 996 */
1003 stats.ts_tid = commit_transaction->t_tid; 997 stats.ts_tid = commit_transaction->t_tid;
1004 stats.run.rs_handle_count = commit_transaction->t_handle_count; 998 stats.run.rs_handle_count =
999 atomic_read(&commit_transaction->t_handle_count);
1005 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1000 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1006 commit_transaction->t_tid, &stats.run); 1001 commit_transaction->t_tid, &stats.run);
1007 1002
@@ -1035,7 +1030,7 @@ restart_loop:
1035 journal->j_average_commit_time*3) / 4; 1030 journal->j_average_commit_time*3) / 4;
1036 else 1031 else
1037 journal->j_average_commit_time = commit_time; 1032 journal->j_average_commit_time = commit_time;
1038 spin_unlock(&journal->j_state_lock); 1033 write_unlock(&journal->j_state_lock);
1039 1034
1040 if (commit_transaction->t_checkpoint_list == NULL && 1035 if (commit_transaction->t_checkpoint_list == NULL &&
1041 commit_transaction->t_checkpoint_io_list == NULL) { 1036 commit_transaction->t_checkpoint_io_list == NULL) {
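
All of the j_state_lock churn in this file follows one rule: j_state_lock is now an rwlock_t, so pure readers of journal state (jbd2_journal_force_commit_nested, jbd2_log_wait_commit, the superblock snapshot) take read_lock and may run in parallel, while anything mutating state keeps an exclusive write_lock. Note that rwlock_t has no assert_spin_locked() equivalent, which is why the assertions in __jbd2_log_wait_for_space() and __jbd2_log_space_left() are commented out rather than converted. In sketch form:

#include <linux/jbd2.h>

/* Reader: several CPUs may hold this at once. */
static tid_t example_commit_sequence(journal_t *journal)
{
	tid_t tid;

	read_lock(&journal->j_state_lock);
	tid = journal->j_commit_sequence;
	read_unlock(&journal->j_state_lock);
	return tid;
}

/* Writer: exclusive, same semantics as the old spinlock. */
static void example_set_flags(journal_t *journal, unsigned long flags)
{
	write_lock(&journal->j_state_lock);
	journal->j_flags |= flags;
	write_unlock(&journal->j_state_lock);
}
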
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c03d4dce4d76..0e8014ea6b94 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,6 +41,7 @@
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h> 42#include <linux/log2.h>
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h>
44 45
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
46#include <trace/events/jbd2.h> 47#include <trace/events/jbd2.h>
@@ -48,8 +49,6 @@
48#include <asm/uaccess.h> 49#include <asm/uaccess.h>
49#include <asm/page.h> 50#include <asm/page.h>
50 51
51EXPORT_SYMBOL(jbd2_journal_start);
52EXPORT_SYMBOL(jbd2_journal_restart);
53EXPORT_SYMBOL(jbd2_journal_extend); 52EXPORT_SYMBOL(jbd2_journal_extend);
54EXPORT_SYMBOL(jbd2_journal_stop); 53EXPORT_SYMBOL(jbd2_journal_stop);
55EXPORT_SYMBOL(jbd2_journal_lock_updates); 54EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -143,7 +142,7 @@ static int kjournald2(void *arg)
143 /* 142 /*
144 * And now, wait forever for commit wakeup events. 143 * And now, wait forever for commit wakeup events.
145 */ 144 */
146 spin_lock(&journal->j_state_lock); 145 write_lock(&journal->j_state_lock);
147 146
148loop: 147loop:
149 if (journal->j_flags & JBD2_UNMOUNT) 148 if (journal->j_flags & JBD2_UNMOUNT)
@@ -154,10 +153,10 @@ loop:
154 153
155 if (journal->j_commit_sequence != journal->j_commit_request) { 154 if (journal->j_commit_sequence != journal->j_commit_request) {
156 jbd_debug(1, "OK, requests differ\n"); 155 jbd_debug(1, "OK, requests differ\n");
157 spin_unlock(&journal->j_state_lock); 156 write_unlock(&journal->j_state_lock);
158 del_timer_sync(&journal->j_commit_timer); 157 del_timer_sync(&journal->j_commit_timer);
159 jbd2_journal_commit_transaction(journal); 158 jbd2_journal_commit_transaction(journal);
160 spin_lock(&journal->j_state_lock); 159 write_lock(&journal->j_state_lock);
161 goto loop; 160 goto loop;
162 } 161 }
163 162
@@ -169,9 +168,9 @@ loop:
169 * be already stopped. 168 * be already stopped.
170 */ 169 */
171 jbd_debug(1, "Now suspending kjournald2\n"); 170 jbd_debug(1, "Now suspending kjournald2\n");
172 spin_unlock(&journal->j_state_lock); 171 write_unlock(&journal->j_state_lock);
173 refrigerator(); 172 refrigerator();
174 spin_lock(&journal->j_state_lock); 173 write_lock(&journal->j_state_lock);
175 } else { 174 } else {
176 /* 175 /*
177 * We assume on resume that commits are already there, 176 * We assume on resume that commits are already there,
@@ -191,9 +190,9 @@ loop:
191 if (journal->j_flags & JBD2_UNMOUNT) 190 if (journal->j_flags & JBD2_UNMOUNT)
192 should_sleep = 0; 191 should_sleep = 0;
193 if (should_sleep) { 192 if (should_sleep) {
194 spin_unlock(&journal->j_state_lock); 193 write_unlock(&journal->j_state_lock);
195 schedule(); 194 schedule();
196 spin_lock(&journal->j_state_lock); 195 write_lock(&journal->j_state_lock);
197 } 196 }
198 finish_wait(&journal->j_wait_commit, &wait); 197 finish_wait(&journal->j_wait_commit, &wait);
199 } 198 }
@@ -211,7 +210,7 @@ loop:
211 goto loop; 210 goto loop;
212 211
213end_loop: 212end_loop:
214 spin_unlock(&journal->j_state_lock); 213 write_unlock(&journal->j_state_lock);
215 del_timer_sync(&journal->j_commit_timer); 214 del_timer_sync(&journal->j_commit_timer);
216 journal->j_task = NULL; 215 journal->j_task = NULL;
217 wake_up(&journal->j_wait_done_commit); 216 wake_up(&journal->j_wait_done_commit);
@@ -234,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal)
234 233
235static void journal_kill_thread(journal_t *journal) 234static void journal_kill_thread(journal_t *journal)
236{ 235{
237 spin_lock(&journal->j_state_lock); 236 write_lock(&journal->j_state_lock);
238 journal->j_flags |= JBD2_UNMOUNT; 237 journal->j_flags |= JBD2_UNMOUNT;
239 238
240 while (journal->j_task) { 239 while (journal->j_task) {
241 wake_up(&journal->j_wait_commit); 240 wake_up(&journal->j_wait_commit);
242 spin_unlock(&journal->j_state_lock); 241 write_unlock(&journal->j_state_lock);
243 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 242 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
244 spin_lock(&journal->j_state_lock); 243 write_lock(&journal->j_state_lock);
245 } 244 }
246 spin_unlock(&journal->j_state_lock); 245 write_unlock(&journal->j_state_lock);
247} 246}
248 247
249/* 248/*
@@ -297,7 +296,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
297 struct page *new_page; 296 struct page *new_page;
298 unsigned int new_offset; 297 unsigned int new_offset;
299 struct buffer_head *bh_in = jh2bh(jh_in); 298 struct buffer_head *bh_in = jh2bh(jh_in);
300 struct jbd2_buffer_trigger_type *triggers;
301 journal_t *journal = transaction->t_journal; 299 journal_t *journal = transaction->t_journal;
302 300
303 /* 301 /*
@@ -311,7 +309,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
311 */ 309 */
312 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 310 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
313 311
314 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 312retry_alloc:
313 new_bh = alloc_buffer_head(GFP_NOFS);
314 if (!new_bh) {
315 /*
316 * Failure is not an option, but __GFP_NOFAIL is going
317 * away; so we retry ourselves here.
318 */
319 congestion_wait(BLK_RW_ASYNC, HZ/50);
320 goto retry_alloc;
321 }
322
315 /* keep subsequent assertions sane */ 323 /* keep subsequent assertions sane */
316 new_bh->b_state = 0; 324 new_bh->b_state = 0;
317 init_buffer(new_bh, NULL, NULL); 325 init_buffer(new_bh, NULL, NULL);
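
This hunk open-codes what __GFP_NOFAIL used to provide: with the flag on its way out, the caller retries the allocation itself, sleeping in congestion_wait(BLK_RW_ASYNC, HZ/50) (about 20 ms) between attempts so reclaim can make progress; the new #include <linux/backing-dev.h> at the top of this file is there for congestion_wait(). The generic form of the idiom:

#include <linux/buffer_head.h>
#include <linux/backing-dev.h>

/* Sketch: loop until the allocation succeeds, backing off ~20 ms per
 * attempt. GFP_NOFS avoids re-entering the filesystem from reclaim. */
static struct buffer_head *example_alloc_bh_nofail(void)
{
	struct buffer_head *bh;

	while (!(bh = alloc_buffer_head(GFP_NOFS)))
		congestion_wait(BLK_RW_ASYNC, HZ / 50);
	return bh;
}
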
@@ -328,21 +336,21 @@ repeat:
328 done_copy_out = 1; 336 done_copy_out = 1;
329 new_page = virt_to_page(jh_in->b_frozen_data); 337 new_page = virt_to_page(jh_in->b_frozen_data);
330 new_offset = offset_in_page(jh_in->b_frozen_data); 338 new_offset = offset_in_page(jh_in->b_frozen_data);
331 triggers = jh_in->b_frozen_triggers;
332 } else { 339 } else {
333 new_page = jh2bh(jh_in)->b_page; 340 new_page = jh2bh(jh_in)->b_page;
334 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 341 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
335 triggers = jh_in->b_triggers;
336 } 342 }
337 343
338 mapped_data = kmap_atomic(new_page, KM_USER0); 344 mapped_data = kmap_atomic(new_page, KM_USER0);
339 /* 345 /*
340 * Fire any commit trigger. Do this before checking for escaping, 346 * Fire data frozen trigger if data already wasn't frozen. Do this
341 * as the trigger may modify the magic offset. If a copy-out 347 * before checking for escaping, as the trigger may modify the magic
342 * happens afterwards, it will have the correct data in the buffer. 348 * offset. If a copy-out happens afterwards, it will have the correct
349 * data in the buffer.
343 */ 350 */
344 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, 351 if (!done_copy_out)
345 triggers); 352 jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
353 jh_in->b_triggers);
346 354
347 /* 355 /*
348 * Check for escaping 356 * Check for escaping
@@ -443,7 +451,7 @@ int __jbd2_log_space_left(journal_t *journal)
443{ 451{
444 int left = journal->j_free; 452 int left = journal->j_free;
445 453
446 assert_spin_locked(&journal->j_state_lock); 454 /* assert_spin_locked(&journal->j_state_lock); */
447 455
448 /* 456 /*
449 * Be pessimistic here about the number of those free blocks which 457 * Be pessimistic here about the number of those free blocks which
@@ -488,9 +496,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
488{ 496{
489 int ret; 497 int ret;
490 498
491 spin_lock(&journal->j_state_lock); 499 write_lock(&journal->j_state_lock);
492 ret = __jbd2_log_start_commit(journal, tid); 500 ret = __jbd2_log_start_commit(journal, tid);
493 spin_unlock(&journal->j_state_lock); 501 write_unlock(&journal->j_state_lock);
494 return ret; 502 return ret;
495} 503}
496 504
@@ -509,7 +517,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
509 transaction_t *transaction = NULL; 517 transaction_t *transaction = NULL;
510 tid_t tid; 518 tid_t tid;
511 519
512 spin_lock(&journal->j_state_lock); 520 read_lock(&journal->j_state_lock);
513 if (journal->j_running_transaction && !current->journal_info) { 521 if (journal->j_running_transaction && !current->journal_info) {
514 transaction = journal->j_running_transaction; 522 transaction = journal->j_running_transaction;
515 __jbd2_log_start_commit(journal, transaction->t_tid); 523 __jbd2_log_start_commit(journal, transaction->t_tid);
@@ -517,12 +525,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
517 transaction = journal->j_committing_transaction; 525 transaction = journal->j_committing_transaction;
518 526
519 if (!transaction) { 527 if (!transaction) {
520 spin_unlock(&journal->j_state_lock); 528 read_unlock(&journal->j_state_lock);
521 return 0; /* Nothing to retry */ 529 return 0; /* Nothing to retry */
522 } 530 }
523 531
524 tid = transaction->t_tid; 532 tid = transaction->t_tid;
525 spin_unlock(&journal->j_state_lock); 533 read_unlock(&journal->j_state_lock);
526 jbd2_log_wait_commit(journal, tid); 534 jbd2_log_wait_commit(journal, tid);
527 return 1; 535 return 1;
528} 536}
@@ -536,7 +544,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
536{ 544{
537 int ret = 0; 545 int ret = 0;
538 546
539 spin_lock(&journal->j_state_lock); 547 write_lock(&journal->j_state_lock);
540 if (journal->j_running_transaction) { 548 if (journal->j_running_transaction) {
541 tid_t tid = journal->j_running_transaction->t_tid; 549 tid_t tid = journal->j_running_transaction->t_tid;
542 550
@@ -555,7 +563,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
555 *ptid = journal->j_committing_transaction->t_tid; 563 *ptid = journal->j_committing_transaction->t_tid;
556 ret = 1; 564 ret = 1;
557 } 565 }
558 spin_unlock(&journal->j_state_lock); 566 write_unlock(&journal->j_state_lock);
559 return ret; 567 return ret;
560} 568}
561 569
@@ -567,26 +575,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
567{ 575{
568 int err = 0; 576 int err = 0;
569 577
578 read_lock(&journal->j_state_lock);
570#ifdef CONFIG_JBD2_DEBUG 579#ifdef CONFIG_JBD2_DEBUG
571 spin_lock(&journal->j_state_lock);
572 if (!tid_geq(journal->j_commit_request, tid)) { 580 if (!tid_geq(journal->j_commit_request, tid)) {
573 printk(KERN_EMERG 581 printk(KERN_EMERG
574 "%s: error: j_commit_request=%d, tid=%d\n", 582 "%s: error: j_commit_request=%d, tid=%d\n",
575 __func__, journal->j_commit_request, tid); 583 __func__, journal->j_commit_request, tid);
576 } 584 }
577 spin_unlock(&journal->j_state_lock);
578#endif 585#endif
579 spin_lock(&journal->j_state_lock);
580 while (tid_gt(tid, journal->j_commit_sequence)) { 586 while (tid_gt(tid, journal->j_commit_sequence)) {
581 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 587 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
582 tid, journal->j_commit_sequence); 588 tid, journal->j_commit_sequence);
583 wake_up(&journal->j_wait_commit); 589 wake_up(&journal->j_wait_commit);
584 spin_unlock(&journal->j_state_lock); 590 read_unlock(&journal->j_state_lock);
585 wait_event(journal->j_wait_done_commit, 591 wait_event(journal->j_wait_done_commit,
586 !tid_gt(tid, journal->j_commit_sequence)); 592 !tid_gt(tid, journal->j_commit_sequence));
587 spin_lock(&journal->j_state_lock); 593 read_lock(&journal->j_state_lock);
588 } 594 }
589 spin_unlock(&journal->j_state_lock); 595 read_unlock(&journal->j_state_lock);
590 596
591 if (unlikely(is_journal_aborted(journal))) { 597 if (unlikely(is_journal_aborted(journal))) {
592 printk(KERN_EMERG "journal commit I/O error\n"); 598 printk(KERN_EMERG "journal commit I/O error\n");
@@ -603,7 +609,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
603{ 609{
604 unsigned long blocknr; 610 unsigned long blocknr;
605 611
606 spin_lock(&journal->j_state_lock); 612 write_lock(&journal->j_state_lock);
607 J_ASSERT(journal->j_free > 1); 613 J_ASSERT(journal->j_free > 1);
608 614
609 blocknr = journal->j_head; 615 blocknr = journal->j_head;
@@ -611,7 +617,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
611 journal->j_free--; 617 journal->j_free--;
612 if (journal->j_head == journal->j_last) 618 if (journal->j_head == journal->j_last)
613 journal->j_head = journal->j_first; 619 journal->j_head = journal->j_first;
614 spin_unlock(&journal->j_state_lock); 620 write_unlock(&journal->j_state_lock);
615 return jbd2_journal_bmap(journal, blocknr, retp); 621 return jbd2_journal_bmap(journal, blocknr, retp);
616} 622}
617 623
@@ -831,7 +837,7 @@ static journal_t * journal_init_common (void)
831 mutex_init(&journal->j_checkpoint_mutex); 837 mutex_init(&journal->j_checkpoint_mutex);
832 spin_lock_init(&journal->j_revoke_lock); 838 spin_lock_init(&journal->j_revoke_lock);
833 spin_lock_init(&journal->j_list_lock); 839 spin_lock_init(&journal->j_list_lock);
834 spin_lock_init(&journal->j_state_lock); 840 rwlock_init(&journal->j_state_lock);
835 841
836 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 842 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
837 journal->j_min_batch_time = 0; 843 journal->j_min_batch_time = 0;
@@ -1097,14 +1103,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1097 set_buffer_uptodate(bh); 1103 set_buffer_uptodate(bh);
1098 } 1104 }
1099 1105
1100 spin_lock(&journal->j_state_lock); 1106 read_lock(&journal->j_state_lock);
1101 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1107 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
1102 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1108 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
1103 1109
1104 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1110 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1105 sb->s_start = cpu_to_be32(journal->j_tail); 1111 sb->s_start = cpu_to_be32(journal->j_tail);
1106 sb->s_errno = cpu_to_be32(journal->j_errno); 1112 sb->s_errno = cpu_to_be32(journal->j_errno);
1107 spin_unlock(&journal->j_state_lock); 1113 read_unlock(&journal->j_state_lock);
1108 1114
1109 BUFFER_TRACE(bh, "marking dirty"); 1115 BUFFER_TRACE(bh, "marking dirty");
1110 mark_buffer_dirty(bh); 1116 mark_buffer_dirty(bh);
@@ -1118,19 +1124,19 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1118 set_buffer_uptodate(bh); 1124 set_buffer_uptodate(bh);
1119 } 1125 }
1120 } else 1126 } else
1121 ll_rw_block(SWRITE, 1, &bh); 1127 write_dirty_buffer(bh, WRITE);
1122 1128
1123out: 1129out:
1124 /* If we have just flushed the log (by marking s_start==0), then 1130 /* If we have just flushed the log (by marking s_start==0), then
1125 * any future commit will have to be careful to update the 1131 * any future commit will have to be careful to update the
1126 * superblock again to re-record the true start of the log. */ 1132 * superblock again to re-record the true start of the log. */
1127 1133
1128 spin_lock(&journal->j_state_lock); 1134 write_lock(&journal->j_state_lock);
1129 if (sb->s_start) 1135 if (sb->s_start)
1130 journal->j_flags &= ~JBD2_FLUSHED; 1136 journal->j_flags &= ~JBD2_FLUSHED;
1131 else 1137 else
1132 journal->j_flags |= JBD2_FLUSHED; 1138 journal->j_flags |= JBD2_FLUSHED;
1133 spin_unlock(&journal->j_state_lock); 1139 write_unlock(&journal->j_state_lock);
1134} 1140}
1135 1141
1136/* 1142/*
@@ -1392,13 +1398,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1392int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, 1398int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1393 unsigned long ro, unsigned long incompat) 1399 unsigned long ro, unsigned long incompat)
1394{ 1400{
1395 journal_superblock_t *sb;
1396
1397 if (!compat && !ro && !incompat) 1401 if (!compat && !ro && !incompat)
1398 return 1; 1402 return 1;
1399 1403
1400 sb = journal->j_superblock;
1401
1402 /* We can support any known requested features iff the 1404 /* We can support any known requested features iff the
1403 * superblock is in version 2. Otherwise we fail to support any 1405 * superblock is in version 2. Otherwise we fail to support any
1404 * extended sb features. */ 1406 * extended sb features. */
@@ -1546,7 +1548,7 @@ int jbd2_journal_flush(journal_t *journal)
1546 transaction_t *transaction = NULL; 1548 transaction_t *transaction = NULL;
1547 unsigned long old_tail; 1549 unsigned long old_tail;
1548 1550
1549 spin_lock(&journal->j_state_lock); 1551 write_lock(&journal->j_state_lock);
1550 1552
1551 /* Force everything buffered to the log... */ 1553 /* Force everything buffered to the log... */
1552 if (journal->j_running_transaction) { 1554 if (journal->j_running_transaction) {
@@ -1559,10 +1561,10 @@ int jbd2_journal_flush(journal_t *journal)
1559 if (transaction) { 1561 if (transaction) {
1560 tid_t tid = transaction->t_tid; 1562 tid_t tid = transaction->t_tid;
1561 1563
1562 spin_unlock(&journal->j_state_lock); 1564 write_unlock(&journal->j_state_lock);
1563 jbd2_log_wait_commit(journal, tid); 1565 jbd2_log_wait_commit(journal, tid);
1564 } else { 1566 } else {
1565 spin_unlock(&journal->j_state_lock); 1567 write_unlock(&journal->j_state_lock);
1566 } 1568 }
1567 1569
1568 /* ...and flush everything in the log out to disk. */ 1570 /* ...and flush everything in the log out to disk. */
@@ -1586,12 +1588,12 @@ int jbd2_journal_flush(journal_t *journal)
1586 * the magic code for a fully-recovered superblock. Any future 1588 * the magic code for a fully-recovered superblock. Any future
1587 * commits of data to the journal will restore the current 1589 * commits of data to the journal will restore the current
1588 * s_start value. */ 1590 * s_start value. */
1589 spin_lock(&journal->j_state_lock); 1591 write_lock(&journal->j_state_lock);
1590 old_tail = journal->j_tail; 1592 old_tail = journal->j_tail;
1591 journal->j_tail = 0; 1593 journal->j_tail = 0;
1592 spin_unlock(&journal->j_state_lock); 1594 write_unlock(&journal->j_state_lock);
1593 jbd2_journal_update_superblock(journal, 1); 1595 jbd2_journal_update_superblock(journal, 1);
1594 spin_lock(&journal->j_state_lock); 1596 write_lock(&journal->j_state_lock);
1595 journal->j_tail = old_tail; 1597 journal->j_tail = old_tail;
1596 1598
1597 J_ASSERT(!journal->j_running_transaction); 1599 J_ASSERT(!journal->j_running_transaction);
@@ -1599,7 +1601,7 @@ int jbd2_journal_flush(journal_t *journal)
1599 J_ASSERT(!journal->j_checkpoint_transactions); 1601 J_ASSERT(!journal->j_checkpoint_transactions);
1600 J_ASSERT(journal->j_head == journal->j_tail); 1602 J_ASSERT(journal->j_head == journal->j_tail);
1601 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1603 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1602 spin_unlock(&journal->j_state_lock); 1604 write_unlock(&journal->j_state_lock);
1603 return 0; 1605 return 0;
1604} 1606}
1605 1607
@@ -1618,7 +1620,6 @@ int jbd2_journal_flush(journal_t *journal)
1618 1620
1619int jbd2_journal_wipe(journal_t *journal, int write) 1621int jbd2_journal_wipe(journal_t *journal, int write)
1620{ 1622{
1621 journal_superblock_t *sb;
1622 int err = 0; 1623 int err = 0;
1623 1624
1624 J_ASSERT (!(journal->j_flags & JBD2_LOADED)); 1625 J_ASSERT (!(journal->j_flags & JBD2_LOADED));
@@ -1627,8 +1628,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1627 if (err) 1628 if (err)
1628 return err; 1629 return err;
1629 1630
1630 sb = journal->j_superblock;
1631
1632 if (!journal->j_tail) 1631 if (!journal->j_tail)
1633 goto no_recovery; 1632 goto no_recovery;
1634 1633
@@ -1666,12 +1665,12 @@ void __jbd2_journal_abort_hard(journal_t *journal)
1666 printk(KERN_ERR "Aborting journal on device %s.\n", 1665 printk(KERN_ERR "Aborting journal on device %s.\n",
1667 journal->j_devname); 1666 journal->j_devname);
1668 1667
1669 spin_lock(&journal->j_state_lock); 1668 write_lock(&journal->j_state_lock);
1670 journal->j_flags |= JBD2_ABORT; 1669 journal->j_flags |= JBD2_ABORT;
1671 transaction = journal->j_running_transaction; 1670 transaction = journal->j_running_transaction;
1672 if (transaction) 1671 if (transaction)
1673 __jbd2_log_start_commit(journal, transaction->t_tid); 1672 __jbd2_log_start_commit(journal, transaction->t_tid);
1674 spin_unlock(&journal->j_state_lock); 1673 write_unlock(&journal->j_state_lock);
1675} 1674}
1676 1675
1677/* Soft abort: record the abort error status in the journal superblock, 1676/* Soft abort: record the abort error status in the journal superblock,
@@ -1756,12 +1755,12 @@ int jbd2_journal_errno(journal_t *journal)
1756{ 1755{
1757 int err; 1756 int err;
1758 1757
1759 spin_lock(&journal->j_state_lock); 1758 read_lock(&journal->j_state_lock);
1760 if (journal->j_flags & JBD2_ABORT) 1759 if (journal->j_flags & JBD2_ABORT)
1761 err = -EROFS; 1760 err = -EROFS;
1762 else 1761 else
1763 err = journal->j_errno; 1762 err = journal->j_errno;
1764 spin_unlock(&journal->j_state_lock); 1763 read_unlock(&journal->j_state_lock);
1765 return err; 1764 return err;
1766} 1765}
1767 1766
@@ -1776,12 +1775,12 @@ int jbd2_journal_clear_err(journal_t *journal)
1776{ 1775{
1777 int err = 0; 1776 int err = 0;
1778 1777
1779 spin_lock(&journal->j_state_lock); 1778 write_lock(&journal->j_state_lock);
1780 if (journal->j_flags & JBD2_ABORT) 1779 if (journal->j_flags & JBD2_ABORT)
1781 err = -EROFS; 1780 err = -EROFS;
1782 else 1781 else
1783 journal->j_errno = 0; 1782 journal->j_errno = 0;
1784 spin_unlock(&journal->j_state_lock); 1783 write_unlock(&journal->j_state_lock);
1785 return err; 1784 return err;
1786} 1785}
1787 1786
@@ -1794,10 +1793,10 @@ int jbd2_journal_clear_err(journal_t *journal)
1794 */ 1793 */
1795void jbd2_journal_ack_err(journal_t *journal) 1794void jbd2_journal_ack_err(journal_t *journal)
1796{ 1795{
1797 spin_lock(&journal->j_state_lock); 1796 write_lock(&journal->j_state_lock);
1798 if (journal->j_errno) 1797 if (journal->j_errno)
1799 journal->j_flags |= JBD2_ACK_ERR; 1798 journal->j_flags |= JBD2_ACK_ERR;
1800 spin_unlock(&journal->j_state_lock); 1799 write_unlock(&journal->j_state_lock);
1801} 1800}
1802 1801
1803int jbd2_journal_blocks_per_page(struct inode *inode) 1802int jbd2_journal_blocks_per_page(struct inode *inode)
@@ -1889,7 +1888,7 @@ static struct kmem_cache *get_slab(size_t size)
1889 BUG_ON(i >= JBD2_MAX_SLABS); 1888 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0)) 1889 if (unlikely(i < 0))
1891 i = 0; 1890 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0); 1891 BUG_ON(jbd2_slab[i] == NULL);
1893 return jbd2_slab[i]; 1892 return jbd2_slab[i];
1894} 1893}
1895 1894
@@ -2202,8 +2201,6 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2202void jbd2_journal_release_jbd_inode(journal_t *journal, 2201void jbd2_journal_release_jbd_inode(journal_t *journal,
2203 struct jbd2_inode *jinode) 2202 struct jbd2_inode *jinode)
2204{ 2203{
2205 int writeout = 0;
2206
2207 if (!journal) 2204 if (!journal)
2208 return; 2205 return;
2209restart: 2206restart:
@@ -2220,9 +2217,6 @@ restart:
2220 goto restart; 2217 goto restart;
2221 } 2218 }
2222 2219
2223 /* Do we need to wait for data writeback? */
2224 if (journal->j_committing_transaction == jinode->i_transaction)
2225 writeout = 1;
2226 if (jinode->i_transaction) { 2220 if (jinode->i_transaction) {
2227 list_del(&jinode->i_list); 2221 list_del(&jinode->i_list);
2228 jinode->i_transaction = NULL; 2222 jinode->i_transaction = NULL;
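
The fs/jbd2/journal.c hunks above convert j_state_lock from a spinlock_t to an rwlock_t: paths that only sample journal state (jbd2_journal_errno(), the commit-wait loop in jbd2_log_wait_commit(), the superblock snapshot in jbd2_journal_update_superblock()) now take the lock shared with read_lock(), while paths that mutate state (jbd2_journal_next_log_block(), jbd2_journal_flush(), __jbd2_journal_abort_hard()) take it exclusive with write_lock(). Below is a minimal, runnable userspace sketch of the same reader/writer split, using a POSIX rwlock as a stand-in for the kernel's rwlock_t; the struct and field names are illustrative, not the kernel's.

#include <pthread.h>
#include <stdio.h>

struct journal_state {
	pthread_rwlock_t state_lock;	/* stands in for j_state_lock */
	int errno_;			/* stands in for j_errno */
	int flags;			/* stands in for j_flags */
};

#define ABORT_FLAG 0x1

/* Reader: many may run concurrently (cf. jbd2_journal_errno). */
static int journal_errno(struct journal_state *j)
{
	int err;

	pthread_rwlock_rdlock(&j->state_lock);
	err = (j->flags & ABORT_FLAG) ? -1 : j->errno_;
	pthread_rwlock_unlock(&j->state_lock);
	return err;
}

/* Writer: excludes readers and other writers (cf. jbd2_journal_clear_err). */
static void journal_clear_err(struct journal_state *j)
{
	pthread_rwlock_wrlock(&j->state_lock);
	if (!(j->flags & ABORT_FLAG))
		j->errno_ = 0;
	pthread_rwlock_unlock(&j->state_lock);
}

int main(void)
{
	struct journal_state j = { .errno_ = 5 };

	pthread_rwlock_init(&j.state_lock, NULL);
	journal_clear_err(&j);
	printf("errno after clear: %d\n", journal_errno(&j));
	pthread_rwlock_destroy(&j.state_lock);
	return 0;
}
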
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 049281b7cb89..2bc4d5f116f1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal)
285int jbd2_journal_skip_recovery(journal_t *journal) 285int jbd2_journal_skip_recovery(journal_t *journal)
286{ 286{
287 int err; 287 int err;
288 journal_superblock_t * sb;
289 288
290 struct recovery_info info; 289 struct recovery_info info;
291 290
292 memset (&info, 0, sizeof(info)); 291 memset (&info, 0, sizeof(info));
293 sb = journal->j_superblock;
294 292
295 err = do_one_pass(journal, &info, PASS_SCAN); 293 err = do_one_pass(journal, &info, PASS_SCAN);
296 294
@@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299 ++journal->j_transaction_sequence; 297 ++journal->j_transaction_sequence;
300 } else { 298 } else {
301#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
302 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence);
303#endif 302#endif
304 jbd_debug(1, 303 jbd_debug(1,
305 "JBD: ignoring %d transaction%s from the journal.\n", 304 "JBD: ignoring %d transaction%s from the journal.\n",
@@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal,
365 int tag_bytes = journal_tag_bytes(journal); 364 int tag_bytes = journal_tag_bytes(journal);
366 __u32 crc32_sum = ~0; /* Transactional Checksums */ 365 __u32 crc32_sum = ~0; /* Transactional Checksums */
367 366
368 /* Precompute the maximum metadata descriptors in a descriptor block */
369 int MAX_BLOCKS_PER_DESC;
370 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
371 / tag_bytes);
372
373 /* 367 /*
374 * First thing is to establish what we expect to find in the log 368 * First thing is to establish what we expect to find in the log
375 * (in terms of transaction IDs), and where (in terms of log 369 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e3..9ad321fd63fd 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
625 set_buffer_jwrite(bh); 625 set_buffer_jwrite(bh);
626 BUFFER_TRACE(bh, "write"); 626 BUFFER_TRACE(bh, "write");
627 set_buffer_dirty(bh); 627 set_buffer_dirty(bh);
628 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); 628 write_dirty_buffer(bh, write_op);
629} 629}
630#endif 630#endif
631 631
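
The revoke.c change above is part of the 2.6.36 removal of ll_rw_block()'s SWRITE/SWRITE_SYNC_PLUG special cases: flush_descriptor() now calls write_dirty_buffer(bh, write_op) directly. The sketch below paraphrases what the 2.6.36-era helper does (based on fs/buffer.c of that release; treat it as an approximation, not the authoritative source): take the buffer lock, atomically test-and-clear the dirty bit so a buffer someone else already wrote back is not re-submitted, then submit the I/O.

/* Approximate body of the 2.6.36 helper; kernel-internal code,
 * shown for illustration only. */
void write_dirty_buffer(struct buffer_head *bh, int rw)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		/* Already written back by someone else: nothing to submit. */
		unlock_buffer(bh);
		return;
	}
	/* end_buffer_write_sync() unlocks the buffer and drops this ref. */
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(rw, bh);
}
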
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..f3479d6e0a83 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -26,6 +26,8 @@
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/backing-dev.h>
30#include <linux/module.h>
29 31
30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 32static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
31 33
@@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
53 transaction->t_tid = journal->j_transaction_sequence++; 55 transaction->t_tid = journal->j_transaction_sequence++;
54 transaction->t_expires = jiffies + journal->j_commit_interval; 56 transaction->t_expires = jiffies + journal->j_commit_interval;
55 spin_lock_init(&transaction->t_handle_lock); 57 spin_lock_init(&transaction->t_handle_lock);
58 atomic_set(&transaction->t_updates, 0);
59 atomic_set(&transaction->t_outstanding_credits, 0);
60 atomic_set(&transaction->t_handle_count, 0);
56 INIT_LIST_HEAD(&transaction->t_inode_list); 61 INIT_LIST_HEAD(&transaction->t_inode_list);
57 INIT_LIST_HEAD(&transaction->t_private_list); 62 INIT_LIST_HEAD(&transaction->t_private_list);
58 63
@@ -77,71 +82,106 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
77 */ 82 */
78 83
79/* 84/*
85 * Update transaction's maximum wait time, if debugging is enabled.
86 *
87 * In order for t_max_wait to be reliable, it must be protected by a
88 * lock. But doing so will mean that start_this_handle() can not be
89 * run in parallel on SMP systems, which limits our scalability. So
90 * unless debugging is enabled, we no longer update t_max_wait, which
91 * means that maximum wait time reported by the jbd2_run_stats
92 * tracepoint will always be zero.
93 */
94static inline void update_t_max_wait(transaction_t *transaction)
95{
96#ifdef CONFIG_JBD2_DEBUG
97 unsigned long ts = jiffies;
98
99 if (jbd2_journal_enable_debug &&
100 time_after(transaction->t_start, ts)) {
101 ts = jbd2_time_diff(ts, transaction->t_start);
102 spin_lock(&transaction->t_handle_lock);
103 if (ts > transaction->t_max_wait)
104 transaction->t_max_wait = ts;
105 spin_unlock(&transaction->t_handle_lock);
106 }
107#endif
108}
109
110/*
80 * start_this_handle: Given a handle, deal with any locking or stalling 111 * start_this_handle: Given a handle, deal with any locking or stalling
81 * needed to make sure that there is enough journal space for the handle 112 * needed to make sure that there is enough journal space for the handle
82 * to begin. Attach the handle to a transaction and set up the 113 * to begin. Attach the handle to a transaction and set up the
83 * transaction's buffer credits. 114 * transaction's buffer credits.
84 */ 115 */
85 116
86static int start_this_handle(journal_t *journal, handle_t *handle) 117static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask)
87{ 119{
88 transaction_t *transaction; 120 transaction_t *transaction;
89 int needed; 121 int needed;
90 int nblocks = handle->h_buffer_credits; 122 int nblocks = handle->h_buffer_credits;
91 transaction_t *new_transaction = NULL; 123 transaction_t *new_transaction = NULL;
92 int ret = 0;
93 unsigned long ts = jiffies;
94 124
95 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
96 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
97 current->comm, nblocks, 127 current->comm, nblocks,
98 journal->j_max_transaction_buffers); 128 journal->j_max_transaction_buffers);
99 ret = -ENOSPC; 129 return -ENOSPC;
100 goto out;
101 } 130 }
102 131
103alloc_transaction: 132alloc_transaction:
104 if (!journal->j_running_transaction) { 133 if (!journal->j_running_transaction) {
105 new_transaction = kzalloc(sizeof(*new_transaction), 134 new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
106 GFP_NOFS|__GFP_NOFAIL);
107 if (!new_transaction) { 135 if (!new_transaction) {
108 ret = -ENOMEM; 136 /*
109 goto out; 137 * If __GFP_FS is not present, then we may be
138 * being called from inside the fs writeback
139 * layer, so we MUST NOT fail. Since
140 * __GFP_NOFAIL is going away, we will arrange
141 * to retry the allocation ourselves.
142 */
143 if ((gfp_mask & __GFP_FS) == 0) {
144 congestion_wait(BLK_RW_ASYNC, HZ/50);
145 goto alloc_transaction;
146 }
147 return -ENOMEM;
110 } 148 }
111 } 149 }
112 150
113 jbd_debug(3, "New handle %p going live.\n", handle); 151 jbd_debug(3, "New handle %p going live.\n", handle);
114 152
115repeat:
116
117 /* 153 /*
118 * We need to hold j_state_lock until t_updates has been incremented, 154 * We need to hold j_state_lock until t_updates has been incremented,
119 * for proper journal barrier handling 155 * for proper journal barrier handling
120 */ 156 */
121 spin_lock(&journal->j_state_lock); 157repeat:
122repeat_locked: 158 read_lock(&journal->j_state_lock);
123 if (is_journal_aborted(journal) || 159 if (is_journal_aborted(journal) ||
124 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 160 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
125 spin_unlock(&journal->j_state_lock); 161 read_unlock(&journal->j_state_lock);
126 ret = -EROFS; 162 kfree(new_transaction);
127 goto out; 163 return -EROFS;
128 } 164 }
129 165
130 /* Wait on the journal's transaction barrier if necessary */ 166 /* Wait on the journal's transaction barrier if necessary */
131 if (journal->j_barrier_count) { 167 if (journal->j_barrier_count) {
132 spin_unlock(&journal->j_state_lock); 168 read_unlock(&journal->j_state_lock);
133 wait_event(journal->j_wait_transaction_locked, 169 wait_event(journal->j_wait_transaction_locked,
134 journal->j_barrier_count == 0); 170 journal->j_barrier_count == 0);
135 goto repeat; 171 goto repeat;
136 } 172 }
137 173
138 if (!journal->j_running_transaction) { 174 if (!journal->j_running_transaction) {
139 if (!new_transaction) { 175 read_unlock(&journal->j_state_lock);
140 spin_unlock(&journal->j_state_lock); 176 if (!new_transaction)
141 goto alloc_transaction; 177 goto alloc_transaction;
178 write_lock(&journal->j_state_lock);
179 if (!journal->j_running_transaction) {
180 jbd2_get_transaction(journal, new_transaction);
181 new_transaction = NULL;
142 } 182 }
143 jbd2_get_transaction(journal, new_transaction); 183 write_unlock(&journal->j_state_lock);
144 new_transaction = NULL; 184 goto repeat;
145 } 185 }
146 186
147 transaction = journal->j_running_transaction; 187 transaction = journal->j_running_transaction;
@@ -155,7 +195,7 @@ repeat_locked:
155 195
156 prepare_to_wait(&journal->j_wait_transaction_locked, 196 prepare_to_wait(&journal->j_wait_transaction_locked,
157 &wait, TASK_UNINTERRUPTIBLE); 197 &wait, TASK_UNINTERRUPTIBLE);
158 spin_unlock(&journal->j_state_lock); 198 read_unlock(&journal->j_state_lock);
159 schedule(); 199 schedule();
160 finish_wait(&journal->j_wait_transaction_locked, &wait); 200 finish_wait(&journal->j_wait_transaction_locked, &wait);
161 goto repeat; 201 goto repeat;
@@ -166,8 +206,8 @@ repeat_locked:
166 * buffers requested by this operation, we need to stall pending a log 206 * buffers requested by this operation, we need to stall pending a log
167 * checkpoint to free some more log space. 207 * checkpoint to free some more log space.
168 */ 208 */
169 spin_lock(&transaction->t_handle_lock); 209 needed = atomic_add_return(nblocks,
170 needed = transaction->t_outstanding_credits + nblocks; 210 &transaction->t_outstanding_credits);
171 211
172 if (needed > journal->j_max_transaction_buffers) { 212 if (needed > journal->j_max_transaction_buffers) {
173 /* 213 /*
@@ -178,11 +218,11 @@ repeat_locked:
178 DEFINE_WAIT(wait); 218 DEFINE_WAIT(wait);
179 219
180 jbd_debug(2, "Handle %p starting new commit...\n", handle); 220 jbd_debug(2, "Handle %p starting new commit...\n", handle);
181 spin_unlock(&transaction->t_handle_lock); 221 atomic_sub(nblocks, &transaction->t_outstanding_credits);
182 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 222 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
183 TASK_UNINTERRUPTIBLE); 223 TASK_UNINTERRUPTIBLE);
184 __jbd2_log_start_commit(journal, transaction->t_tid); 224 __jbd2_log_start_commit(journal, transaction->t_tid);
185 spin_unlock(&journal->j_state_lock); 225 read_unlock(&journal->j_state_lock);
186 schedule(); 226 schedule();
187 finish_wait(&journal->j_wait_transaction_locked, &wait); 227 finish_wait(&journal->j_wait_transaction_locked, &wait);
188 goto repeat; 228 goto repeat;
@@ -215,35 +255,31 @@ repeat_locked:
215 */ 255 */
216 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 256 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
217 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); 257 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
218 spin_unlock(&transaction->t_handle_lock); 258 atomic_sub(nblocks, &transaction->t_outstanding_credits);
219 __jbd2_log_wait_for_space(journal); 259 read_unlock(&journal->j_state_lock);
220 goto repeat_locked; 260 write_lock(&journal->j_state_lock);
261 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
262 __jbd2_log_wait_for_space(journal);
263 write_unlock(&journal->j_state_lock);
264 goto repeat;
221 } 265 }
222 266
223 /* OK, account for the buffers that this operation expects to 267 /* OK, account for the buffers that this operation expects to
224 * use and add the handle to the running transaction. */ 268 * use and add the handle to the running transaction.
225 269 */
226 if (time_after(transaction->t_start, ts)) { 270 update_t_max_wait(transaction);
227 ts = jbd2_time_diff(ts, transaction->t_start);
228 if (ts > transaction->t_max_wait)
229 transaction->t_max_wait = ts;
230 }
231
232 handle->h_transaction = transaction; 271 handle->h_transaction = transaction;
233 transaction->t_outstanding_credits += nblocks; 272 atomic_inc(&transaction->t_updates);
234 transaction->t_updates++; 273 atomic_inc(&transaction->t_handle_count);
235 transaction->t_handle_count++;
236 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", 274 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
237 handle, nblocks, transaction->t_outstanding_credits, 275 handle, nblocks,
276 atomic_read(&transaction->t_outstanding_credits),
238 __jbd2_log_space_left(journal)); 277 __jbd2_log_space_left(journal));
239 spin_unlock(&transaction->t_handle_lock); 278 read_unlock(&journal->j_state_lock);
240 spin_unlock(&journal->j_state_lock);
241 279
242 lock_map_acquire(&handle->h_lockdep_map); 280 lock_map_acquire(&handle->h_lockdep_map);
243out: 281 kfree(new_transaction);
244 if (unlikely(new_transaction)) /* It's usually NULL */ 282 return 0;
245 kfree(new_transaction);
246 return ret;
247} 283}
248 284
249static struct lock_class_key jbd2_handle_key; 285static struct lock_class_key jbd2_handle_key;
@@ -278,7 +314,7 @@ static handle_t *new_handle(int nblocks)
278 * 314 *
279 * Return a pointer to a newly allocated handle, or NULL on failure 315 * Return a pointer to a newly allocated handle, or NULL on failure
280 */ 316 */
281handle_t *jbd2_journal_start(journal_t *journal, int nblocks) 317handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
282{ 318{
283 handle_t *handle = journal_current_handle(); 319 handle_t *handle = journal_current_handle();
284 int err; 320 int err;
@@ -298,7 +334,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
298 334
299 current->journal_info = handle; 335 current->journal_info = handle;
300 336
301 err = start_this_handle(journal, handle); 337 err = start_this_handle(journal, handle, gfp_mask);
302 if (err < 0) { 338 if (err < 0) {
303 jbd2_free_handle(handle); 339 jbd2_free_handle(handle);
304 current->journal_info = NULL; 340 current->journal_info = NULL;
@@ -308,6 +344,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
308out: 344out:
309 return handle; 345 return handle;
310} 346}
347EXPORT_SYMBOL(jbd2__journal_start);
348
349
350handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
351{
352 return jbd2__journal_start(journal, nblocks, GFP_NOFS);
353}
354EXPORT_SYMBOL(jbd2_journal_start);
355
311 356
312/** 357/**
313 * int jbd2_journal_extend() - extend buffer credits. 358 * int jbd2_journal_extend() - extend buffer credits.
@@ -342,7 +387,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
342 387
343 result = 1; 388 result = 1;
344 389
345 spin_lock(&journal->j_state_lock); 390 read_lock(&journal->j_state_lock);
346 391
347 /* Don't extend a locked-down transaction! */ 392 /* Don't extend a locked-down transaction! */
348 if (handle->h_transaction->t_state != T_RUNNING) { 393 if (handle->h_transaction->t_state != T_RUNNING) {
@@ -352,7 +397,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
352 } 397 }
353 398
354 spin_lock(&transaction->t_handle_lock); 399 spin_lock(&transaction->t_handle_lock);
355 wanted = transaction->t_outstanding_credits + nblocks; 400 wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
356 401
357 if (wanted > journal->j_max_transaction_buffers) { 402 if (wanted > journal->j_max_transaction_buffers) {
358 jbd_debug(3, "denied handle %p %d blocks: " 403 jbd_debug(3, "denied handle %p %d blocks: "
@@ -367,14 +412,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
367 } 412 }
368 413
369 handle->h_buffer_credits += nblocks; 414 handle->h_buffer_credits += nblocks;
370 transaction->t_outstanding_credits += nblocks; 415 atomic_add(nblocks, &transaction->t_outstanding_credits);
371 result = 0; 416 result = 0;
372 417
373 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); 418 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
374unlock: 419unlock:
375 spin_unlock(&transaction->t_handle_lock); 420 spin_unlock(&transaction->t_handle_lock);
376error_out: 421error_out:
377 spin_unlock(&journal->j_state_lock); 422 read_unlock(&journal->j_state_lock);
378out: 423out:
379 return result; 424 return result;
380} 425}
@@ -394,8 +439,7 @@ out:
394 * transaction capable of guaranteeing the requested number of 439 * transaction capable of guaranteeing the requested number of
395 * credits. 440 * credits.
396 */ 441 */
397 442int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
398int jbd2_journal_restart(handle_t *handle, int nblocks)
399{ 443{
400 transaction_t *transaction = handle->h_transaction; 444 transaction_t *transaction = handle->h_transaction;
401 journal_t *journal = transaction->t_journal; 445 journal_t *journal = transaction->t_journal;
@@ -410,28 +454,34 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
410 * First unlink the handle from its current transaction, and start the 454 * First unlink the handle from its current transaction, and start the
411 * commit on that. 455 * commit on that.
412 */ 456 */
413 J_ASSERT(transaction->t_updates > 0); 457 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
414 J_ASSERT(journal_current_handle() == handle); 458 J_ASSERT(journal_current_handle() == handle);
415 459
416 spin_lock(&journal->j_state_lock); 460 read_lock(&journal->j_state_lock);
417 spin_lock(&transaction->t_handle_lock); 461 spin_lock(&transaction->t_handle_lock);
418 transaction->t_outstanding_credits -= handle->h_buffer_credits; 462 atomic_sub(handle->h_buffer_credits,
419 transaction->t_updates--; 463 &transaction->t_outstanding_credits);
420 464 if (atomic_dec_and_test(&transaction->t_updates))
421 if (!transaction->t_updates)
422 wake_up(&journal->j_wait_updates); 465 wake_up(&journal->j_wait_updates);
423 spin_unlock(&transaction->t_handle_lock); 466 spin_unlock(&transaction->t_handle_lock);
424 467
425 jbd_debug(2, "restarting handle %p\n", handle); 468 jbd_debug(2, "restarting handle %p\n", handle);
426 __jbd2_log_start_commit(journal, transaction->t_tid); 469 __jbd2_log_start_commit(journal, transaction->t_tid);
427 spin_unlock(&journal->j_state_lock); 470 read_unlock(&journal->j_state_lock);
428 471
429 lock_map_release(&handle->h_lockdep_map); 472 lock_map_release(&handle->h_lockdep_map);
430 handle->h_buffer_credits = nblocks; 473 handle->h_buffer_credits = nblocks;
431 ret = start_this_handle(journal, handle); 474 ret = start_this_handle(journal, handle, gfp_mask);
432 return ret; 475 return ret;
433} 476}
477EXPORT_SYMBOL(jbd2__journal_restart);
478
434 479
480int jbd2_journal_restart(handle_t *handle, int nblocks)
481{
482 return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
483}
484EXPORT_SYMBOL(jbd2_journal_restart);
435 485
436/** 486/**
437 * void jbd2_journal_lock_updates () - establish a transaction barrier. 487 * void jbd2_journal_lock_updates () - establish a transaction barrier.
@@ -447,7 +497,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
447{ 497{
448 DEFINE_WAIT(wait); 498 DEFINE_WAIT(wait);
449 499
450 spin_lock(&journal->j_state_lock); 500 write_lock(&journal->j_state_lock);
451 ++journal->j_barrier_count; 501 ++journal->j_barrier_count;
452 502
453 /* Wait until there are no running updates */ 503 /* Wait until there are no running updates */
@@ -458,19 +508,19 @@ void jbd2_journal_lock_updates(journal_t *journal)
458 break; 508 break;
459 509
460 spin_lock(&transaction->t_handle_lock); 510 spin_lock(&transaction->t_handle_lock);
461 if (!transaction->t_updates) { 511 if (!atomic_read(&transaction->t_updates)) {
462 spin_unlock(&transaction->t_handle_lock); 512 spin_unlock(&transaction->t_handle_lock);
463 break; 513 break;
464 } 514 }
465 prepare_to_wait(&journal->j_wait_updates, &wait, 515 prepare_to_wait(&journal->j_wait_updates, &wait,
466 TASK_UNINTERRUPTIBLE); 516 TASK_UNINTERRUPTIBLE);
467 spin_unlock(&transaction->t_handle_lock); 517 spin_unlock(&transaction->t_handle_lock);
468 spin_unlock(&journal->j_state_lock); 518 write_unlock(&journal->j_state_lock);
469 schedule(); 519 schedule();
470 finish_wait(&journal->j_wait_updates, &wait); 520 finish_wait(&journal->j_wait_updates, &wait);
471 spin_lock(&journal->j_state_lock); 521 write_lock(&journal->j_state_lock);
472 } 522 }
473 spin_unlock(&journal->j_state_lock); 523 write_unlock(&journal->j_state_lock);
474 524
475 /* 525 /*
476 * We have now established a barrier against other normal updates, but 526 * We have now established a barrier against other normal updates, but
@@ -494,9 +544,9 @@ void jbd2_journal_unlock_updates (journal_t *journal)
494 J_ASSERT(journal->j_barrier_count != 0); 544 J_ASSERT(journal->j_barrier_count != 0);
495 545
496 mutex_unlock(&journal->j_barrier); 546 mutex_unlock(&journal->j_barrier);
497 spin_lock(&journal->j_state_lock); 547 write_lock(&journal->j_state_lock);
498 --journal->j_barrier_count; 548 --journal->j_barrier_count;
499 spin_unlock(&journal->j_state_lock); 549 write_unlock(&journal->j_state_lock);
500 wake_up(&journal->j_wait_transaction_locked); 550 wake_up(&journal->j_wait_transaction_locked);
501} 551}
502 552
@@ -725,6 +775,9 @@ done:
725 page = jh2bh(jh)->b_page; 775 page = jh2bh(jh)->b_page;
726 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 776 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
727 source = kmap_atomic(page, KM_USER0); 777 source = kmap_atomic(page, KM_USER0);
778 /* Fire data frozen trigger just before we copy the data */
779 jbd2_buffer_frozen_trigger(jh, source + offset,
780 jh->b_triggers);
728 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 781 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
729 kunmap_atomic(source, KM_USER0); 782 kunmap_atomic(source, KM_USER0);
730 783
@@ -963,15 +1016,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
963 jh->b_triggers = type; 1016 jh->b_triggers = type;
964} 1017}
965 1018
966void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, 1019void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
967 struct jbd2_buffer_trigger_type *triggers) 1020 struct jbd2_buffer_trigger_type *triggers)
968{ 1021{
969 struct buffer_head *bh = jh2bh(jh); 1022 struct buffer_head *bh = jh2bh(jh);
970 1023
971 if (!triggers || !triggers->t_commit) 1024 if (!triggers || !triggers->t_frozen)
972 return; 1025 return;
973 1026
974 triggers->t_commit(triggers, bh, mapped_data, bh->b_size); 1027 triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
975} 1028}
976 1029
977void jbd2_buffer_abort_trigger(struct journal_head *jh, 1030void jbd2_buffer_abort_trigger(struct journal_head *jh,
@@ -1235,7 +1288,8 @@ int jbd2_journal_stop(handle_t *handle)
1235{ 1288{
1236 transaction_t *transaction = handle->h_transaction; 1289 transaction_t *transaction = handle->h_transaction;
1237 journal_t *journal = transaction->t_journal; 1290 journal_t *journal = transaction->t_journal;
1238 int err; 1291 int err, wait_for_commit = 0;
1292 tid_t tid;
1239 pid_t pid; 1293 pid_t pid;
1240 1294
1241 J_ASSERT(journal_current_handle() == handle); 1295 J_ASSERT(journal_current_handle() == handle);
@@ -1243,7 +1297,7 @@ int jbd2_journal_stop(handle_t *handle)
1243 if (is_handle_aborted(handle)) 1297 if (is_handle_aborted(handle))
1244 err = -EIO; 1298 err = -EIO;
1245 else { 1299 else {
1246 J_ASSERT(transaction->t_updates > 0); 1300 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1247 err = 0; 1301 err = 0;
1248 } 1302 }
1249 1303
@@ -1288,9 +1342,9 @@ int jbd2_journal_stop(handle_t *handle)
1288 1342
1289 journal->j_last_sync_writer = pid; 1343 journal->j_last_sync_writer = pid;
1290 1344
1291 spin_lock(&journal->j_state_lock); 1345 read_lock(&journal->j_state_lock);
1292 commit_time = journal->j_average_commit_time; 1346 commit_time = journal->j_average_commit_time;
1293 spin_unlock(&journal->j_state_lock); 1347 read_unlock(&journal->j_state_lock);
1294 1348
1295 trans_time = ktime_to_ns(ktime_sub(ktime_get(), 1349 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1296 transaction->t_start_time)); 1350 transaction->t_start_time));
@@ -1311,15 +1365,8 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1365 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1366 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1367 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock); 1368 atomic_sub(handle->h_buffer_credits,
1315 spin_lock(&transaction->t_handle_lock); 1369 &transaction->t_outstanding_credits);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--;
1318 if (!transaction->t_updates) {
1319 wake_up(&journal->j_wait_updates);
1320 if (journal->j_barrier_count)
1321 wake_up(&journal->j_wait_transaction_locked);
1322 }
1323 1370
1324 /* 1371 /*
1325 * If the handle is marked SYNC, we need to set another commit 1372 * If the handle is marked SYNC, we need to set another commit
@@ -1328,32 +1375,42 @@ int jbd2_journal_stop(handle_t *handle)
1328 * transaction is too old now. 1375 * transaction is too old now.
1329 */ 1376 */
1330 if (handle->h_sync || 1377 if (handle->h_sync ||
1331 transaction->t_outstanding_credits > 1378 (atomic_read(&transaction->t_outstanding_credits) >
1332 journal->j_max_transaction_buffers || 1379 journal->j_max_transaction_buffers) ||
1333 time_after_eq(jiffies, transaction->t_expires)) { 1380 time_after_eq(jiffies, transaction->t_expires)) {
1334 /* Do this even for aborted journals: an abort still 1381 /* Do this even for aborted journals: an abort still
1335 * completes the commit thread, it just doesn't write 1382 * completes the commit thread, it just doesn't write
1336 * anything to disk. */ 1383 * anything to disk. */
1337 tid_t tid = transaction->t_tid;
1338 1384
1339 spin_unlock(&transaction->t_handle_lock);
1340 jbd_debug(2, "transaction too old, requesting commit for " 1385 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1386 "handle %p\n", handle);
1342 /* This is non-blocking */ 1387 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1388 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1389
1346 /* 1390 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1391 * Special case: JBD2_SYNC synchronous updates require us
1348 * to wait for the commit to complete. 1392 * to wait for the commit to complete.
1349 */ 1393 */
1350 if (handle->h_sync && !(current->flags & PF_MEMALLOC)) 1394 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1351 err = jbd2_log_wait_commit(journal, tid); 1395 wait_for_commit = 1;
1352 } else {
1353 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1396 }
1356 1397
1398 /*
1399 * Once we drop t_updates, if it goes to zero the transaction
1400 * could start committing on us and eventually disappear. So
1401 * once we do this, we must not dereference transaction
1402 * pointer again.
1403 */
1404 tid = transaction->t_tid;
1405 if (atomic_dec_and_test(&transaction->t_updates)) {
1406 wake_up(&journal->j_wait_updates);
1407 if (journal->j_barrier_count)
1408 wake_up(&journal->j_wait_transaction_locked);
1409 }
1410
1411 if (wait_for_commit)
1412 err = jbd2_log_wait_commit(journal, tid);
1413
1357 lock_map_release(&handle->h_lockdep_map); 1414 lock_map_release(&handle->h_lockdep_map);
1358 1415
1359 jbd2_free_handle(handle); 1416 jbd2_free_handle(handle);
@@ -1719,7 +1776,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1719 goto zap_buffer_unlocked; 1776 goto zap_buffer_unlocked;
1720 1777
1721 /* OK, we have data buffer in journaled mode */ 1778 /* OK, we have data buffer in journaled mode */
1722 spin_lock(&journal->j_state_lock); 1779 write_lock(&journal->j_state_lock);
1723 jbd_lock_bh_state(bh); 1780 jbd_lock_bh_state(bh);
1724 spin_lock(&journal->j_list_lock); 1781 spin_lock(&journal->j_list_lock);
1725 1782
@@ -1772,7 +1829,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1772 jbd2_journal_put_journal_head(jh); 1829 jbd2_journal_put_journal_head(jh);
1773 spin_unlock(&journal->j_list_lock); 1830 spin_unlock(&journal->j_list_lock);
1774 jbd_unlock_bh_state(bh); 1831 jbd_unlock_bh_state(bh);
1775 spin_unlock(&journal->j_state_lock); 1832 write_unlock(&journal->j_state_lock);
1776 return ret; 1833 return ret;
1777 } else { 1834 } else {
1778 /* There is no currently-running transaction. So the 1835 /* There is no currently-running transaction. So the
@@ -1786,7 +1843,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1786 jbd2_journal_put_journal_head(jh); 1843 jbd2_journal_put_journal_head(jh);
1787 spin_unlock(&journal->j_list_lock); 1844 spin_unlock(&journal->j_list_lock);
1788 jbd_unlock_bh_state(bh); 1845 jbd_unlock_bh_state(bh);
1789 spin_unlock(&journal->j_state_lock); 1846 write_unlock(&journal->j_state_lock);
1790 return ret; 1847 return ret;
1791 } else { 1848 } else {
1792 /* The orphan record's transaction has 1849 /* The orphan record's transaction has
@@ -1810,7 +1867,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1810 jbd2_journal_put_journal_head(jh); 1867 jbd2_journal_put_journal_head(jh);
1811 spin_unlock(&journal->j_list_lock); 1868 spin_unlock(&journal->j_list_lock);
1812 jbd_unlock_bh_state(bh); 1869 jbd_unlock_bh_state(bh);
1813 spin_unlock(&journal->j_state_lock); 1870 write_unlock(&journal->j_state_lock);
1814 return 0; 1871 return 0;
1815 } else { 1872 } else {
1816 /* Good, the buffer belongs to the running transaction. 1873 /* Good, the buffer belongs to the running transaction.
@@ -1829,7 +1886,7 @@ zap_buffer:
1829zap_buffer_no_jh: 1886zap_buffer_no_jh:
1830 spin_unlock(&journal->j_list_lock); 1887 spin_unlock(&journal->j_list_lock);
1831 jbd_unlock_bh_state(bh); 1888 jbd_unlock_bh_state(bh);
1832 spin_unlock(&journal->j_state_lock); 1889 write_unlock(&journal->j_state_lock);
1833zap_buffer_unlocked: 1890zap_buffer_unlocked:
1834 clear_buffer_dirty(bh); 1891 clear_buffer_dirty(bh);
1835 J_ASSERT_BH(bh, !buffer_jbddirty(bh)); 1892 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2136,9 +2193,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2136 /* Locks are here just to force reading of recent values, it is 2193 /* Locks are here just to force reading of recent values, it is
2137 * enough that the transaction was not committing before we started 2194 * enough that the transaction was not committing before we started
2138 * a transaction adding the inode to orphan list */ 2195 * a transaction adding the inode to orphan list */
2139 spin_lock(&journal->j_state_lock); 2196 read_lock(&journal->j_state_lock);
2140 commit_trans = journal->j_committing_transaction; 2197 commit_trans = journal->j_committing_transaction;
2141 spin_unlock(&journal->j_state_lock); 2198 read_unlock(&journal->j_state_lock);
2142 spin_lock(&journal->j_list_lock); 2199 spin_lock(&journal->j_list_lock);
2143 inode_trans = jinode->i_transaction; 2200 inode_trans = jinode->i_transaction;
2144 spin_unlock(&journal->j_list_lock); 2201 spin_unlock(&journal->j_list_lock);
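
The transaction.c hunks above convert t_updates, t_outstanding_credits and t_handle_count to atomic_t so that start_this_handle() can run under read_lock(&journal->j_state_lock) instead of the old exclusive spinlock: credits are reserved optimistically with atomic_add_return(), and the reservation is undone with atomic_sub() whenever the handle must back off and wait (commit in progress, checkpoint needed). Here is a minimal, runnable userspace sketch of that reserve-then-undo pattern using C11 atomics; the names and the credit limit are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdio.h>

#define MAX_CREDITS 32

static atomic_int outstanding_credits;

/* Returns 0 on success, -1 if the caller should wait and retry. */
static int reserve_credits(int nblocks)
{
	/* atomic_fetch_add returns the old value, so adding nblocks
	 * gives the new total, like the kernel's atomic_add_return(). */
	int needed = atomic_fetch_add(&outstanding_credits, nblocks) + nblocks;

	if (needed > MAX_CREDITS) {
		/* Overshot: undo our reservation before backing off. */
		atomic_fetch_sub(&outstanding_credits, nblocks);
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("reserve 20: %d\n", reserve_credits(20)); /* fits */
	printf("reserve 20: %d\n", reserve_credits(20)); /* overshoots, backed out */
	printf("outstanding: %d\n", atomic_load(&outstanding_credits));
	return 0;
}
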
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..54a92fd02bbd 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
234 if (inode->i_mode != mode) { 234 if (inode->i_mode != mode) {
235 struct iattr attr; 235 struct iattr attr;
236 236
237 attr.ia_valid = ATTR_MODE; 237 attr.ia_valid = ATTR_MODE | ATTR_CTIME;
238 attr.ia_mode = mode; 238 attr.ia_mode = mode;
239 attr.ia_ctime = CURRENT_TIME_SEC;
239 rc = jffs2_do_setattr(inode, &attr); 240 rc = jffs2_do_setattr(inode, &attr);
240 if (rc < 0) 241 if (rc < 0)
241 return rc; 242 return rc;
@@ -419,7 +420,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
419 return rc; 420 return rc;
420} 421}
421 422
422struct xattr_handler jffs2_acl_access_xattr_handler = { 423const struct xattr_handler jffs2_acl_access_xattr_handler = {
423 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT, 425 .flags = ACL_TYPE_DEFAULT,
425 .list = jffs2_acl_access_listxattr, 426 .list = jffs2_acl_access_listxattr,
@@ -427,7 +428,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
427 .set = jffs2_acl_setxattr, 428 .set = jffs2_acl_setxattr,
428}; 429};
429 430
430struct xattr_handler jffs2_acl_default_xattr_handler = { 431const struct xattr_handler jffs2_acl_default_xattr_handler = {
431 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT, 433 .flags = ACL_TYPE_DEFAULT,
433 .list = jffs2_acl_default_listxattr, 434 .list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern struct xattr_handler jffs2_acl_default_xattr_handler; 35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..404111b016c9 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -23,10 +24,9 @@ static int jffs2_garbage_collect_thread(void *);
23 24
24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) 25void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
25{ 26{
26 spin_lock(&c->erase_completion_lock); 27 assert_spin_locked(&c->erase_completion_lock);
27 if (c->gc_task && jffs2_thread_should_wake(c)) 28 if (c->gc_task && jffs2_thread_should_wake(c))
28 send_sig(SIGHUP, c->gc_task, 1); 29 send_sig(SIGHUP, c->gc_task, 1);
29 spin_unlock(&c->erase_completion_lock);
30} 30}
31 31
32/* This must only ever be called when no GC thread is currently running */ 32/* This must only ever be called when no GC thread is currently running */
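
The background.c hunk above changes the locking contract of jffs2_garbage_collect_trigger(): instead of taking erase_completion_lock itself, the function now requires the caller to hold it and documents that with assert_spin_locked(). Below is a small, runnable userspace sketch of the same caller-holds-the-lock convention, using a tracked-owner mutex in place of the kernel's spinlock assertion; all names are illustrative.

#include <assert.h>
#include <pthread.h>

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t lock_owner;
static int lock_held;	/* both fields protected by completion_lock itself */

static void completion_lock_acquire(void)
{
	pthread_mutex_lock(&completion_lock);
	lock_held = 1;
	lock_owner = pthread_self();
}

static void completion_lock_release(void)
{
	lock_held = 0;
	pthread_mutex_unlock(&completion_lock);
}

/* Analogue of assert_spin_locked(): caller must already hold the lock. */
static void assert_completion_lock_held(void)
{
	assert(lock_held && pthread_equal(lock_owner, pthread_self()));
}

/* Caller must hold completion_lock, like the reworked trigger function. */
static void gc_trigger(void)
{
	assert_completion_lock_held();
	/* ...decide here whether to wake the GC thread... */
}

int main(void)
{
	completion_lock_acquire();
	gc_trigger();
	completion_lock_release();
	return 0;
}
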
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index c5e1450d79f9..a906f538d11c 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f0294410868d..617a1e5694c1 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -2,11 +2,12 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Created by Arjan van de Ven <arjanv@redhat.com> 5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
6 *
7 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 6 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
8 * University of Szeged, Hungary 7 * University of Szeged, Hungary
9 * 8 *
9 * Created by Arjan van de Ven <arjan@infradead.org>
10 *
10 * For licensing information, see the file 'LICENCE' in this directory. 11 * For licensing information, see the file 'LICENCE' in this directory.
11 * 12 *
12 */ 13 */
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 7d1d72faa774..e471a9106fd9 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * University of Szeged, Hungary 5 * University of Szeged, Hungary
6 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
6 * 7 *
7 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
8 * 9 *
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index cd02acafde8a..ed25ae7c98eb 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2007 Nokia Corporation. All rights reserved. 4 * Copyright © 2007 Nokia Corporation. All rights reserved.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Richard Purdie <rpurdie@openedhand.com> 7 * Created by Richard Purdie <rpurdie@openedhand.com>
7 * 8 *
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 546d1538d076..9696ad9ef5f7 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 7 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 8 *
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 170d289ac785..a12b4f763373 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 7 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 8 *
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index b46661a42758..97fc45de6f81 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index ec3538413926..e0b76c87a91a 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index a113ecc3bafe..c4f8eef5ca68 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417e085f..ed78a3cf3cb0 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -222,16 +223,17 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); 223 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
223 224
224 jffs2_free_raw_inode(ri); 225 jffs2_free_raw_inode(ri);
225 d_instantiate(dentry, inode);
226 226
227 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", 227 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
228 inode->i_ino, inode->i_mode, inode->i_nlink, 228 inode->i_ino, inode->i_mode, inode->i_nlink,
229 f->inocache->pino_nlink, inode->i_mapping->nrpages)); 229 f->inocache->pino_nlink, inode->i_mapping->nrpages));
230
231 d_instantiate(dentry, inode);
232 unlock_new_inode(inode);
230 return 0; 233 return 0;
231 234
232 fail: 235 fail:
233 make_bad_inode(inode); 236 iget_failed(inode);
234 iput(inode);
235 jffs2_free_raw_inode(ri); 237 jffs2_free_raw_inode(ri);
236 return ret; 238 return ret;
237} 239}
@@ -360,8 +362,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
360 /* Eeek. Wave bye bye */ 362 /* Eeek. Wave bye bye */
361 mutex_unlock(&f->sem); 363 mutex_unlock(&f->sem);
362 jffs2_complete_reservation(c); 364 jffs2_complete_reservation(c);
363 jffs2_clear_inode(inode); 365 ret = PTR_ERR(fn);
364 return PTR_ERR(fn); 366 goto fail;
365 } 367 }
366 368
367 /* We use f->target field to store the target path. */ 369 /* We use f->target field to store the target path. */
@@ -370,8 +372,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
370 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
371 mutex_unlock(&f->sem); 373 mutex_unlock(&f->sem);
372 jffs2_complete_reservation(c); 374 jffs2_complete_reservation(c);
373 jffs2_clear_inode(inode); 375 ret = -ENOMEM;
374 return -ENOMEM; 376 goto fail;
375 } 377 }
376 378
377 memcpy(f->target, target, targetlen + 1); 379 memcpy(f->target, target, targetlen + 1);
@@ -386,30 +388,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 jffs2_complete_reservation(c); 388 jffs2_complete_reservation(c);
387 389
388 ret = jffs2_init_security(inode, dir_i); 390 ret = jffs2_init_security(inode, dir_i);
389 if (ret) { 391 if (ret)
390 jffs2_clear_inode(inode); 392 goto fail;
391 return ret; 393
392 }
393 ret = jffs2_init_acl_post(inode); 394 ret = jffs2_init_acl_post(inode);
394 if (ret) { 395 if (ret)
395 jffs2_clear_inode(inode); 396 goto fail;
396 return ret;
397 }
398 397
399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 398 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 399 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
401 if (ret) { 400 if (ret)
402 /* Eep. */ 401 goto fail;
403 jffs2_clear_inode(inode);
404 return ret;
405 }
406 402
407 rd = jffs2_alloc_raw_dirent(); 403 rd = jffs2_alloc_raw_dirent();
408 if (!rd) { 404 if (!rd) {
409 /* Argh. Now we treat it like a normal delete */ 405 /* Argh. Now we treat it like a normal delete */
410 jffs2_complete_reservation(c); 406 jffs2_complete_reservation(c);
411 jffs2_clear_inode(inode); 407 ret = -ENOMEM;
412 return -ENOMEM; 408 goto fail;
413 } 409 }
414 410
415 dir_f = JFFS2_INODE_INFO(dir_i); 411 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +433,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
437 jffs2_complete_reservation(c); 433 jffs2_complete_reservation(c);
438 jffs2_free_raw_dirent(rd); 434 jffs2_free_raw_dirent(rd);
439 mutex_unlock(&dir_f->sem); 435 mutex_unlock(&dir_f->sem);
440 jffs2_clear_inode(inode); 436 ret = PTR_ERR(fd);
441 return PTR_ERR(fd); 437 goto fail;
442 } 438 }
443 439
444 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 440 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -453,7 +449,12 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
453 jffs2_complete_reservation(c); 449 jffs2_complete_reservation(c);
454 450
455 d_instantiate(dentry, inode); 451 d_instantiate(dentry, inode);
452 unlock_new_inode(inode);
456 return 0; 453 return 0;
454
455 fail:
456 iget_failed(inode);
457 return ret;
457} 458}
458 459
459 460
@@ -519,8 +520,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
519 /* Eeek. Wave bye bye */ 520 /* Eeek. Wave bye bye */
520 mutex_unlock(&f->sem); 521 mutex_unlock(&f->sem);
521 jffs2_complete_reservation(c); 522 jffs2_complete_reservation(c);
522 jffs2_clear_inode(inode); 523 ret = PTR_ERR(fn);
523 return PTR_ERR(fn); 524 goto fail;
524 } 525 }
525 /* No data here. Only a metadata node, which will be 526 /* No data here. Only a metadata node, which will be
526 obsoleted by the first data write 527 obsoleted by the first data write
@@ -531,30 +532,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
531 jffs2_complete_reservation(c); 532 jffs2_complete_reservation(c);
532 533
533 ret = jffs2_init_security(inode, dir_i); 534 ret = jffs2_init_security(inode, dir_i);
534 if (ret) { 535 if (ret)
535 jffs2_clear_inode(inode); 536 goto fail;
536 return ret; 537
537 }
538 ret = jffs2_init_acl_post(inode); 538 ret = jffs2_init_acl_post(inode);
539 if (ret) { 539 if (ret)
540 jffs2_clear_inode(inode); 540 goto fail;
541 return ret;
542 }
543 541
544 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 542 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
545 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 543 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
546 if (ret) { 544 if (ret)
547 /* Eep. */ 545 goto fail;
548 jffs2_clear_inode(inode);
549 return ret;
550 }
551 546
552 rd = jffs2_alloc_raw_dirent(); 547 rd = jffs2_alloc_raw_dirent();
553 if (!rd) { 548 if (!rd) {
554 /* Argh. Now we treat it like a normal delete */ 549 /* Argh. Now we treat it like a normal delete */
555 jffs2_complete_reservation(c); 550 jffs2_complete_reservation(c);
556 jffs2_clear_inode(inode); 551 ret = -ENOMEM;
557 return -ENOMEM; 552 goto fail;
558 } 553 }
559 554
560 dir_f = JFFS2_INODE_INFO(dir_i); 555 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +577,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
582 jffs2_complete_reservation(c); 577 jffs2_complete_reservation(c);
583 jffs2_free_raw_dirent(rd); 578 jffs2_free_raw_dirent(rd);
584 mutex_unlock(&dir_f->sem); 579 mutex_unlock(&dir_f->sem);
585 jffs2_clear_inode(inode); 580 ret = PTR_ERR(fd);
586 return PTR_ERR(fd); 581 goto fail;
587 } 582 }
588 583
589 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 584 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -599,7 +594,12 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
599 jffs2_complete_reservation(c); 594 jffs2_complete_reservation(c);
600 595
601 d_instantiate(dentry, inode); 596 d_instantiate(dentry, inode);
597 unlock_new_inode(inode);
602 return 0; 598 return 0;
599
600 fail:
601 iget_failed(inode);
602 return ret;
603} 603}
604 604
605static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) 605static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +693,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
693 /* Eeek. Wave bye bye */ 693 /* Eeek. Wave bye bye */
694 mutex_unlock(&f->sem); 694 mutex_unlock(&f->sem);
695 jffs2_complete_reservation(c); 695 jffs2_complete_reservation(c);
696 jffs2_clear_inode(inode); 696 ret = PTR_ERR(fn);
697 return PTR_ERR(fn); 697 goto fail;
698 } 698 }
699 /* No data here. Only a metadata node, which will be 699 /* No data here. Only a metadata node, which will be
700 obsoleted by the first data write 700 obsoleted by the first data write
@@ -705,30 +705,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
705 jffs2_complete_reservation(c); 705 jffs2_complete_reservation(c);
706 706
707 ret = jffs2_init_security(inode, dir_i); 707 ret = jffs2_init_security(inode, dir_i);
708 if (ret) { 708 if (ret)
709 jffs2_clear_inode(inode); 709 goto fail;
710 return ret; 710
711 }
712 ret = jffs2_init_acl_post(inode); 711 ret = jffs2_init_acl_post(inode);
713 if (ret) { 712 if (ret)
714 jffs2_clear_inode(inode); 713 goto fail;
715 return ret;
716 }
717 714
718 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 715 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
719 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 716 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
720 if (ret) { 717 if (ret)
721 /* Eep. */ 718 goto fail;
722 jffs2_clear_inode(inode);
723 return ret;
724 }
725 719
726 rd = jffs2_alloc_raw_dirent(); 720 rd = jffs2_alloc_raw_dirent();
727 if (!rd) { 721 if (!rd) {
728 /* Argh. Now we treat it like a normal delete */ 722 /* Argh. Now we treat it like a normal delete */
729 jffs2_complete_reservation(c); 723 jffs2_complete_reservation(c);
730 jffs2_clear_inode(inode); 724 ret = -ENOMEM;
731 return -ENOMEM; 725 goto fail;
732 } 726 }
733 727
734 dir_f = JFFS2_INODE_INFO(dir_i); 728 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +753,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
759 jffs2_complete_reservation(c); 753 jffs2_complete_reservation(c);
760 jffs2_free_raw_dirent(rd); 754 jffs2_free_raw_dirent(rd);
761 mutex_unlock(&dir_f->sem); 755 mutex_unlock(&dir_f->sem);
762 jffs2_clear_inode(inode); 756 ret = PTR_ERR(fd);
763 return PTR_ERR(fd); 757 goto fail;
764 } 758 }
765 759
766 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 760 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -775,8 +769,12 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
775 jffs2_complete_reservation(c); 769 jffs2_complete_reservation(c);
776 770
777 d_instantiate(dentry, inode); 771 d_instantiate(dentry, inode);
778 772 unlock_new_inode(inode);
779 return 0; 773 return 0;
774
775 fail:
776 iget_failed(inode);
777 return ret;
780} 778}
781 779
782static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, 780static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
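
The three creation paths above — jffs2_symlink(), jffs2_mkdir() and jffs2_mknod() — all get the same rework: every error branch now funnels into a single fail: label instead of duplicating the cleanup. The driver is the jffs2_new_inode() change in fs/jffs2/fs.c below: it switches to insert_inode_locked(), so the new inode arrives hashed and locked (I_NEW), and abandoning it must go through iget_failed() so concurrent lookups waiting on I_NEW are woken. A condensed sketch of the resulting shape, assuming that reading; jffs2_create_like, do_the_writes, mode and ri are illustrative placeholders, not names from the tree:

    static int jffs2_create_like(struct inode *dir_i, struct dentry *dentry,
                                 int mode, struct jffs2_raw_inode *ri)
    {
        struct inode *inode;
        int ret;

        /* comes back hashed and locked (I_NEW) after this series */
        inode = jffs2_new_inode(dir_i, mode, ri);
        if (IS_ERR(inode))
            return PTR_ERR(inode);

        ret = do_the_writes(inode);     /* any of the steps in the hunks above */
        if (ret)
            goto fail;

        d_instantiate(dentry, inode);
        unlock_new_inode(inode);        /* clear I_NEW, wake waiters */
        return 0;

    fail:
        iget_failed(inode);             /* marks bad, unlocks, drops the ref */
        return ret;
    }
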
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..abac961f617b 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -103,9 +104,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
103 jffs2_erase_failed(c, jeb, bad_offset); 104 jffs2_erase_failed(c, jeb, bad_offset);
104} 105}
105 106
106void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) 107int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
107{ 108{
108 struct jffs2_eraseblock *jeb; 109 struct jffs2_eraseblock *jeb;
110 int work_done = 0;
109 111
110 mutex_lock(&c->erase_free_sem); 112 mutex_lock(&c->erase_free_sem);
111 113
@@ -121,6 +123,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
121 mutex_unlock(&c->erase_free_sem); 123 mutex_unlock(&c->erase_free_sem);
122 jffs2_mark_erased_block(c, jeb); 124 jffs2_mark_erased_block(c, jeb);
123 125
126 work_done++;
124 if (!--count) { 127 if (!--count) {
125 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); 128 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
126 goto done; 129 goto done;
@@ -157,6 +160,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
157 mutex_unlock(&c->erase_free_sem); 160 mutex_unlock(&c->erase_free_sem);
158 done: 161 done:
159 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); 162 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
163 return work_done;
160} 164}
161 165
162static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 166static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +169,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
165 mutex_lock(&c->erase_free_sem); 169 mutex_lock(&c->erase_free_sem);
166 spin_lock(&c->erase_completion_lock); 170 spin_lock(&c->erase_completion_lock);
167 list_move_tail(&jeb->list, &c->erase_complete_list); 171 list_move_tail(&jeb->list, &c->erase_complete_list);
172 /* Wake the GC thread to mark them clean */
173 jffs2_garbage_collect_trigger(c);
168 spin_unlock(&c->erase_completion_lock); 174 spin_unlock(&c->erase_completion_lock);
169 mutex_unlock(&c->erase_free_sem); 175 mutex_unlock(&c->erase_free_sem);
170 /* Ensure that kupdated calls us again to mark them clean */ 176 wake_up(&c->erase_wait);
171 jffs2_erase_pending_trigger(c);
172} 177}
173 178
174static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) 179static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +492,9 @@ filebad:
487 492
488refile: 493refile:
489 /* Stick it back on the list from whence it came and come back later */ 494 /* Stick it back on the list from whence it came and come back later */
490 jffs2_erase_pending_trigger(c);
491 mutex_lock(&c->erase_free_sem); 495 mutex_lock(&c->erase_free_sem);
492 spin_lock(&c->erase_completion_lock); 496 spin_lock(&c->erase_completion_lock);
497 jffs2_garbage_collect_trigger(c);
493 list_move(&jeb->list, &c->erase_complete_list); 498 list_move(&jeb->list, &c->erase_complete_list);
494 spin_unlock(&c->erase_completion_lock); 499 spin_unlock(&c->erase_completion_lock);
495 mutex_unlock(&c->erase_free_sem); 500 mutex_unlock(&c->erase_free_sem);
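
jffs2_erase_pending_blocks() now reports whether it erased anything, so callers can treat that as forward progress — the gc.c hunk below uses exactly this. A sketch of the new contract, taken from that hunk:

    /* returns the number of blocks erased; nonzero means progress */
    if (jffs2_erase_pending_blocks(c, 1)) {
        mutex_unlock(&c->alloc_sem);
        return 0;       /* erasing counts as this GC pass's work */
    }
    /* nothing erased: fall through and garbage-collect for real */

The companion change in jffs2_erase_succeeded() stops re-triggering kupdated and instead pokes the GC thread and wakes erase_wait sleepers directly.
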
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..1c0a08d711aa 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -26,9 +27,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
26 struct page **pagep, void **fsdata); 27 struct page **pagep, void **fsdata);
27static int jffs2_readpage (struct file *filp, struct page *pg); 28static int jffs2_readpage (struct file *filp, struct page *pg);
28 29
29int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) 30int jffs2_fsync(struct file *filp, int datasync)
30{ 31{
31 struct inode *inode = dentry->d_inode; 32 struct inode *inode = filp->f_mapping->host;
32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 33 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
33 34
34 /* Trigger GC to flush any pending writes for this inode */ 35 /* Trigger GC to flush any pending writes for this inode */
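
This is the 2.6.35 ->fsync() prototype change: the struct dentry * argument is gone, and implementations recover the inode from the file itself. The shape every converted filesystem in this merge follows (my_fsync and my_flush are illustrative placeholders):

    int my_fsync(struct file *filp, int datasync)
    {
        struct inode *inode = filp->f_mapping->host;   /* was dentry->d_inode */

        return my_flush(inode, datasync);
    }
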
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..6b2964a19850 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -169,13 +170,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 170 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 171 jffs2_complete_reservation(c);
171 172
172 /* We have to do the vmtruncate() without f->sem held, since 173 /* We have to do the truncate_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 174 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 175 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 176 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 177 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 178 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 vmtruncate(inode, iattr->ia_size); 179 truncate_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 180 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 181 }
181 182
@@ -225,7 +226,7 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
225} 226}
226 227
227 228
228void jffs2_clear_inode (struct inode *inode) 229void jffs2_evict_inode (struct inode *inode)
229{ 230{
230 /* We can forget about this inode for now - drop all 231 /* We can forget about this inode for now - drop all
231 * the nodelists associated with it, etc. 232 * the nodelists associated with it, etc.
@@ -233,7 +234,9 @@ void jffs2_clear_inode (struct inode *inode)
233 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 234 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
234 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); 235 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
235 236
236 D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode)); 237 D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
238 truncate_inode_pages(&inode->i_data, 0);
239 end_writeback(inode);
237 jffs2_do_clear_inode(c, f); 240 jffs2_do_clear_inode(c, f);
238} 241}
239 242
@@ -313,8 +316,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
313 case S_IFBLK: 316 case S_IFBLK:
314 case S_IFCHR: 317 case S_IFCHR:
315 /* Read the device numbers from the media */ 318 /* Read the device numbers from the media */
316 if (f->metadata->size != sizeof(jdev.old) && 319 if (f->metadata->size != sizeof(jdev.old_id) &&
317 f->metadata->size != sizeof(jdev.new)) { 320 f->metadata->size != sizeof(jdev.new_id)) {
318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); 321 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
319 goto error_io; 322 goto error_io;
320 } 323 }
@@ -325,10 +328,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); 328 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
326 goto error; 329 goto error;
327 } 330 }
328 if (f->metadata->size == sizeof(jdev.old)) 331 if (f->metadata->size == sizeof(jdev.old_id))
329 rdev = old_decode_dev(je16_to_cpu(jdev.old)); 332 rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
330 else 333 else
331 rdev = new_decode_dev(je32_to_cpu(jdev.new)); 334 rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
332 335
333 case S_IFSOCK: 336 case S_IFSOCK:
334 case S_IFIFO: 337 case S_IFIFO:
@@ -465,7 +468,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
465 inode->i_blocks = 0; 468 inode->i_blocks = 0;
466 inode->i_size = 0; 469 inode->i_size = 0;
467 470
468 insert_inode_hash(inode); 471 if (insert_inode_locked(inode) < 0) {
472 make_bad_inode(inode);
473 unlock_new_inode(inode);
474 iput(inode);
475 return ERR_PTR(-EINVAL);
476 }
469 477
470 return inode; 478 return inode;
471} 479}
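
Two VFS API moves land in this file. First, .clear_inode becomes .evict_inode (2.6.36): the filesystem is now responsible for dropping the page cache and ending inode writeback itself. The generic skeleton, sketched with placeholder names (my_evict_inode, fs_teardown):

    void my_evict_inode(struct inode *inode)
    {
        truncate_inode_pages(&inode->i_data, 0);  /* drop cached pages */
        end_writeback(inode);                     /* replaces clear_inode() */
        fs_teardown(inode);                       /* fs-specific, e.g. jffs2_do_clear_inode() */
    }

Second, jffs2_new_inode() switches from insert_inode_hash() to insert_inode_locked(), which fails if an inode with the same number is already hashed; that is what makes the iget_failed()-based error paths in dir.c above both necessary and sufficient.
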
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..846a79452497 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -214,6 +215,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
214 return ret; 215 return ret;
215 } 216 }
216 217
218 /* If there are any blocks which need erasing, erase them now */
219 if (!list_empty(&c->erase_complete_list) ||
220 !list_empty(&c->erase_pending_list)) {
221 spin_unlock(&c->erase_completion_lock);
222 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
223 if (jffs2_erase_pending_blocks(c, 1)) {
224 mutex_unlock(&c->alloc_sem);
225 return 0;
226 }
227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
228 spin_lock(&c->erase_completion_lock);
229 }
230
217 /* First, work out which block we're garbage-collecting */ 231 /* First, work out which block we're garbage-collecting */
218 jeb = c->gcblock; 232 jeb = c->gcblock;
219 233
@@ -222,7 +236,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
222 236
223 if (!jeb) { 237 if (!jeb) {
224 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ 238 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
225 if (!list_empty(&c->erase_pending_list)) { 239 if (c->nr_erasing_blocks) {
226 spin_unlock(&c->erase_completion_lock); 240 spin_unlock(&c->erase_completion_lock);
227 mutex_unlock(&c->alloc_sem); 241 mutex_unlock(&c->alloc_sem);
228 return -EAGAIN; 242 return -EAGAIN;
@@ -435,7 +449,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
435 list_add_tail(&c->gcblock->list, &c->erase_pending_list); 449 list_add_tail(&c->gcblock->list, &c->erase_pending_list);
436 c->gcblock = NULL; 450 c->gcblock = NULL;
437 c->nr_erasing_blocks++; 451 c->nr_erasing_blocks++;
438 jffs2_erase_pending_trigger(c); 452 jffs2_garbage_collect_trigger(c);
439 } 453 }
440 spin_unlock(&c->erase_completion_lock); 454 spin_unlock(&c->erase_completion_lock);
441 455
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index 9d41f43e47bb..859a598af020 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index c6923da98263..2e4a86763c07 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 85ef6dbb1be7..6784bc89add1 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..523a91691052 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -24,7 +24,6 @@
24#ifdef __ECOS 24#ifdef __ECOS
25#include "os-ecos.h" 25#include "os-ecos.h"
26#else 26#else
27#include <linux/mtd/compatmac.h> /* For compatibility with older kernels */
28#include "os-linux.h" 27#include "os-linux.h"
29#endif 28#endif
30 29
@@ -312,11 +311,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) 311static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
313{ 312{
314 if (old_valid_dev(rdev)) { 313 if (old_valid_dev(rdev)) {
315 jdev->old = cpu_to_je16(old_encode_dev(rdev)); 314 jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
316 return sizeof(jdev->old); 315 return sizeof(jdev->old_id);
317 } else { 316 } else {
318 jdev->new = cpu_to_je32(new_encode_dev(rdev)); 317 jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
319 return sizeof(jdev->new); 318 return sizeof(jdev->new_id);
320 } 319 }
321} 320}
322 321
@@ -464,7 +463,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
464int jffs2_do_mount_fs(struct jffs2_sb_info *c); 463int jffs2_do_mount_fs(struct jffs2_sb_info *c);
465 464
466/* erase.c */ 465/* erase.c */
467void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); 466int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 467void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
469 468
470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 469#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 191359dde4e1..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -116,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
116 116
117 ret = jffs2_garbage_collect_pass(c); 117 ret = jffs2_garbage_collect_pass(c);
118 118
119 if (ret == -EAGAIN) 119 if (ret == -EAGAIN) {
120 jffs2_erase_pending_blocks(c, 1); 120 spin_lock(&c->erase_completion_lock);
121 else if (ret) 121 if (c->nr_erasing_blocks &&
122 list_empty(&c->erase_pending_list) &&
123 list_empty(&c->erase_complete_list)) {
124 DECLARE_WAITQUEUE(wait, current);
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 add_wait_queue(&c->erase_wait, &wait);
127 D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
128 spin_unlock(&c->erase_completion_lock);
129
130 schedule();
131 } else
132 spin_unlock(&c->erase_completion_lock);
133 } else if (ret)
122 return ret; 134 return ret;
123 135
124 cond_resched(); 136 cond_resched();
@@ -217,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
217 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 229 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
218 list_move_tail(&ejeb->list, &c->erase_pending_list); 230 list_move_tail(&ejeb->list, &c->erase_pending_list);
219 c->nr_erasing_blocks++; 231 c->nr_erasing_blocks++;
220 jffs2_erase_pending_trigger(c); 232 jffs2_garbage_collect_trigger(c);
221 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 233 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
222 ejeb->offset)); 234 ejeb->offset));
223 } 235 }
@@ -469,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
469void jffs2_complete_reservation(struct jffs2_sb_info *c) 481void jffs2_complete_reservation(struct jffs2_sb_info *c)
470{ 482{
471 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); 483 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
484 spin_lock(&c->erase_completion_lock);
472 jffs2_garbage_collect_trigger(c); 485 jffs2_garbage_collect_trigger(c);
486 spin_unlock(&c->erase_completion_lock);
473 mutex_unlock(&c->alloc_sem); 487 mutex_unlock(&c->alloc_sem);
474} 488}
475 489
@@ -611,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
611 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 625 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
612 list_add_tail(&jeb->list, &c->erase_pending_list); 626 list_add_tail(&jeb->list, &c->erase_pending_list);
613 c->nr_erasing_blocks++; 627 c->nr_erasing_blocks++;
614 jffs2_erase_pending_trigger(c); 628 jffs2_garbage_collect_trigger(c);
615 } else { 629 } else {
616 /* Sometimes, however, we leave it elsewhere so it doesn't get 630 /* Sometimes, however, we leave it elsewhere so it doesn't get
617 immediately reused, and we spread the load a bit. */ 631 immediately reused, and we spread the load a bit. */
@@ -732,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
732 int nr_very_dirty = 0; 746 int nr_very_dirty = 0;
733 struct jffs2_eraseblock *jeb; 747 struct jffs2_eraseblock *jeb;
734 748
749 if (!list_empty(&c->erase_complete_list) ||
750 !list_empty(&c->erase_pending_list))
751 return 1;
752
735 if (c->unchecked_size) { 753 if (c->unchecked_size) {
736 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 754 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
737 c->unchecked_size, c->checked_ino)); 755 c->unchecked_size, c->checked_ino));
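
The -EAGAIN branch of jffs2_reserve_space() now sleeps until an in-flight erase finishes instead of erasing synchronously. It is the classic open-coded wait: queue the current task, mark it unready, drop the lock, schedule. The waker is the wake_up(&c->erase_wait) added in erase.c above. A minimal sketch of the pattern; note the hunk shown cuts off at schedule(), and the canonical form pairs it with remove_wait_queue() afterwards:

    DECLARE_WAITQUEUE(wait, current);           /* on-stack wait entry */

    set_current_state(TASK_UNINTERRUPTIBLE);
    add_wait_queue(&c->erase_wait, &wait);
    spin_unlock(&c->erase_completion_lock);     /* never sleep under a spinlock */

    schedule();                                 /* until wake_up(&c->erase_wait) */
    remove_wait_queue(&c->erase_wait, &wait);
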
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..00bae7cc2e48 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
140 140
141#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
142 142
143/* erase.c */ 143static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
144static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
145{ 144{
146 OFNI_BS_2SFFJ(c)->s_dirt = 1; 145 OFNI_BS_2SFFJ(c)->s_dirt = 1;
147} 146}
@@ -159,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
159extern const struct file_operations jffs2_file_operations; 158extern const struct file_operations jffs2_file_operations;
160extern const struct inode_operations jffs2_file_inode_operations; 159extern const struct inode_operations jffs2_file_inode_operations;
161extern const struct address_space_operations jffs2_file_address_operations; 160extern const struct address_space_operations jffs2_file_address_operations;
162int jffs2_fsync(struct file *, struct dentry *, int); 161int jffs2_fsync(struct file *, int);
163int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); 162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
164 163
165/* ioctl.c */ 164/* ioctl.c */
@@ -172,7 +171,7 @@ extern const struct inode_operations jffs2_symlink_inode_operations;
172int jffs2_setattr (struct dentry *, struct iattr *); 171int jffs2_setattr (struct dentry *, struct iattr *);
173int jffs2_do_setattr (struct inode *, struct iattr *); 172int jffs2_do_setattr (struct inode *, struct iattr *);
174struct inode *jffs2_iget(struct super_block *, unsigned long); 173struct inode *jffs2_iget(struct super_block *, unsigned long);
175void jffs2_clear_inode (struct inode *); 174void jffs2_evict_inode (struct inode *);
176void jffs2_dirty_inode(struct inode *inode); 175void jffs2_dirty_inode(struct inode *inode);
177struct inode *jffs2_new_inode (struct inode *dir_i, int mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
178 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
260 ret = -EIO; 260 ret = -EIO;
261 goto out; 261 goto out;
262 } 262 }
263 jffs2_erase_pending_trigger(c); 263 spin_lock(&c->erase_completion_lock);
264 jffs2_garbage_collect_trigger(c);
265 spin_unlock(&c->erase_completion_lock);
264 } 266 }
265 ret = 0; 267 ret = 0;
266 out: 268 out:
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
77 return retlen; 77 return retlen;
78} 78}
79 79
80struct xattr_handler jffs2_security_xattr_handler = { 80const struct xattr_handler jffs2_security_xattr_handler = {
81 .prefix = XATTR_SECURITY_PREFIX, 81 .prefix = XATTR_SECURITY_PREFIX,
82 .list = jffs2_security_listxattr, 82 .list = jffs2_security_listxattr,
83 .set = jffs2_security_setxattr, 83 .set = jffs2_security_setxattr,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..662bba099501 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
63 63
64 if (!(sb->s_flags & MS_RDONLY)) { 64 if (!(sb->s_flags & MS_RDONLY)) {
65 D1(printk(KERN_DEBUG "jffs2_write_super()\n")); 65 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
66 jffs2_garbage_collect_trigger(c);
67 jffs2_erase_pending_blocks(c, 0);
68 jffs2_flush_wbuf_gc(c, 0); 66 jffs2_flush_wbuf_gc(c, 0);
69 } 67 }
70 68
@@ -137,7 +135,7 @@ static const struct super_operations jffs2_super_operations =
137 .write_super = jffs2_write_super, 135 .write_super = jffs2_write_super,
138 .statfs = jffs2_statfs, 136 .statfs = jffs2_statfs,
139 .remount_fs = jffs2_remount_fs, 137 .remount_fs = jffs2_remount_fs,
140 .clear_inode = jffs2_clear_inode, 138 .evict_inode = jffs2_evict_inode,
141 .dirty_inode = jffs2_dirty_inode, 139 .dirty_inode = jffs2_dirty_inode,
142 .sync_fs = jffs2_sync_fs, 140 .sync_fs = jffs2_sync_fs,
143}; 141};
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
84 struct jffs2_inodirty *new; 84 struct jffs2_inodirty *new;
85 85
86 /* Mark the superblock dirty so that kupdated will flush... */ 86 /* Mark the superblock dirty so that kupdated will flush... */
87 jffs2_erase_pending_trigger(c); 87 jffs2_dirty_trigger(c);
88 88
89 if (jffs2_wbuf_pending_for_ino(c, ino)) 89 if (jffs2_wbuf_pending_for_ino(c, ino))
90 return; 90 return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
122 list_add_tail(&jeb->list, &c->erase_pending_list); 122 list_add_tail(&jeb->list, &c->erase_pending_list);
123 c->nr_erasing_blocks++; 123 c->nr_erasing_blocks++;
124 jffs2_erase_pending_trigger(c); 124 jffs2_garbage_collect_trigger(c);
125 } else { 125 } else {
126 /* Sometimes, however, we leave it elsewhere so it doesn't get 126 /* Sometimes, however, we leave it elsewhere so it doesn't get
127 immediately reused, and we spread the load a bit. */ 127 immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); 152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
153 list_add(&jeb->list, &c->erase_pending_list); 153 list_add(&jeb->list, &c->erase_pending_list);
154 c->nr_erasing_blocks++; 154 c->nr_erasing_blocks++;
155 jffs2_erase_pending_trigger(c); 155 jffs2_garbage_collect_trigger(c);
156 } 156 }
157 157
158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { 158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
544 list_move(&jeb->list, &c->erase_pending_list); 544 list_move(&jeb->list, &c->erase_pending_list);
545 c->nr_erasing_blocks++; 545 c->nr_erasing_blocks++;
546 jffs2_erase_pending_trigger(c); 546 jffs2_garbage_collect_trigger(c);
547 } 547 }
548 548
549 jffs2_dbg_acct_sanity_check_nolock(c, jeb); 549 jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..9b572ca40a49 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -588,7 +588,7 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
588 588
589void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) 589void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
590{ 590{
591 /* It's called from jffs2_clear_inode() on inode removing. 591 /* It's called from jffs2_evict_inode() on inode removing.
592 When an inode with XATTR is removed, those XATTRs must be removed. */ 592 When an inode with XATTR is removed, those XATTRs must be removed. */
593 struct jffs2_xattr_ref *ref, *_ref; 593 struct jffs2_xattr_ref *ref, *_ref;
594 594
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
626 626
627static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) 627static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
628{ 628{
 629 /* success of check_xattr_ref_inode() means taht inode (ic) dose not have 629 /* success of check_xattr_ref_inode() means that inode (ic) does not have
630 * duplicate name/value pairs. If duplicate name/value pair would be found, 630 * duplicate name/value pairs. If duplicate name/value pair would be found,
631 * one will be removed. 631 * one will be removed.
632 */ 632 */
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) 904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
905 * is an implementation of setxattr handler on jffs2. 905 * is an implementation of setxattr handler on jffs2.
906 * -------------------------------------------------- */ 906 * -------------------------------------------------- */
907struct xattr_handler *jffs2_xattr_handlers[] = { 907const struct xattr_handler *jffs2_xattr_handlers[] = {
908 &jffs2_user_xattr_handler, 908 &jffs2_user_xattr_handler,
909#ifdef CONFIG_JFFS2_FS_SECURITY 909#ifdef CONFIG_JFFS2_FS_SECURITY
910 &jffs2_security_xattr_handler, 910 &jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
917 NULL 917 NULL
918}; 918};
919 919
920static struct xattr_handler *xprefix_to_handler(int xprefix) { 920static const struct xattr_handler *xprefix_to_handler(int xprefix) {
921 struct xattr_handler *ret; 921 const struct xattr_handler *ret;
922 922
923 switch (xprefix) { 923 switch (xprefix) {
924 case JFFS2_XPREFIX_USER: 924 case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
955 struct jffs2_inode_cache *ic = f->inocache; 955 struct jffs2_inode_cache *ic = f->inocache;
956 struct jffs2_xattr_ref *ref, **pref; 956 struct jffs2_xattr_ref *ref, **pref;
957 struct jffs2_xattr_datum *xd; 957 struct jffs2_xattr_datum *xd;
958 struct xattr_handler *xhandle; 958 const struct xattr_handler *xhandle;
959 ssize_t len, rc; 959 ssize_t len, rc;
960 int retry = 0; 960 int retry = 0;
961 961
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, 93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
94 const char *buffer, size_t size, int flags); 94 const char *buffer, size_t size, int flags);
95 95
96extern struct xattr_handler *jffs2_xattr_handlers[]; 96extern const struct xattr_handler *jffs2_xattr_handlers[];
97extern struct xattr_handler jffs2_user_xattr_handler; 97extern const struct xattr_handler jffs2_user_xattr_handler;
98extern struct xattr_handler jffs2_trusted_xattr_handler; 98extern const struct xattr_handler jffs2_trusted_xattr_handler;
99 99
100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); 100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
101#define jffs2_getxattr generic_getxattr 101#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir);
125extern struct xattr_handler jffs2_security_xattr_handler; 125extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 126#else
127#define jffs2_init_security(inode,dir) (0) 127#define jffs2_init_security(inode,dir) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 128#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_trusted_xattr_handler = { 50const struct xattr_handler jffs2_trusted_xattr_handler = {
51 .prefix = XATTR_TRUSTED_PREFIX, 51 .prefix = XATTR_TRUSTED_PREFIX,
52 .list = jffs2_trusted_listxattr, 52 .list = jffs2_trusted_listxattr,
53 .set = jffs2_trusted_setxattr, 53 .set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_user_xattr_handler = { 50const struct xattr_handler jffs2_user_xattr_handler = {
51 .prefix = XATTR_USER_PREFIX, 51 .prefix = XATTR_USER_PREFIX,
52 .list = jffs2_user_listxattr, 52 .list = jffs2_user_listxattr,
53 .set = jffs2_user_setxattr, 53 .set = jffs2_user_setxattr,
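
All of the jffs2 xattr handlers above pick up const, matching the 2.6.35 constification of struct xattr_handler throughout the VFS. After the change, a handler and its table are declared as read-only data, e.g. (my_* names are placeholders):

    static const struct xattr_handler my_user_xattr_handler = {
        .prefix = XATTR_USER_PREFIX,
        .list   = my_listxattr,
        .get    = my_getxattr,
        .set    = my_setxattr,
    };

    const struct xattr_handler *my_xattr_handlers[] = {   /* hung off sb->s_xattr */
        &my_user_xattr_handler,
        NULL,                                              /* terminator */
    };
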
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 14ba982b3f24..c5ce6c1d1ff4 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -17,6 +17,7 @@
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20#include <linux/mm.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/quotaops.h> 22#include <linux/quotaops.h>
22#include "jfs_incore.h" 23#include "jfs_incore.h"
@@ -27,9 +28,9 @@
27#include "jfs_acl.h" 28#include "jfs_acl.h"
28#include "jfs_debug.h" 29#include "jfs_debug.h"
29 30
30int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) 31int jfs_fsync(struct file *file, int datasync)
31{ 32{
32 struct inode *inode = dentry->d_inode; 33 struct inode *inode = file->f_mapping->host;
33 int rc = 0; 34 int rc = 0;
34 35
35 if (!(inode->i_state & I_DIRTY) || 36 if (!(inode->i_state & I_DIRTY) ||
@@ -98,7 +99,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
98 if (rc) 99 if (rc)
99 return rc; 100 return rc;
100 101
101 if (iattr->ia_valid & ATTR_SIZE) 102 if (is_quota_modification(inode, iattr))
102 dquot_initialize(inode); 103 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 104 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 105 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
@@ -107,11 +108,18 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
107 return rc; 108 return rc;
108 } 109 }
109 110
110 rc = inode_setattr(inode, iattr); 111 if ((iattr->ia_valid & ATTR_SIZE) &&
112 iattr->ia_size != i_size_read(inode)) {
113 rc = vmtruncate(inode, iattr->ia_size);
114 if (rc)
115 return rc;
116 }
117
118 setattr_copy(inode, iattr);
119 mark_inode_dirty(inode);
111 120
112 if (!rc && (iattr->ia_valid & ATTR_MODE)) 121 if (iattr->ia_valid & ATTR_MODE)
113 rc = jfs_acl_chmod(inode); 122 rc = jfs_acl_chmod(inode);
114
115 return rc; 123 return rc;
116} 124}
117 125
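
jfs_setattr() is converted away from the removed inode_setattr() helper: truncation is done explicitly, then attributes are copied and the inode dirtied. Quota initialization is also keyed on is_quota_modification() (size, uid or gid changes) rather than ATTR_SIZE alone. The replacement sequence, isolated from the hunk above:

    if ((iattr->ia_valid & ATTR_SIZE) &&
        iattr->ia_size != i_size_read(inode)) {
        rc = vmtruncate(inode, iattr->ia_size);   /* explicit truncate */
        if (rc)
            return rc;
    }
    setattr_copy(inode, iattr);    /* copy uid/gid/times/mode into the inode */
    mark_inode_dirty(inode);
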
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index ed9ba6fe04f5..9978803ceedc 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -145,31 +145,32 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
145 return 0; 145 return 0;
146} 146}
147 147
148void jfs_delete_inode(struct inode *inode) 148void jfs_evict_inode(struct inode *inode)
149{ 149{
150 jfs_info("In jfs_delete_inode, inode = 0x%p", inode); 150 jfs_info("In jfs_evict_inode, inode = 0x%p", inode);
151 151
152 if (!is_bad_inode(inode)) 152 if (!inode->i_nlink && !is_bad_inode(inode)) {
153 dquot_initialize(inode); 153 dquot_initialize(inode);
154 154
155 if (!is_bad_inode(inode) && 155 if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
156 (JFS_IP(inode)->fileset == FILESYSTEM_I)) { 156 truncate_inode_pages(&inode->i_data, 0);
157 truncate_inode_pages(&inode->i_data, 0);
158 157
159 if (test_cflag(COMMIT_Freewmap, inode)) 158 if (test_cflag(COMMIT_Freewmap, inode))
160 jfs_free_zero_link(inode); 159 jfs_free_zero_link(inode);
161 160
162 diFree(inode); 161 diFree(inode);
163 162
164 /* 163 /*
165 * Free the inode from the quota allocation. 164 * Free the inode from the quota allocation.
166 */ 165 */
167 dquot_initialize(inode); 166 dquot_initialize(inode);
168 dquot_free_inode(inode); 167 dquot_free_inode(inode);
169 dquot_drop(inode); 168 }
169 } else {
170 truncate_inode_pages(&inode->i_data, 0);
170 } 171 }
171 172 end_writeback(inode);
172 clear_inode(inode); 173 dquot_drop(inode);
173} 174}
174 175
175void jfs_dirty_inode(struct inode *inode) 176void jfs_dirty_inode(struct inode *inode)
@@ -303,8 +304,17 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
303 loff_t pos, unsigned len, unsigned flags, 304 loff_t pos, unsigned len, unsigned flags,
304 struct page **pagep, void **fsdata) 305 struct page **pagep, void **fsdata)
305{ 306{
306 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 307 int ret;
308
309 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
307 jfs_get_block); 310 jfs_get_block);
311 if (unlikely(ret)) {
312 loff_t isize = mapping->host->i_size;
313 if (pos + len > isize)
314 vmtruncate(mapping->host, isize);
315 }
316
317 return ret;
308} 318}
309 319
310static sector_t jfs_bmap(struct address_space *mapping, sector_t block) 320static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
@@ -317,9 +327,24 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
317{ 327{
318 struct file *file = iocb->ki_filp; 328 struct file *file = iocb->ki_filp;
319 struct inode *inode = file->f_mapping->host; 329 struct inode *inode = file->f_mapping->host;
330 ssize_t ret;
320 331
321 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 332 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
322 offset, nr_segs, jfs_get_block, NULL); 333 offset, nr_segs, jfs_get_block, NULL);
334
335 /*
336 * In case of error extending write may have instantiated a few
337 * blocks outside i_size. Trim these off again.
338 */
339 if (unlikely((rw & WRITE) && ret < 0)) {
340 loff_t isize = i_size_read(inode);
341 loff_t end = offset + iov_length(iov, nr_segs);
342
343 if (end > isize)
344 vmtruncate(inode, isize);
345 }
346
347 return ret;
323} 348}
324 349
325const struct address_space_operations jfs_aops = { 350const struct address_space_operations jfs_aops = {
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9e2f6a721668..c92ea3b3ea5e 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2438,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2438 2438
2439 /* check if this is a control page update for an allocation. 2439 /* check if this is a control page update for an allocation.
2440 * if so, update the leaf to reflect the new leaf value using 2440 * if so, update the leaf to reflect the new leaf value using
2441 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate 2441 * dbSplit(); otherwise (deallocation), use dbJoin() to update
2442 * the leaf with the new value. in addition to updating the 2442 * the leaf with the new value. in addition to updating the
2443 * leaf, dbSplit() will also split the binary buddy system of 2443 * leaf, dbSplit() will also split the binary buddy system of
2444 * the leaves, if required, and bubble new values within the 2444 * the leaves, if required, and bubble new values within the
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b67765..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
98 goto fail_unlock; 98 goto fail_unlock;
99 } 99 }
100 100
101 inode->i_uid = current_fsuid(); 101 inode_init_owner(inode, parent, mode);
102 if (parent->i_mode & S_ISGID) {
103 inode->i_gid = parent->i_gid;
104 if (S_ISDIR(mode))
105 mode |= S_ISGID;
106 } else
107 inode->i_gid = current_fsgid();
108
109 /* 102 /*
110 * New inodes need to save sane values on disk when 103 * New inodes need to save sane values on disk when
111 * uid & gid mount options are used 104 * uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
121 if (rc) 114 if (rc)
122 goto fail_drop; 115 goto fail_drop;
123 116
124 inode->i_mode = mode;
125 /* inherit flags from parent */ 117 /* inherit flags from parent */
126 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; 118 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
127 119
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
134 if (S_ISLNK(mode)) 126 if (S_ISLNK(mode))
135 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); 127 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
136 } 128 }
137 jfs_inode->mode2 |= mode; 129 jfs_inode->mode2 |= inode->i_mode;
138 130
139 inode->i_blocks = 0; 131 inode->i_blocks = 0;
140 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 132 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
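
The open-coded owner setup in ialloc() collapses into inode_init_owner(), the helper added in 2.6.35. Roughly what the helper does internally (see fs/inode.c for the real thing) — owner from the caller's credentials, group and setgid bit inherited from a setgid parent directory:

    inode->i_uid = current_fsuid();
    if (dir && dir->i_mode & S_ISGID) {
        inode->i_gid = dir->i_gid;        /* inherit group from setgid dir */
        if (S_ISDIR(mode))
            mode |= S_ISGID;              /* and propagate setgid to subdirs */
    } else
        inode->i_gid = current_fsgid();
    inode->i_mode = mode;

This also explains the jfs_inode->mode2 |= inode->i_mode change: the possibly-modified mode now lives in the inode, not in the local variable.
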
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9e6bda30a6e8..155e91eff07d 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,13 +21,13 @@
21struct fid; 21struct fid;
22 22
23extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
24extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, int);
25extern long jfs_ioctl(struct file *, unsigned int, unsigned long); 25extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode *, struct writeback_control *); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_delete_inode(struct inode *); 30extern void jfs_evict_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
33extern void jfs_truncate_nolock(struct inode *, loff_t); 33extern void jfs_truncate_nolock(struct inode *, loff_t);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b66832ac33ac..ec8c3e4baca3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -132,11 +132,6 @@ static void jfs_destroy_inode(struct inode *inode)
132 kmem_cache_free(jfs_inode_cachep, ji); 132 kmem_cache_free(jfs_inode_cachep, ji);
133} 133}
134 134
135static void jfs_clear_inode(struct inode *inode)
136{
137 dquot_drop(inode);
138}
139
140static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 135static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
141{ 136{
142 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); 137 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -179,6 +174,8 @@ static void jfs_put_super(struct super_block *sb)
179 174
180 jfs_info("In jfs_put_super"); 175 jfs_info("In jfs_put_super");
181 176
177 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
178
182 lock_kernel(); 179 lock_kernel();
183 180
184 rc = jfs_umount(sb); 181 rc = jfs_umount(sb);
@@ -396,10 +393,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
396 393
397 JFS_SBI(sb)->flag = flag; 394 JFS_SBI(sb)->flag = flag;
398 ret = jfs_mount_rw(sb, 1); 395 ret = jfs_mount_rw(sb, 1);
396
397 /* mark the fs r/w for quota activity */
398 sb->s_flags &= ~MS_RDONLY;
399
399 unlock_kernel(); 400 unlock_kernel();
401 dquot_resume(sb, -1);
400 return ret; 402 return ret;
401 } 403 }
402 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 404 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
405 rc = dquot_suspend(sb, -1);
406 if (rc < 0) {
407 unlock_kernel();
408 return rc;
409 }
403 rc = jfs_umount_rw(sb); 410 rc = jfs_umount_rw(sb);
404 JFS_SBI(sb)->flag = flag; 411 JFS_SBI(sb)->flag = flag;
405 unlock_kernel(); 412 unlock_kernel();
@@ -469,6 +476,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
469 */ 476 */
470 sb->s_op = &jfs_super_operations; 477 sb->s_op = &jfs_super_operations;
471 sb->s_export_op = &jfs_export_operations; 478 sb->s_export_op = &jfs_export_operations;
479#ifdef CONFIG_QUOTA
480 sb->dq_op = &dquot_operations;
481 sb->s_qcop = &dquot_quotactl_ops;
482#endif
472 483
473 /* 484 /*
474 * Initialize direct-mapping inode/address-space 485 * Initialize direct-mapping inode/address-space
@@ -749,8 +760,7 @@ static const struct super_operations jfs_super_operations = {
749 .destroy_inode = jfs_destroy_inode, 760 .destroy_inode = jfs_destroy_inode,
750 .dirty_inode = jfs_dirty_inode, 761 .dirty_inode = jfs_dirty_inode,
751 .write_inode = jfs_write_inode, 762 .write_inode = jfs_write_inode,
752 .delete_inode = jfs_delete_inode, 763 .evict_inode = jfs_evict_inode,
753 .clear_inode = jfs_clear_inode,
754 .put_super = jfs_put_super, 764 .put_super = jfs_put_super,
755 .sync_fs = jfs_sync_fs, 765 .sync_fs = jfs_sync_fs,
756 .freeze_fs = jfs_freeze, 766 .freeze_fs = jfs_freeze,
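
jfs moves to the generic quota machinery: fill_super wires up dquot_operations and dquot_quotactl_ops, put_super disables quotas, and remount brackets the r/w transition with the suspend/resume helpers. The remount pattern, reduced to its shape (remounting_rw, go_rw and go_ro stand in for the surrounding logic and jfs_mount_rw()/jfs_umount_rw()):

    if (remounting_rw) {
        ret = go_rw(sb);
        sb->s_flags &= ~MS_RDONLY;   /* quotas need a writable sb */
        dquot_resume(sb, -1);        /* -1 == all quota types */
    } else {
        rc = dquot_suspend(sb, -1);  /* must succeed before going r/o */
        if (rc < 0)
            return rc;
        rc = go_ro(sb);
    }
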
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fa96bbb26343..2d7f165d0f1d 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -86,46 +86,25 @@ struct ea_buffer {
86#define EA_MALLOC 0x0008 86#define EA_MALLOC 0x0008
87 87
88 88
89static int is_known_namespace(const char *name)
90{
91 if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
92 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
93 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
94 strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
95 return false;
96
97 return true;
98}
99
89/* 100/*
90 * These three routines are used to recognize on-disk extended attributes 101 * These three routines are used to recognize on-disk extended attributes
91 * that are in a recognized namespace. If the attribute is not recognized, 102 * that are in a recognized namespace. If the attribute is not recognized,
92 * "os2." is prepended to the name 103 * "os2." is prepended to the name
93 */ 104 */
94static inline int is_os2_xattr(struct jfs_ea *ea) 105static int is_os2_xattr(struct jfs_ea *ea)
95{ 106{
96 /* 107 return !is_known_namespace(ea->name);
97 * Check for "system."
98 */
99 if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) &&
100 !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
101 return false;
102 /*
103 * Check for "user."
104 */
105 if ((ea->namelen >= XATTR_USER_PREFIX_LEN) &&
106 !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
107 return false;
108 /*
109 * Check for "security."
110 */
111 if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) &&
112 !strncmp(ea->name, XATTR_SECURITY_PREFIX,
113 XATTR_SECURITY_PREFIX_LEN))
114 return false;
115 /*
116 * Check for "trusted."
117 */
118 if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) &&
119 !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
120 return false;
121 /*
122 * Add any other valid namespace prefixes here
123 */
124
125 /*
126 * We assume it's OS/2's flat namespace
127 */
128 return true;
129} 108}
130 109
131static inline int name_size(struct jfs_ea *ea) 110static inline int name_size(struct jfs_ea *ea)
@@ -764,13 +743,23 @@ static int can_set_xattr(struct inode *inode, const char *name,
764 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 743 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
765 return can_set_system_xattr(inode, name, value, value_len); 744 return can_set_system_xattr(inode, name, value, value_len);
766 745
746 if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
747 /*
748 * This makes sure that we aren't trying to set an
749 * attribute in a different namespace by prefixing it
750 * with "os2."
751 */
752 if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
753 return -EOPNOTSUPP;
754 return 0;
755 }
756
767 /* 757 /*
768 * Don't allow setting an attribute in an unknown namespace. 758 * Don't allow setting an attribute in an unknown namespace.
769 */ 759 */
770 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && 760 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
771 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && 761 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
772 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && 762 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
773 strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))
774 return -EOPNOTSUPP; 763 return -EOPNOTSUPP;
775 764
776 return 0; 765 return 0;
@@ -952,19 +941,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
952 int xattr_size; 941 int xattr_size;
953 ssize_t size; 942 ssize_t size;
954 int namelen = strlen(name); 943 int namelen = strlen(name);
955 char *os2name = NULL;
956 char *value; 944 char *value;
957 945
958 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
959 os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
960 GFP_KERNEL);
961 if (!os2name)
962 return -ENOMEM;
963 strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
964 name = os2name;
965 namelen -= XATTR_OS2_PREFIX_LEN;
966 }
967
968 down_read(&JFS_IP(inode)->xattr_sem); 946 down_read(&JFS_IP(inode)->xattr_sem);
969 947
970 xattr_size = ea_get(inode, &ea_buf, 0); 948 xattr_size = ea_get(inode, &ea_buf, 0);
@@ -1002,8 +980,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
1002 out: 980 out:
1003 up_read(&JFS_IP(inode)->xattr_sem); 981 up_read(&JFS_IP(inode)->xattr_sem);
1004 982
1005 kfree(os2name);
1006
1007 return size; 983 return size;
1008} 984}
1009 985
@@ -1012,6 +988,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
1012{ 988{
1013 int err; 989 int err;
1014 990
991 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
992 /*
993 * skip past "os2." prefix
994 */
995 name += XATTR_OS2_PREFIX_LEN;
996 /*
997 * Don't allow retrieving properly prefixed attributes
998 * by prepending them with "os2."
999 */
1000 if (is_known_namespace(name))
1001 return -EOPNOTSUPP;
1002 }
1003
1015 err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); 1004 err = __jfs_getxattr(dentry->d_inode, name, data, buf_size);
1016 1005
1017 return err; 1006 return err;
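
Net effect of the jfs "os2." rework, seen from userspace: legacy flat OS/2 names are still reached through the prefix, but the prefix can no longer be used to smuggle a name from a real namespace past the namespace checks. An illustrative fragment (the attribute names are made up):

    #include <sys/xattr.h>

    /* legacy flat name: "os2." is stripped and "someattr" looked up */
    getxattr(path, "os2.someattr", buf, sizeof(buf));

    /* "user.foo" is a known namespace: now rejected with EOPNOTSUPP */
    getxattr(path, "os2.user.foo", buf, sizeof(buf));

The same rule is enforced symmetrically for setxattr() via can_set_xattr().
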
diff --git a/fs/libfs.c b/fs/libfs.c
index ea9a6cc9b35c..0a9da95317f7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
+#include <linux/quotaops.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
@@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
 	return NULL;
 }

-int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	return 0;
-}
-
 int dcache_dir_open(struct inode *inode, struct file *file)
 {
 	static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = {
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
 	.readdir	= dcache_readdir,
-	.fsync		= simple_sync_file,
+	.fsync		= noop_fsync,
 };

 const struct inode_operations simple_dir_inode_operations = {
@@ -330,6 +326,39 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return 0;
 }

+/**
+ * simple_setattr - setattr for simple filesystem
+ * @dentry: dentry
+ * @iattr: iattr structure
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setattr is a simple ->setattr implementation without a proper
+ * implementation of size changes.
+ *
+ * It can either be used for in-memory filesystems or special files
+ * on simple regular filesystems.  Anything that needs to change on-disk
+ * or wire state on size changes needs its own setattr method.
+ */
+int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	WARN_ON_ONCE(inode->i_op->truncate);
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	if (iattr->ia_valid & ATTR_SIZE)
+		truncate_setsize(inode, iattr->ia_size);
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+EXPORT_SYMBOL(simple_setattr);
+
 int simple_readpage(struct file *file, struct page *page)
 {
 	clear_highpage(page);
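
For a filesystem that keeps everything in the page cache, wiring the new helper up is the whole job. A minimal sketch (the ops table name is illustrative; ramfs-style filesystems are the intended users):

	/* Hypothetical user of the new helper. */
	static const struct inode_operations myfs_file_inode_operations = {
		.setattr	= simple_setattr,	/* shrinks/extends pagecache only */
		.getattr	= simple_getattr,
	};

The WARN_ON_ONCE(inode->i_op->truncate) documents the contract: a filesystem that still relies on the old ->truncate callback must not use simple_setattr, because truncate_setsize() already performs the pagecache truncation that ->truncate used to be called for.
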
@@ -418,7 +447,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
  * unique inode values later for this filesystem, then you must take care
  * to pass it an appropriate max_reserved value to avoid collisions.
  */
-int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+int simple_fill_super(struct super_block *s, unsigned long magic,
+		      struct tree_descr *files)
 {
 	struct inode *inode;
 	struct dentry *root;
@@ -547,6 +577,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
 }

 /**
+ * simple_write_to_buffer - copy data from user space to the buffer
+ * @to: the buffer to write to
+ * @available: the size of the buffer
+ * @ppos: the current position in the buffer
+ * @from: the user space buffer to read from
+ * @count: the maximum number of bytes to read
+ *
+ * The simple_write_to_buffer() function reads up to @count bytes from the user
+ * space address starting at @from into the buffer @to at offset @ppos.
+ *
+ * On success, the number of bytes written is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
+ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
+		const void __user *from, size_t count)
+{
+	loff_t pos = *ppos;
+	size_t res;
+
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= available || !count)
+		return 0;
+	if (count > available - pos)
+		count = available - pos;
+	res = copy_from_user(to + pos, from, count);
+	if (res == count)
+		return -EFAULT;
+	count -= res;
+	*ppos = pos + count;
+	return count;
+}
+
+/**
  * memory_read_from_buffer - copy data from the buffer
  * @to: the kernel space buffer to read to
  * @count: the maximum number of bytes to read
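
simple_write_to_buffer() is the write-side twin of simple_read_from_buffer() and is aimed at the same audience: small pseudo-files whose backing store is a plain kernel buffer. A hedged usage sketch (names invented for illustration):

	/* Hypothetical debugfs-style handler backed by a fixed buffer. */
	static char cmd_buf[64];

	static ssize_t cmd_write(struct file *file, const char __user *ubuf,
				 size_t count, loff_t *ppos)
	{
		/* Returns bytes copied and advances *ppos, or -EFAULT/-EINVAL. */
		return simple_write_to_buffer(cmd_buf, sizeof(cmd_buf), ppos,
					      ubuf, count);
	}

Note the partial-copy behaviour: if copy_from_user() faults part-way through, the bytes that did arrive are still counted and the position still advances; only a complete failure yields -EFAULT.
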
@@ -817,13 +881,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);

-int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+/**
+ * generic_file_fsync - generic fsync implementation for simple filesystems
+ * @file: file to synchronize
+ * @datasync: only synchronize essential metadata if true
+ *
+ * This is a generic implementation of the fsync method for simple
+ * filesystems which track all non-inode metadata in the buffers list
+ * hanging off the address_space structure.
+ */
+int generic_file_fsync(struct file *file, int datasync)
 {
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = 0, /* metadata-only; caller takes care of data */
 	};
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	int err;
 	int ret;

@@ -838,7 +911,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
 		ret = err;
 	return ret;
 }
-EXPORT_SYMBOL(simple_fsync);
+EXPORT_SYMBOL(generic_file_fsync);
+
+/*
+ * No-op implementation of ->fsync for in-memory filesystems.
+ */
+int noop_fsync(struct file *file, int datasync)
+{
+	return 0;
+}

 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
@@ -861,9 +942,10 @@ EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
 EXPORT_SYMBOL(simple_rmdir);
 EXPORT_SYMBOL(simple_statfs);
-EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(noop_fsync);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(simple_write_to_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
 EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
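
Two renames land together here: simple_sync_file becomes noop_fsync (it truly does nothing) and simple_fsync becomes generic_file_fsync, at the same time as ->fsync loses its struct dentry * argument across the tree. A hedged before/after sketch of what an affected filesystem has to do (myfs is a stand-in name):

	/* old prototype: int (*fsync)(struct file *, struct dentry *, int); */
	/* new prototype: int (*fsync)(struct file *, int datasync);         */
	static int myfs_fsync(struct file *file, int datasync)
	{
		/* The inode now comes from the file, not from a dentry;
		 * filesystem-specific work would go here. */
		return generic_file_fsync(file, datasync);
	}

In-memory filesystems that used simple_sync_file just switch their file_operations entry to noop_fsync, as simple_dir_operations does above.
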
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 243c00071f76..9bd2ce2a3040 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -303,6 +303,11 @@ static void bdev_put_device(struct super_block *sb)
 	close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
 }

+static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	return 0;
+}
+
 static const struct logfs_device_ops bd_devops = {
 	.find_first_sb	= bdev_find_first_sb,
 	.find_last_sb	= bdev_find_last_sb,
@@ -310,6 +315,7 @@ static const struct logfs_device_ops bd_devops = {
 	.readpage	= bdev_readpage,
 	.writeseg	= bdev_writeseg,
 	.erase		= bdev_erase,
+	.can_write_buf	= bdev_can_write_buf,
 	.sync		= bdev_sync,
 	.put_device	= bdev_put_device,
 };
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index cafb6ef2e05b..a85d47d13e4b 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -9,6 +9,7 @@
 #include <linux/completion.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/slab.h>

 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))

@@ -126,7 +127,8 @@ static int mtd_readpage(void *_sb, struct page *page)

 	err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
 			page_address(page));
-	if (err == -EUCLEAN) {
+	if (err == -EUCLEAN || err == -EBADMSG) {
+		/* -EBADMSG happens regularly on power failures */
 		err = 0;
 		/* FIXME: force GC this segment */
 	}
@@ -233,12 +235,32 @@ static void mtd_put_device(struct super_block *sb)
 	put_mtd_device(logfs_super(sb)->s_mtd);
 }

+static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *buf;
+	int err;
+
+	buf = kmalloc(super->s_writesize, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	err = mtd_read(sb, ofs, super->s_writesize, buf);
+	if (err)
+		goto out;
+	if (memchr_inv(buf, 0xff, super->s_writesize))
+		err = -EIO;
+	kfree(buf);
+out:
+	return err;
+}
+
 static const struct logfs_device_ops mtd_devops = {
 	.find_first_sb	= mtd_find_first_sb,
 	.find_last_sb	= mtd_find_last_sb,
 	.readpage	= mtd_readpage,
 	.writeseg	= mtd_writeseg,
 	.erase		= mtd_erase,
+	.can_write_buf	= mtd_can_write_buf,
 	.sync		= mtd_sync,
 	.put_device	= mtd_put_device,
 };
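
The MTD probe decides whether the pending write buffer may still be flushed at ofs: memchr_inv() returns a pointer to the first byte that differs from the given pattern, or NULL when the whole range matches, so a NULL result here means the region is still erased flash (all 0xff) and the wbuf can be replayed. A hedged standalone illustration of that idiom (assuming the same memchr_inv() helper that logfs uses here):

	/* Illustrative only: true iff the region is fully erased (0xff). */
	static bool region_is_erased(const u8 *region, size_t len)
	{
		return memchr_inv(region, 0xff, len) == NULL;
	}

The block-device implementation above returns 0 unconditionally because a block device can always overwrite in place; only flash with a write-once granularity of s_writesize needs the probe.
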
@@ -250,5 +272,7 @@ int logfs_get_sb_mtd(struct file_system_type *type, int flags,
 	const struct logfs_device_ops *devops = &mtd_devops;

 	mtd = get_mtd_device(NULL, mtdnr);
+	if (IS_ERR(mtd))
+		return PTR_ERR(mtd);
 	return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
 }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 2396a85c0f55..9777eb5b5522 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -12,7 +12,7 @@
  * Atomic dir operations
  *
  * Directory operations are by default not atomic. Dentries and Inodes are
- * created/removed/altered in seperate operations. Therefore we need to do
+ * created/removed/altered in separate operations. Therefore we need to do
  * a small amount of journaling.
  *
  * Create, link, mkdir, mknod and symlink all share the same function to do
@@ -434,8 +434,11 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
 	int ret;

 	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
-	if (!ta)
+	if (!ta) {
+		inode->i_nlink--;
+		iput(inode);
 		return -ENOMEM;
+	}

 	ta->state = CREATE_1;
 	ta->ino = inode->i_ino;
@@ -821,7 +824,7 @@ const struct inode_operations logfs_dir_iops = {
 };
 const struct file_operations logfs_dir_fops = {
 	.fsync		= logfs_fsync,
-	.ioctl		= logfs_ioctl,
+	.unlocked_ioctl	= logfs_ioctl,
 	.readdir	= logfs_readdir,
 	.read		= generic_read_dir,
 };
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 370f367a933e..e86376b87af1 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -161,7 +161,17 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)

 static void logfs_invalidatepage(struct page *page, unsigned long offset)
 {
-	move_page_to_btree(page);
+	struct logfs_block *block = logfs_block(page);
+
+	if (block->reserved_bytes) {
+		struct super_block *sb = page->mapping->host->i_sb;
+		struct logfs_super *super = logfs_super(sb);
+
+		super->s_dirty_pages -= block->reserved_bytes;
+		block->ops->free_block(sb, block);
+		BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
+	} else
+		move_page_to_btree(page);
 	BUG_ON(PagePrivate(page) || page->private);
 }

@@ -171,9 +181,9 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
 }


-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		unsigned long arg)
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct logfs_inode *li = logfs_inode(inode);
 	unsigned int oldflags, flags;
 	int err;
@@ -209,13 +219,11 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 	}
 }

-int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int logfs_fsync(struct file *file, int datasync)
 {
-	struct super_block *sb = dentry->d_inode->i_sb;
-	struct logfs_super *super = logfs_super(sb);
+	struct super_block *sb = file->f_mapping->host->i_sb;

-	/* FIXME: write anchor */
-	super->s_devops->sync(sb);
+	logfs_write_anchor(sb);
 	return 0;
 }

@@ -224,15 +232,19 @@ static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = dentry->d_inode;
 	int err = 0;

-	if (attr->ia_valid & ATTR_SIZE)
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_SIZE) {
 		err = logfs_truncate(inode, attr->ia_size);
-	attr->ia_valid &= ~ATTR_SIZE;
+		if (err)
+			return err;
+	}

-	if (!err)
-		err = inode_change_ok(inode, attr);
-	if (!err)
-		err = inode_setattr(inode, attr);
-	return err;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }

 const struct inode_operations logfs_reg_iops = {
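
The reshaped logfs_setattr() is the new-world ->setattr pattern introduced with this merge: validate first with inode_change_ok(), perform the filesystem's own size change, then copy the remaining attributes with setattr_copy() and dirty the inode; inode_setattr() is gone. A hedged template for any filesystem following suit (myfs and myfs_do_truncate are placeholders):

	static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int err;

		err = inode_change_ok(inode, attr);	/* permission/limit checks */
		if (err)
			return err;

		if (attr->ia_valid & ATTR_SIZE) {
			err = myfs_do_truncate(inode, attr->ia_size);
			if (err)
				return err;
		}

		setattr_copy(inode, attr);	/* uid/gid/mode/times into the inode */
		mark_inode_dirty(inode);
		return 0;
	}
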
@@ -243,7 +255,7 @@ const struct file_operations logfs_reg_fops = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.fsync		= logfs_fsync,
-	.ioctl		= logfs_ioctl,
+	.unlocked_ioctl	= logfs_ioctl,
 	.llseek		= generic_file_llseek,
 	.mmap		= generic_file_readonly_mmap,
 	.open		= generic_file_open,
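
Both logfs file_operations tables move from ->ioctl to ->unlocked_ioctl in this merge. The converted hook returns long, runs without the BKL, and no longer receives the inode, so the handler fetches it from the file itself, exactly as logfs_ioctl() now does. A generic sketch of the conversion for some other filesystem (names hypothetical):

	/* old: int  (*ioctl)(struct inode *, struct file *,
	 *                    unsigned int, unsigned long), under the BKL
	 * new: long (*unlocked_ioctl)(struct file *,
	 *                    unsigned int, unsigned long), no BKL        */
	static long myfs_ioctl(struct file *file, unsigned int cmd,
			       unsigned long arg)
	{
		/* recover the inode from the file, as logfs_ioctl() now does */
		struct inode *inode = file->f_path.dentry->d_inode;

		switch (cmd) {
		case FS_IOC_GETVERSION:
			return put_user(inode->i_generation, (int __user *)arg);
		default:
			return -ENOTTY;
		}
	}

Any serialization the old BKL provided has to be taken explicitly inside the handler now.
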
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 76c242fbe1b0..caa4419285dc 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -122,7 +122,7 @@ static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
 	logfs_safe_iput(inode, cookie);
 }

-static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct logfs_segment_header sh;
@@ -401,7 +401,7 @@ static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
 			segno, (u64)segno << super->s_segshift,
 			dist, no_free_segments(sb), valid,
 			super->s_free_bytes);
-	cleaned = logfs_gc_segment(sb, segno, dist);
+	cleaned = logfs_gc_segment(sb, segno);
 	log_gc("GC segment #%02x complete - now %x valid\n", segno,
 			valid - cleaned);
 	BUG_ON(cleaned != valid);
@@ -632,38 +632,31 @@ static int check_area(struct super_block *sb, int i)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct logfs_area *area = super->s_area[i];
-	struct logfs_object_header oh;
+	gc_level_t gc_level;
+	u32 cleaned, valid, ec;
 	u32 segno = area->a_segno;
-	u32 ofs = area->a_used_bytes;
-	__be32 crc;
-	int err;
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);

 	if (!area->a_is_open)
 		return 0;

-	for (ofs = area->a_used_bytes;
-			ofs <= super->s_segsize - sizeof(oh);
-			ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
-		err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
-		if (err)
-			return err;
-
-		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
-			break;
+	if (super->s_devops->can_write_buf(sb, ofs) == 0)
+		return 0;

-		crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
-		if (crc != oh.crc) {
-			printk(KERN_INFO "interrupted header at %llx\n",
-					dev_ofs(sb, segno, ofs));
-			return 0;
-		}
-	}
-	if (ofs != area->a_used_bytes) {
-		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
-				ofs - area->a_used_bytes,
-				dev_ofs(sb, segno, area->a_used_bytes));
-		area->a_used_bytes = ofs;
-	}
+	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
+	/*
+	 * The device cannot write back the write buffer. Most likely the
+	 * wbuf was already written out and the system crashed at some point
+	 * before the journal commit happened. In that case we wouldn't have
+	 * to do anything. But if the crash happened before the wbuf was
+	 * written out correctly, we must GC this segment. So assume the
+	 * worst and always do the GC run.
+	 */
+	area->a_is_open = 0;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	cleaned = logfs_gc_segment(sb, segno);
+	if (cleaned != valid)
+		return -EIO;
 	return 0;
 }

diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 14ed27274da2..d8c71ece098f 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -193,6 +193,7 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
 	inode->i_ctime	 = CURRENT_TIME;
 	inode->i_mtime	 = CURRENT_TIME;
 	inode->i_nlink	 = 1;
+	li->li_refcount  = 1;
 	INIT_LIST_HEAD(&li->li_freeing_list);

 	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
@@ -234,33 +235,21 @@ static struct inode *logfs_alloc_inode(struct super_block *sb)
 * purpose is to create a new inode that will not trigger the warning if such
 * an inode is still in use. An ugly hack, no doubt. Suggestions for
 * improvement are welcome.
+ *
+ * AV: that's what ->put_super() is for...
 */
struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
{
	struct inode *inode;

-	inode = logfs_alloc_inode(sb);
+	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	inode->i_mode = S_IFREG;
	inode->i_ino = ino;
-	inode->i_sb = sb;
-
-	/* This is a blatant copy of alloc_inode code. We'd need alloc_inode
-	 * to be nonstatic, alas. */
-	{
-		struct address_space * const mapping = &inode->i_data;
-
-		mapping->a_ops = &logfs_reg_aops;
-		mapping->host = inode;
-		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, GFP_NOFS);
-		mapping->assoc_mapping = NULL;
-		mapping->backing_dev_info = &default_backing_dev_info;
-		inode->i_mapping = mapping;
-		inode->i_nlink = 1;
-	}
+	inode->i_data.a_ops = &logfs_reg_aops;
+	mapping_set_gfp_mask(&inode->i_data, GFP_NOFS);

	return inode;
}
@@ -276,7 +265,7 @@ struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)

 	err = logfs_read_inode(inode);
 	if (err) {
-		destroy_meta_inode(inode);
+		iput(inode);
 		return ERR_PTR(err);
 	}
 	logfs_inode_setops(inode);
@@ -297,18 +286,8 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }

-void destroy_meta_inode(struct inode *inode)
-{
-	if (inode) {
-		if (inode->i_data.nrpages)
-			truncate_inode_pages(&inode->i_data, 0);
-		logfs_clear_inode(inode);
-		kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
-	}
-}
-
 /* called with inode_lock held */
-static void logfs_drop_inode(struct inode *inode)
+static int logfs_drop_inode(struct inode *inode)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
 	struct logfs_inode *li = logfs_inode(inode);
@@ -316,7 +295,7 @@ static void logfs_drop_inode(struct inode *inode)
 	spin_lock(&logfs_inode_lock);
 	list_move(&li->li_freeing_list, &super->s_freeing_list);
 	spin_unlock(&logfs_inode_lock);
-	generic_drop_inode(inode);
+	return generic_drop_inode(inode);
 }

 static void logfs_set_ino_generation(struct super_block *sb,
@@ -326,7 +305,7 @@ static void logfs_set_ino_generation(struct super_block *sb,
 	u64 ino;

 	mutex_lock(&super->s_journal_mutex);
-	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
+	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
 	super->s_last_ino = ino;
 	super->s_inos_till_wrap--;
 	if (super->s_inos_till_wrap < 0) {
@@ -357,14 +336,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
 	inode->i_mode = mode;
 	logfs_set_ino_generation(sb, inode);

-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	}
-
+	inode_init_owner(inode, dir, mode);
 	logfs_inode_setops(inode);
 	insert_inode_hash(inode);

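
inode_init_owner() is one of the new VFS helpers this merge pulls in: it encapsulates exactly the uid/gid/setgid-directory logic deleted above. A hedged sketch of its use in some other filesystem's create path (myfs_make_inode is invented):

	/* Hypothetical create-path helper using the new API. */
	static struct inode *myfs_make_inode(struct super_block *sb,
					     struct inode *dir, int mode)
	{
		struct inode *inode = new_inode(sb);

		if (inode)
			/* fsuid/fsgid, plus S_ISGID inheritance from @dir */
			inode_init_owner(inode, dir, mode);
		return inode;
	}
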
@@ -386,17 +358,25 @@ static void logfs_init_once(void *_li)

 static int logfs_sync_fs(struct super_block *sb, int wait)
 {
-	/* FIXME: write anchor */
-	logfs_super(sb)->s_devops->sync(sb);
+	logfs_write_anchor(sb);
 	return 0;
 }

+static void logfs_put_super(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	/* kill the meta-inodes */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
+}
+
 const struct super_operations logfs_super_operations = {
 	.alloc_inode	= logfs_alloc_inode,
-	.clear_inode	= logfs_clear_inode,
-	.delete_inode	= logfs_delete_inode,
 	.destroy_inode	= logfs_destroy_inode,
+	.evict_inode	= logfs_evict_inode,
 	.drop_inode	= logfs_drop_inode,
+	.put_super	= logfs_put_super,
 	.write_inode	= logfs_write_inode,
 	.statfs		= logfs_statfs,
 	.sync_fs	= logfs_sync_fs,
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index fb0a613f885b..f46ee8b0e135 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -132,10 +132,9 @@ static int read_area(struct super_block *sb, struct logfs_je_area *a)

 	ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
 	if (super->s_writesize > 1)
-		logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
+		return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
 	else
-		logfs_buf_recover(area, ofs, NULL, 0);
-	return 0;
+		return logfs_buf_recover(area, ofs, NULL, 0);
 }

 static void *unpack(void *from, void *to)
@@ -245,7 +244,7 @@ static int read_je(struct super_block *sb, u64 ofs)
 		read_erasecount(sb, unpack(jh, scratch));
 		break;
 	case JE_AREA:
-		read_area(sb, unpack(jh, scratch));
+		err = read_area(sb, unpack(jh, scratch));
 		break;
 	case JE_OBJ_ALIAS:
 		err = logfs_load_object_aliases(sb, unpack(jh, scratch),
@@ -890,8 +889,6 @@ void logfs_cleanup_journal(struct super_block *sb)
 	struct logfs_super *super = logfs_super(sb);

 	btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
-	destroy_meta_inode(super->s_master_inode);
-	super->s_master_inode = NULL;

 	kfree(super->s_compressed_je);
 	kfree(super->s_je);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 0a3df1a0c936..b8786264d243 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -144,6 +144,7 @@ struct logfs_area_ops {
 * @erase:			erase one segment
 * @read:			read from the device
 * @erase:			erase part of the device
+ * @can_write_buf:		decide whether wbuf can be written to ofs
 */
struct logfs_device_ops {
	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
@@ -153,6 +154,7 @@ struct logfs_device_ops {
 	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
 	int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
 			int ensure_write);
+	int (*can_write_buf)(struct super_block *sb, u64 ofs);
 	void (*sync)(struct super_block *sb);
 	void (*put_device)(struct super_block *sb);
 };
@@ -394,6 +396,7 @@ struct logfs_super {
 	int s_lock_count;
 	mempool_t *s_block_pool;	/* struct logfs_block pool */
 	mempool_t *s_shadow_pool;	/* struct logfs_shadow pool */
+	struct list_head s_writeback_list; /* writeback pages */
 	/*
 	 * Space accounting:
 	 * - s_used_bytes specifies space used to store valid data objects.
@@ -501,9 +504,8 @@ extern const struct inode_operations logfs_reg_iops;
 extern const struct file_operations logfs_reg_fops;
 extern const struct address_space_operations logfs_reg_aops;
 int logfs_readpage(struct file *file, struct page *page);
-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		unsigned long arg);
-int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int logfs_fsync(struct file *file, int datasync);

 /* gc.c */
 u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
@@ -522,13 +524,11 @@ struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
 struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
 int logfs_init_inode_cache(void);
 void logfs_destroy_inode_cache(void);
-void destroy_meta_inode(struct inode *inode);
 void logfs_set_blocks(struct inode *inode, u64 no);
 /* these logically belong into inode.c but actually reside in readwrite.c */
 int logfs_read_inode(struct inode *inode);
 int __logfs_write_inode(struct inode *inode, long flags);
-void logfs_delete_inode(struct inode *inode);
-void logfs_clear_inode(struct inode *inode);
+void logfs_evict_inode(struct inode *inode);

 /* journal.c */
 void logfs_write_anchor(struct super_block *sb);
@@ -598,19 +598,19 @@ void freeseg(struct super_block *sb, u32 segno);
 int logfs_init_areas(struct super_block *sb);
 void logfs_cleanup_areas(struct super_block *sb);
 int logfs_open_area(struct logfs_area *area, size_t bytes);
-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler);

-static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 0);
+	return __logfs_buf_write(area, ofs, buf, len, 0);
 }

-static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 1);
+	return __logfs_buf_write(area, ofs, buf, len, 1);
 }

 /* super.c */
@@ -704,7 +704,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level)
 	u8 level = (__force u8)__level;

 	if (ino == LOGFS_INO_MASTER) {
-		/* ifile has seperate areas */
+		/* ifile has separate areas */
 		level += LOGFS_MAX_LEVELS;
 	}
 	return (__force gc_level_t)level;
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
index f674725663fe..ae960519c54a 100644
--- a/fs/logfs/logfs_abi.h
+++ b/fs/logfs/logfs_abi.h
@@ -50,9 +50,9 @@ static inline void check_##type(void) \
 *	12	- gc recycled blocks, long-lived data
 *	13	- replacement blocks, short-lived data
 *
- * Levels 1-11 are necessary for robust gc operations and help seperate
+ * Levels 1-11 are necessary for robust gc operations and help separate
 * short-lived metadata from longer-lived file data. In the future,
- * file data should get seperated into several segments based on simple
+ * file data should get separated into several segments based on simple
 * heuristics. Old data recycled during gc operation is expected to be
 * long-lived. New data is of uncertain life expectancy. New data
 * used to replace older blocks in existing files is expected to be
@@ -117,7 +117,7 @@ static inline void check_##type(void) \
 #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)

 /*
- * LogFS needs to seperate data into levels. Each level is defined as the
+ * LogFS needs to separate data into levels. Each level is defined as the
 * maximal possible distance from the master inode (inode of the inode file).
 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
@@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
 * @ds_crc:			crc32 of structure starting with the next field
 * @ds_ifile_levels:		maximum number of levels for ifile
 * @ds_iblock_levels:		maximum number of levels for regular files
- * @ds_data_levels:		number of seperate levels for data
+ * @ds_data_levels:		number of separate levels for data
 * @pad0:			reserved, must be 0
 * @ds_feature_incompat:	incompatible filesystem features
 * @ds_feature_ro_compat:	read-only compatible filesystem features
@@ -456,7 +456,7 @@ enum logfs_vim {
 * @vim:			life expectancy of data
 *
 * "Areas" are segments currently being used for writing. There is at least
- * one area per GC level. Several may be used to seperate long-living from
+ * one area per GC level. Several may be used to separate long-living from
 * short-living data. If an area with unknown vim is encountered, it can
 * simply be closed.
 * The write buffer immediately follows this header.
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 3159db6958e5..6127baf0e188 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -892,6 +892,8 @@ u64 logfs_seek_hole(struct inode *inode, u64 bix)
 		return bix;
 	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
 		bix = maxbix(li->li_height);
+	else if (bix >= maxbix(li->li_height))
+		return bix;
 	else {
 		bix = seek_holedata_loop(inode, bix, 0);
 		if (bix < maxbix(li->li_height))
@@ -1093,17 +1095,25 @@ static int logfs_reserve_bytes(struct inode *inode, int bytes)
 int get_page_reserve(struct inode *inode, struct page *page)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
 	int ret;

-	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+	if (block && block->reserved_bytes)
 		return 0;

 	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
-	ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
+	while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
+			!list_empty(&super->s_writeback_list)) {
+		block = list_entry(super->s_writeback_list.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+	}
 	if (!ret) {
 		alloc_data_block(inode, page);
-		logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+		block = logfs_block(page);
+		block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
 		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+		list_move_tail(&block->alias_list, &super->s_writeback_list);
 	}
 	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
 	return ret;
@@ -1861,7 +1871,7 @@ int logfs_truncate(struct inode *inode, u64 target)
 		size = target;

 	logfs_get_wblocks(sb, NULL, 1);
-	err = __logfs_truncate(inode, target);
+	err = __logfs_truncate(inode, size);
 	if (!err)
 		err = __logfs_write_inode(inode, 0);
 	logfs_put_wblocks(sb, NULL, 1);
@@ -1962,31 +1972,6 @@ static struct page *inode_to_page(struct inode *inode)
 	return page;
 }

-/* Cheaper version of write_inode. All changes are concealed in
- * aliases, which are moved back. No write to the medium happens.
- */
-void logfs_clear_inode(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct logfs_inode *li = logfs_inode(inode);
-	struct logfs_block *block = li->li_block;
-	struct page *page;
-
-	/* Only deleted files may be dirty at this point */
-	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
-	if (!block)
-		return;
-	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
-		block->ops->free_block(inode->i_sb, block);
-		return;
-	}
-
-	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
-	page = inode_to_page(inode);
-	BUG_ON(!page); /* FIXME: Use emergency page */
-	logfs_put_write_page(page);
-}
-
 static int do_write_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
@@ -2154,18 +2139,40 @@ static int do_delete_inode(struct inode *inode)
 * ZOMBIE inodes have already been deleted before and should remain dead,
 * if it weren't for valid checking. No need to kill them again here.
 */
-void logfs_delete_inode(struct inode *inode)
+void logfs_evict_inode(struct inode *inode)
 {
+	struct super_block *sb = inode->i_sb;
 	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = li->li_block;
+	struct page *page;

-	if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
-		li->li_flags |= LOGFS_IF_ZOMBIE;
-		if (i_size_read(inode) > 0)
-			logfs_truncate(inode, 0);
-		do_delete_inode(inode);
+	if (!inode->i_nlink) {
+		if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+			li->li_flags |= LOGFS_IF_ZOMBIE;
+			if (i_size_read(inode) > 0)
+				logfs_truncate(inode, 0);
+			do_delete_inode(inode);
+		}
 	}
 	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	end_writeback(inode);
+
+	/* Cheaper version of write_inode. All changes are concealed in
+	 * aliases, which are moved back. No write to the medium happens.
+	 */
+	/* Only deleted files may be dirty at this point */
+	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
+	if (!block)
+		return;
+	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
+		block->ops->free_block(inode->i_sb, block);
+		return;
+	}
+
+	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
+	page = inode_to_page(inode);
+	BUG_ON(!page); /* FIXME: Use emergency page */
+	logfs_put_write_page(page);
 }

@@ -2249,6 +2256,7 @@ int logfs_init_rw(struct super_block *sb)
 	int min_fill = 3 * super->s_no_blocks;

 	INIT_LIST_HEAD(&super->s_object_alias);
+	INIT_LIST_HEAD(&super->s_writeback_list);
 	mutex_init(&super->s_write_mutex);
 	super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
 			sizeof(struct logfs_block));
@@ -2261,7 +2269,6 @@ void logfs_cleanup_rw(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);

-	destroy_meta_inode(super->s_segfile_inode);
 	logfs_mempool_destroy(super->s_block_pool);
 	logfs_mempool_destroy(super->s_shadow_pool);
 }
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index f77ce2b470ba..9d5187353255 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -67,7 +67,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
 	return page;
 }

-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler)
 {
 	pgoff_t index = ofs >> PAGE_SHIFT;
@@ -81,8 +81,10 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		copylen = min((ulong)len, PAGE_SIZE - offset);

 		page = get_mapping_page(area->a_sb, index, use_filler);
-		SetPageUptodate(page);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
 		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
 		memcpy(page_address(page) + offset, buf, copylen);
 		SetPagePrivate(page);
 		page_cache_release(page);
@@ -92,6 +94,7 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		offset = 0;
 		index++;
 	} while (len);
+	return 0;
 }

 static void pad_partial_page(struct logfs_area *area)
@@ -926,5 +929,4 @@ void logfs_cleanup_areas(struct super_block *sb)
 	for_each_area(i)
 		free_area(super->s_area[i]);
 	free_area(super->s_journal_area);
-	destroy_meta_inode(super->s_mapping_inode);
 }
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index d7c23ed8349a..5336155c5d81 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -138,10 +138,14 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
 	sb->s_fs_info = super;
 	sb->s_mtd = super->s_mtd;
 	sb->s_bdev = super->s_bdev;
+#ifdef CONFIG_BLOCK
 	if (sb->s_bdev)
 		sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+#endif
+#ifdef CONFIG_MTD
 	if (sb->s_mtd)
 		sb->s_bdi = sb->s_mtd->backing_dev_info;
+#endif
 	return 0;
 }

@@ -338,24 +342,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 		goto fail;
 	}

+	/* at that point we know that ->put_super() will be called */
 	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
 	if (!super->s_erase_page)
-		goto fail;
+		return -ENOMEM;
 	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);

 	/* FIXME: check for read-only mounts */
 	err = logfs_make_writeable(sb);
-	if (err)
-		goto fail1;
+	if (err) {
+		__free_page(super->s_erase_page);
+		return err;
+	}

 	log_super("LogFS: Finished mounting\n");
 	simple_set_mnt(mnt, sb);
 	return 0;

-fail1:
-	__free_page(super->s_erase_page);
 fail:
-	iput(logfs_super(sb)->s_master_inode);
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
 	return -EIO;
 }

@@ -382,7 +389,7 @@ static struct page *find_super_block(struct super_block *sb)
 	if (!first || IS_ERR(first))
 		return NULL;
 	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
-	if (!last || IS_ERR(first)) {
+	if (!last || IS_ERR(last)) {
 		page_cache_release(first);
 		return NULL;
 	}
@@ -413,7 +420,7 @@ static int __logfs_read_sb(struct super_block *sb)

 	page = find_super_block(sb);
 	if (!page)
-		return -EIO;
+		return -EINVAL;

 	ds = page_address(page);
 	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
@@ -576,10 +583,14 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
 	sb->s_flags |= MS_ACTIVE;
 	err = logfs_get_sb_final(sb, mnt);
 	if (err)
-		goto err1;
-	return 0;
+		deactivate_locked_super(sb);
+	return err;

 err1:
+	/* no ->s_root, no ->put_super() */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
 	deactivate_locked_super(sb);
 	return err;
 err0:
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a9..93444747237b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -79,15 +79,12 @@ EXPORT_SYMBOL(mb_cache_entry_find_next);
 struct mb_cache {
 	struct list_head		c_cache_list;
 	const char			*c_name;
-	struct mb_cache_op		c_op;
 	atomic_t			c_entry_count;
+	int				c_max_entries;
 	int				c_bucket_bits;
-#ifndef MB_CACHE_INDEXES_COUNT
-	int				c_indexes_count;
-#endif
-	struct kmem_cache		*c_entry_cache;
+	struct kmem_cache		*c_entry_cache;
 	struct list_head		*c_block_hash;
-	struct list_head		*c_indexes_hash[0];
+	struct list_head		*c_index_hash;
 };


@@ -101,21 +98,11 @@ static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);

-static inline int
-mb_cache_indexes(struct mb_cache *cache)
-{
-#ifdef MB_CACHE_INDEXES_COUNT
-	return MB_CACHE_INDEXES_COUNT;
-#else
-	return cache->c_indexes_count;
-#endif
-}
-
 /*
 * What the mbcache registers as to get shrunk dynamically.
 */

-static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);

 static struct shrinker mb_cache_shrinker = {
 	.shrink = mb_cache_shrink_fn,
@@ -132,12 +119,9 @@ __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
 static void
 __mb_cache_entry_unhash(struct mb_cache_entry *ce)
 {
-	int n;
-
 	if (__mb_cache_entry_is_hashed(ce)) {
 		list_del_init(&ce->e_block_list);
-		for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
-			list_del(&ce->e_indexes[n].o_list);
+		list_del(&ce->e_index.o_list);
 	}
 }

@@ -148,16 +132,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 	struct mb_cache *cache = ce->e_cache;

 	mb_assert(!(ce->e_used || ce->e_queued));
-	if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
-		/* free failed -- put back on the lru list
-		   for freeing later. */
-		spin_lock(&mb_cache_spinlock);
-		list_add(&ce->e_lru_list, &mb_cache_lru_list);
-		spin_unlock(&mb_cache_spinlock);
-	} else {
-		kmem_cache_free(cache->c_entry_cache, ce);
-		atomic_dec(&cache->c_entry_count);
-	}
+	kmem_cache_free(cache->c_entry_cache, ce);
+	atomic_dec(&cache->c_entry_count);
 }


@@ -191,31 +167,22 @@ forget:
 * This function is called by the kernel memory management when memory
 * gets low.
 *
+ * @shrink: (ignored)
 * @nr_to_scan: Number of objects to scan
 * @gfp_mask: (ignored)
 *
 * Returns the number of objects which are present in the cache.
 */
 static int
-mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(free_list);
-	struct list_head *l, *ltmp;
+	struct mb_cache *cache;
+	struct mb_cache_entry *entry, *tmp;
 	int count = 0;

-	spin_lock(&mb_cache_spinlock);
-	list_for_each(l, &mb_cache_list) {
-		struct mb_cache *cache =
-			list_entry(l, struct mb_cache, c_cache_list);
-		mb_debug("cache %s (%d)", cache->c_name,
-			 atomic_read(&cache->c_entry_count));
-		count += atomic_read(&cache->c_entry_count);
-	}
 	mb_debug("trying to free %d entries", nr_to_scan);
-	if (nr_to_scan == 0) {
-		spin_unlock(&mb_cache_spinlock);
-		goto out;
-	}
+	spin_lock(&mb_cache_spinlock);
 	while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
 		struct mb_cache_entry *ce =
 			list_entry(mb_cache_lru_list.next,
@@ -223,12 +190,15 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
 		list_move_tail(&ce->e_lru_list, &free_list);
 		__mb_cache_entry_unhash(ce);
 	}
+	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
+		mb_debug("cache %s (%d)", cache->c_name,
+			 atomic_read(&cache->c_entry_count));
+		count += atomic_read(&cache->c_entry_count);
+	}
 	spin_unlock(&mb_cache_spinlock);
-	list_for_each_safe(l, ltmp, &free_list) {
-		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-						   e_lru_list), gfp_mask);
+	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
+		__mb_cache_entry_forget(entry, gfp_mask);
 	}
-out:
 	return (count / 100) * sysctl_vfs_cache_pressure;
 }

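
The shrinker callback gains a struct shrinker * as its first argument in 2.6.36. mbcache ignores it, but the point of the change is that a callback embedded in a per-cache object can now recover its context instead of relying on globals. A hedged sketch (my_cache and its helpers are invented):

	struct my_cache {
		struct shrinker shrinker;
		atomic_t nr_objects;
	};

	static int my_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
	{
		/* Recover the owning cache from the embedded shrinker. */
		struct my_cache *c = container_of(shrink, struct my_cache, shrinker);

		while (nr_to_scan--)
			my_cache_evict_one(c);	/* hypothetical eviction helper */
		return atomic_read(&c->nr_objects);	/* count, scaled by the VM */
	}

Registration is unchanged: fill in .shrink and .seeks and call register_shrinker().
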
@@ -242,72 +212,55 @@ out:
242 * memory was available. 212 * memory was available.
243 * 213 *
244 * @name: name of the cache (informal) 214 * @name: name of the cache (informal)
245 * @cache_op: contains the callback called when freeing a cache entry
246 * @entry_size: The size of a cache entry, including
247 * struct mb_cache_entry
248 * @indexes_count: number of additional indexes in the cache. Must equal
249 * MB_CACHE_INDEXES_COUNT if the number of indexes is
250 * hardwired.
251 * @bucket_bits: log2(number of hash buckets) 215 * @bucket_bits: log2(number of hash buckets)
252 */ 216 */
253struct mb_cache * 217struct mb_cache *
254mb_cache_create(const char *name, struct mb_cache_op *cache_op, 218mb_cache_create(const char *name, int bucket_bits)
255 size_t entry_size, int indexes_count, int bucket_bits)
256{ 219{
257 int m=0, n, bucket_count = 1 << bucket_bits; 220 int n, bucket_count = 1 << bucket_bits;
258 struct mb_cache *cache = NULL; 221 struct mb_cache *cache = NULL;
259 222
260 if(entry_size < sizeof(struct mb_cache_entry) + 223 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
261 indexes_count * sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]))
262 return NULL;
263
264 cache = kmalloc(sizeof(struct mb_cache) +
265 indexes_count * sizeof(struct list_head), GFP_KERNEL);
266 if (!cache) 224 if (!cache)
267 goto fail; 225 return NULL;
268 cache->c_name = name; 226 cache->c_name = name;
269 cache->c_op.free = NULL;
270 if (cache_op)
271 cache->c_op.free = cache_op->free;
272 atomic_set(&cache->c_entry_count, 0); 227 atomic_set(&cache->c_entry_count, 0);
273 cache->c_bucket_bits = bucket_bits; 228 cache->c_bucket_bits = bucket_bits;
274#ifdef MB_CACHE_INDEXES_COUNT
275 mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
276#else
277 cache->c_indexes_count = indexes_count;
278#endif
279 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 229 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
280 GFP_KERNEL); 230 GFP_KERNEL);
281 if (!cache->c_block_hash) 231 if (!cache->c_block_hash)
282 goto fail; 232 goto fail;
283 for (n=0; n<bucket_count; n++) 233 for (n=0; n<bucket_count; n++)
284 INIT_LIST_HEAD(&cache->c_block_hash[n]); 234 INIT_LIST_HEAD(&cache->c_block_hash[n]);
285 for (m=0; m<indexes_count; m++) { 235 cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
286 cache->c_indexes_hash[m] = kmalloc(bucket_count * 236 GFP_KERNEL);
287 sizeof(struct list_head), 237 if (!cache->c_index_hash)
288 GFP_KERNEL); 238 goto fail;
289 if (!cache->c_indexes_hash[m]) 239 for (n=0; n<bucket_count; n++)
290 goto fail; 240 INIT_LIST_HEAD(&cache->c_index_hash[n]);
291 for (n=0; n<bucket_count; n++) 241 cache->c_entry_cache = kmem_cache_create(name,
292 INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]); 242 sizeof(struct mb_cache_entry), 0,
293 }
294 cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
295 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 243 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
296 if (!cache->c_entry_cache) 244 if (!cache->c_entry_cache)
297 goto fail; 245 goto fail2;
246
247 /*
248 * Set an upper limit on the number of cache entries so that the hash
249 * chains won't grow too long.
250 */
251 cache->c_max_entries = bucket_count << 4;
298 252
299 spin_lock(&mb_cache_spinlock); 253 spin_lock(&mb_cache_spinlock);
300 list_add(&cache->c_cache_list, &mb_cache_list); 254 list_add(&cache->c_cache_list, &mb_cache_list);
301 spin_unlock(&mb_cache_spinlock); 255 spin_unlock(&mb_cache_spinlock);
302 return cache; 256 return cache;
303 257
258fail2:
259 kfree(cache->c_index_hash);
260
304fail: 261fail:
305 if (cache) { 262 kfree(cache->c_block_hash);
306 while (--m >= 0) 263 kfree(cache);
307 kfree(cache->c_indexes_hash[m]);
308 kfree(cache->c_block_hash);
309 kfree(cache);
310 }
311 return NULL; 264 return NULL;
312} 265}
313 266
@@ -356,7 +309,6 @@ mb_cache_destroy(struct mb_cache *cache)
356{ 309{
357 LIST_HEAD(free_list); 310 LIST_HEAD(free_list);
358 struct list_head *l, *ltmp; 311 struct list_head *l, *ltmp;
359 int n;
360 312
361 spin_lock(&mb_cache_spinlock); 313 spin_lock(&mb_cache_spinlock);
362 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 314 list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
@@ -383,13 +335,11 @@ mb_cache_destroy(struct mb_cache *cache)
383 335
384 kmem_cache_destroy(cache->c_entry_cache); 336 kmem_cache_destroy(cache->c_entry_cache);
385 337
386 for (n=0; n < mb_cache_indexes(cache); n++) 338 kfree(cache->c_index_hash);
387 kfree(cache->c_indexes_hash[n]);
388 kfree(cache->c_block_hash); 339 kfree(cache->c_block_hash);
389 kfree(cache); 340 kfree(cache);
390} 341}
391 342
392
393/* 343/*
394 * mb_cache_entry_alloc() 344 * mb_cache_entry_alloc()
395 * 345 *
@@ -401,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
401struct mb_cache_entry * 351struct mb_cache_entry *
402mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 352mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
403{ 353{
404 struct mb_cache_entry *ce; 354 struct mb_cache_entry *ce = NULL;
405 355
406 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 356 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
407 if (ce) { 357 spin_lock(&mb_cache_spinlock);
358 if (!list_empty(&mb_cache_lru_list)) {
359 ce = list_entry(mb_cache_lru_list.next,
360 struct mb_cache_entry, e_lru_list);
361 list_del_init(&ce->e_lru_list);
362 __mb_cache_entry_unhash(ce);
363 }
364 spin_unlock(&mb_cache_spinlock);
365 }
366 if (!ce) {
367 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
368 if (!ce)
369 return NULL;
408 atomic_inc(&cache->c_entry_count); 370 atomic_inc(&cache->c_entry_count);
409 INIT_LIST_HEAD(&ce->e_lru_list); 371 INIT_LIST_HEAD(&ce->e_lru_list);
410 INIT_LIST_HEAD(&ce->e_block_list); 372 INIT_LIST_HEAD(&ce->e_block_list);
411 ce->e_cache = cache; 373 ce->e_cache = cache;
412 ce->e_used = 1 + MB_CACHE_WRITER;
413 ce->e_queued = 0; 374 ce->e_queued = 0;
414 } 375 }
376 ce->e_used = 1 + MB_CACHE_WRITER;
415 return ce; 377 return ce;
416} 378}
417 379
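
Together with the single-key mb_cache_entry_insert() below, a caller now allocates and publishes an entry roughly as follows. This is a sketch modelled on ext2's xattr cache user; mb_cache_entry_release() and mb_cache_entry_free() are existing mbcache functions not shown in this hunk:

    struct mb_cache_entry *ce = mb_cache_entry_alloc(cache, GFP_NOFS);
    if (!ce)
        return -ENOMEM;
    error = mb_cache_entry_insert(ce, bdev, block, hash);
    if (error)
        mb_cache_entry_free(ce);     /* e.g. -EBUSY: block already cached */
    else
        mb_cache_entry_release(ce);  /* published; drop the writer ref    */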
@@ -428,17 +390,16 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
428 * 390 *
429 * @bdev: device the cache entry belongs to 391 * @bdev: device the cache entry belongs to
430 * @block: block number 392 * @block: block number
431 * @keys: array of additional keys. There must be indexes_count entries 393 * @key: lookup key
432 * in the array (as specified when creating the cache).
433 */ 394 */
434int 395int
435mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, 396mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
436 sector_t block, unsigned int keys[]) 397 sector_t block, unsigned int key)
437{ 398{
438 struct mb_cache *cache = ce->e_cache; 399 struct mb_cache *cache = ce->e_cache;
439 unsigned int bucket; 400 unsigned int bucket;
440 struct list_head *l; 401 struct list_head *l;
441 int error = -EBUSY, n; 402 int error = -EBUSY;
442 403
443 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 404 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
444 cache->c_bucket_bits); 405 cache->c_bucket_bits);
@@ -453,12 +414,9 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
453 ce->e_bdev = bdev; 414 ce->e_bdev = bdev;
454 ce->e_block = block; 415 ce->e_block = block;
455 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); 416 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
456 for (n=0; n<mb_cache_indexes(cache); n++) { 417 ce->e_index.o_key = key;
457 ce->e_indexes[n].o_key = keys[n]; 418 bucket = hash_long(key, cache->c_bucket_bits);
458 bucket = hash_long(keys[n], cache->c_bucket_bits); 419 list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
459 list_add(&ce->e_indexes[n].o_list,
460 &cache->c_indexes_hash[n][bucket]);
461 }
462 error = 0; 420 error = 0;
463out: 421out:
464 spin_unlock(&mb_cache_spinlock); 422 spin_unlock(&mb_cache_spinlock);
@@ -554,13 +512,12 @@ cleanup:
554 512
555static struct mb_cache_entry * 513static struct mb_cache_entry *
556__mb_cache_entry_find(struct list_head *l, struct list_head *head, 514__mb_cache_entry_find(struct list_head *l, struct list_head *head,
557 int index, struct block_device *bdev, unsigned int key) 515 struct block_device *bdev, unsigned int key)
558{ 516{
559 while (l != head) { 517 while (l != head) {
560 struct mb_cache_entry *ce = 518 struct mb_cache_entry *ce =
561 list_entry(l, struct mb_cache_entry, 519 list_entry(l, struct mb_cache_entry, e_index.o_list);
562 e_indexes[index].o_list); 520 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
563 if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) {
564 DEFINE_WAIT(wait); 521 DEFINE_WAIT(wait);
565 522
566 if (!list_empty(&ce->e_lru_list)) 523 if (!list_empty(&ce->e_lru_list))
@@ -602,23 +559,20 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
602 * returned cache entry is locked for shared access ("multiple readers"). 559 * returned cache entry is locked for shared access ("multiple readers").
603 * 560 *
604 * @cache: the cache to search 561 * @cache: the cache to search
605 * @index: the number of the additional index to search (0<=index<indexes_count)
606 * @bdev: the device the cache entry should belong to 562 * @bdev: the device the cache entry should belong to
607 * @key: the key in the index 563 * @key: the key in the index
608 */ 564 */
609struct mb_cache_entry * 565struct mb_cache_entry *
610mb_cache_entry_find_first(struct mb_cache *cache, int index, 566mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
611 struct block_device *bdev, unsigned int key) 567 unsigned int key)
612{ 568{
613 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 569 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
614 struct list_head *l; 570 struct list_head *l;
615 struct mb_cache_entry *ce; 571 struct mb_cache_entry *ce;
616 572
617 mb_assert(index < mb_cache_indexes(cache));
618 spin_lock(&mb_cache_spinlock); 573 spin_lock(&mb_cache_spinlock);
619 l = cache->c_indexes_hash[index][bucket].next; 574 l = cache->c_index_hash[bucket].next;
620 ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], 575 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
621 index, bdev, key);
622 spin_unlock(&mb_cache_spinlock); 576 spin_unlock(&mb_cache_spinlock);
623 return ce; 577 return ce;
624} 578}
@@ -639,12 +593,11 @@ mb_cache_entry_find_first(struct mb_cache *cache, int index,
639 * } 593 * }
640 * 594 *
641 * @prev: The previous match 595 * @prev: The previous match
642 * @index: the number of the additional index to search (0<=index<indexes_count)
643 * @bdev: the device the cache entry should belong to 596 * @bdev: the device the cache entry should belong to
644 * @key: the key in the index 597 * @key: the key in the index
645 */ 598 */
646struct mb_cache_entry * 599struct mb_cache_entry *
647mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, 600mb_cache_entry_find_next(struct mb_cache_entry *prev,
648 struct block_device *bdev, unsigned int key) 601 struct block_device *bdev, unsigned int key)
649{ 602{
650 struct mb_cache *cache = prev->e_cache; 603 struct mb_cache *cache = prev->e_cache;
@@ -652,11 +605,9 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
652 struct list_head *l; 605 struct list_head *l;
653 struct mb_cache_entry *ce; 606 struct mb_cache_entry *ce;
654 607
655 mb_assert(index < mb_cache_indexes(cache));
656 spin_lock(&mb_cache_spinlock); 608 spin_lock(&mb_cache_spinlock);
657 l = prev->e_indexes[index].o_list.next; 609 l = prev->e_index.o_list.next;
658 ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], 610 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
659 index, bdev, key);
660 __mb_cache_entry_release_unlock(prev); 611 __mb_cache_entry_release_unlock(prev);
661 return ce; 612 return ce;
662} 613}
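
With the index argument gone, the lookup loop alluded to in the mb_cache_entry_find_next() comment reduces to the sketch below; entry_matches() stands in for a hypothetical caller-side predicate:

    struct mb_cache_entry *ce;

    ce = mb_cache_entry_find_first(cache, bdev, hash);
    while (ce && !entry_matches(ce))
        ce = mb_cache_entry_find_next(ce, bdev, hash);  /* releases prev */
    /* on a hit, ce is held locked for shared access;
       call mb_cache_entry_release(ce) when done */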
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..3f32bcb0d9bd 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -200,13 +200,13 @@ void minix_free_inode(struct inode * inode)
200 ino = inode->i_ino; 200 ino = inode->i_ino;
201 if (ino < 1 || ino > sbi->s_ninodes) { 201 if (ino < 1 || ino > sbi->s_ninodes) {
202 printk("minix_free_inode: inode 0 or nonexistent inode\n"); 202 printk("minix_free_inode: inode 0 or nonexistent inode\n");
203 goto out; 203 return;
204 } 204 }
205 bit = ino & ((1<<k) - 1); 205 bit = ino & ((1<<k) - 1);
206 ino >>= k; 206 ino >>= k;
207 if (ino >= sbi->s_imap_blocks) { 207 if (ino >= sbi->s_imap_blocks) {
208 printk("minix_free_inode: nonexistent imap in superblock\n"); 208 printk("minix_free_inode: nonexistent imap in superblock\n");
209 goto out; 209 return;
210 } 210 }
211 211
212 minix_clear_inode(inode); /* clear on-disk copy */ 212 minix_clear_inode(inode); /* clear on-disk copy */
@@ -217,11 +217,9 @@ void minix_free_inode(struct inode * inode)
217 printk("minix_free_inode: bit %lu already cleared\n", bit); 217 printk("minix_free_inode: bit %lu already cleared\n", bit);
218 spin_unlock(&bitmap_lock); 218 spin_unlock(&bitmap_lock);
219 mark_buffer_dirty(bh); 219 mark_buffer_dirty(bh);
220 out:
221 clear_inode(inode); /* clear in-memory copy */
222} 220}
223 221
224struct inode * minix_new_inode(const struct inode * dir, int * error) 222struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
225{ 223{
226 struct super_block *sb = dir->i_sb; 224 struct super_block *sb = dir->i_sb;
227 struct minix_sb_info *sbi = minix_sb(sb); 225 struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +261,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
263 iput(inode); 261 iput(inode);
264 return NULL; 262 return NULL;
265 } 263 }
266 inode->i_uid = current_fsuid(); 264 inode_init_owner(inode, dir, mode);
267 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
268 inode->i_ino = j; 265 inode->i_ino = j;
269 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 266 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
270 inode->i_blocks = 0; 267 inode->i_blocks = 0;
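
inode_init_owner() centralizes the owner setup that minix (and several other filesystems in this merge) used to open-code, including setgid-directory inheritance. Condensed, and not quoted from this patch, its effect is approximately:

    inode->i_uid = current_fsuid();
    if (dir && (dir->i_mode & S_ISGID)) {
        inode->i_gid = dir->i_gid;
        if (S_ISDIR(mode))
            mode |= S_ISGID;          /* subdirectories inherit setgid */
    } else
        inode->i_gid = current_fsgid();
    inode->i_mode = mode;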
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..085a9262c692 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = simple_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -72,16 +72,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
72{ 72{
73 struct address_space *mapping = dir->i_mapping; 73 struct address_space *mapping = dir->i_mapping;
74 struct page *page = read_mapping_page(mapping, n, NULL); 74 struct page *page = read_mapping_page(mapping, n, NULL);
75 if (!IS_ERR(page)) { 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 if (!PageUptodate(page))
78 goto fail;
79 }
80 return page; 77 return page;
81
82fail:
83 dir_put_page(page);
84 return ERR_PTR(-EIO);
85} 78}
86 79
87static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) 80static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
@@ -278,8 +271,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
278 271
279got_it: 272got_it:
280 pos = page_offset(page) + p - (char *)page_address(page); 273 pos = page_offset(page) + p - (char *)page_address(page);
281 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, 274 err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
282 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
283 if (err) 275 if (err)
284 goto out_unlock; 276 goto out_unlock;
285 memcpy (namx, name, namelen); 277 memcpy (namx, name, namelen);
@@ -304,8 +296,7 @@ out_unlock:
304 296
305int minix_delete_entry(struct minix_dir_entry *de, struct page *page) 297int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
306{ 298{
307 struct address_space *mapping = page->mapping; 299 struct inode *inode = page->mapping->host;
308 struct inode *inode = (struct inode*)mapping->host;
309 char *kaddr = page_address(page); 300 char *kaddr = page_address(page);
310 loff_t pos = page_offset(page) + (char*)de - kaddr; 301 loff_t pos = page_offset(page) + (char*)de - kaddr;
311 struct minix_sb_info *sbi = minix_sb(inode->i_sb); 302 struct minix_sb_info *sbi = minix_sb(inode->i_sb);
@@ -313,8 +304,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
313 int err; 304 int err;
314 305
315 lock_page(page); 306 lock_page(page);
316 err = __minix_write_begin(NULL, mapping, pos, len, 307 err = minix_prepare_chunk(page, pos, len);
317 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
318 if (err == 0) { 308 if (err == 0) {
319 if (sbi->s_version == MINIX_V3) 309 if (sbi->s_version == MINIX_V3)
320 ((minix3_dirent *) de)->inode = 0; 310 ((minix3_dirent *) de)->inode = 0;
@@ -332,16 +322,14 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
332 322
333int minix_make_empty(struct inode *inode, struct inode *dir) 323int minix_make_empty(struct inode *inode, struct inode *dir)
334{ 324{
335 struct address_space *mapping = inode->i_mapping; 325 struct page *page = grab_cache_page(inode->i_mapping, 0);
336 struct page *page = grab_cache_page(mapping, 0);
337 struct minix_sb_info *sbi = minix_sb(inode->i_sb); 326 struct minix_sb_info *sbi = minix_sb(inode->i_sb);
338 char *kaddr; 327 char *kaddr;
339 int err; 328 int err;
340 329
341 if (!page) 330 if (!page)
342 return -ENOMEM; 331 return -ENOMEM;
343 err = __minix_write_begin(NULL, mapping, 0, 2 * sbi->s_dirsize, 332 err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize);
344 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
345 if (err) { 333 if (err) {
346 unlock_page(page); 334 unlock_page(page);
347 goto fail; 335 goto fail;
@@ -432,8 +420,7 @@ not_empty:
432void minix_set_link(struct minix_dir_entry *de, struct page *page, 420void minix_set_link(struct minix_dir_entry *de, struct page *page,
433 struct inode *inode) 421 struct inode *inode)
434{ 422{
435 struct address_space *mapping = page->mapping; 423 struct inode *dir = page->mapping->host;
436 struct inode *dir = mapping->host;
437 struct minix_sb_info *sbi = minix_sb(dir->i_sb); 424 struct minix_sb_info *sbi = minix_sb(dir->i_sb);
438 loff_t pos = page_offset(page) + 425 loff_t pos = page_offset(page) +
439 (char *)de-(char*)page_address(page); 426 (char *)de-(char*)page_address(page);
@@ -441,8 +428,7 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
441 428
442 lock_page(page); 429 lock_page(page);
443 430
444 err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize, 431 err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
445 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
446 if (err == 0) { 432 if (err == 0) {
447 if (sbi->s_version == MINIX_V3) 433 if (sbi->s_version == MINIX_V3)
448 ((minix3_dirent *) de)->inode = inode->i_ino; 434 ((minix3_dirent *) de)->inode = inode->i_ino;
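
Every directory update above now funnels through the same page-level sequence; schematically (dir_commit_chunk() is the pre-existing minix helper that writes back, updates i_size, and unlocks the page):

    lock_page(page);
    err = minix_prepare_chunk(page, pos, len);  /* map blocks for the range */
    if (!err) {
        /* edit the entry bytes at page_address(page) */
        dir_commit_chunk(page, pos, len);
    } else
        unlock_page(page);
    dir_put_page(page);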
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..4493ce695ab8 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,11 +19,33 @@ const struct file_operations minix_file_operations = {
19 .write = do_sync_write, 19 .write = do_sync_write,
20 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = simple_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
24}; 24};
25 25
26static int minix_setattr(struct dentry *dentry, struct iattr *attr)
27{
28 struct inode *inode = dentry->d_inode;
29 int error;
30
31 error = inode_change_ok(inode, attr);
32 if (error)
33 return error;
34
35 if ((attr->ia_valid & ATTR_SIZE) &&
36 attr->ia_size != i_size_read(inode)) {
37 error = vmtruncate(inode, attr->ia_size);
38 if (error)
39 return error;
40 }
41
42 setattr_copy(inode, attr);
43 mark_inode_dirty(inode);
44 return 0;
45}
46
26const struct inode_operations minix_file_inode_operations = { 47const struct inode_operations minix_file_inode_operations = {
27 .truncate = minix_truncate, 48 .truncate = minix_truncate,
49 .setattr = minix_setattr,
28 .getattr = minix_getattr, 50 .getattr = minix_getattr,
29}; 51};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 756f8c93780c..e39d6bf2e8fb 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -24,12 +24,17 @@ static int minix_write_inode(struct inode *inode,
24static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); 24static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
25static int minix_remount (struct super_block * sb, int * flags, char * data); 25static int minix_remount (struct super_block * sb, int * flags, char * data);
26 26
27static void minix_delete_inode(struct inode *inode) 27static void minix_evict_inode(struct inode *inode)
28{ 28{
29 truncate_inode_pages(&inode->i_data, 0); 29 truncate_inode_pages(&inode->i_data, 0);
30 inode->i_size = 0; 30 if (!inode->i_nlink) {
31 minix_truncate(inode); 31 inode->i_size = 0;
32 minix_free_inode(inode); 32 minix_truncate(inode);
33 }
34 invalidate_inode_buffers(inode);
35 end_writeback(inode);
36 if (!inode->i_nlink)
37 minix_free_inode(inode);
33} 38}
34 39
35static void minix_put_super(struct super_block *sb) 40static void minix_put_super(struct super_block *sb)
@@ -96,7 +101,7 @@ static const struct super_operations minix_sops = {
96 .alloc_inode = minix_alloc_inode, 101 .alloc_inode = minix_alloc_inode,
97 .destroy_inode = minix_destroy_inode, 102 .destroy_inode = minix_destroy_inode,
98 .write_inode = minix_write_inode, 103 .write_inode = minix_write_inode,
99 .delete_inode = minix_delete_inode, 104 .evict_inode = minix_evict_inode,
100 .put_super = minix_put_super, 105 .put_super = minix_put_super,
101 .statfs = minix_statfs, 106 .statfs = minix_statfs,
102 .remount_fs = minix_remount, 107 .remount_fs = minix_remount,
@@ -357,20 +362,26 @@ static int minix_readpage(struct file *file, struct page *page)
357 return block_read_full_page(page,minix_get_block); 362 return block_read_full_page(page,minix_get_block);
358} 363}
359 364
360int __minix_write_begin(struct file *file, struct address_space *mapping, 365int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
361 loff_t pos, unsigned len, unsigned flags,
362 struct page **pagep, void **fsdata)
363{ 366{
364 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 367 return __block_write_begin(page, pos, len, minix_get_block);
365 minix_get_block);
366} 368}
367 369
368static int minix_write_begin(struct file *file, struct address_space *mapping, 370static int minix_write_begin(struct file *file, struct address_space *mapping,
369 loff_t pos, unsigned len, unsigned flags, 371 loff_t pos, unsigned len, unsigned flags,
370 struct page **pagep, void **fsdata) 372 struct page **pagep, void **fsdata)
371{ 373{
372 *pagep = NULL; 374 int ret;
373 return __minix_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 375
376 ret = block_write_begin(mapping, pos, len, flags, pagep,
377 minix_get_block);
378 if (unlikely(ret)) {
379 loff_t isize = mapping->host->i_size;
380 if (pos + len > isize)
381 vmtruncate(mapping->host, isize);
382 }
383
384 return ret;
374} 385}
375 386
376static sector_t minix_bmap(struct address_space *mapping, sector_t block) 387static sector_t minix_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
20 return (block_t *)minix_i(inode)->u.i2_data; 20 return (block_t *)minix_i(inode)->u.i2_data;
21} 21}
22 22
23#define DIRCOUNT 7
24#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
25
23static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) 26static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
24{ 27{
25 int n = 0; 28 int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
34 printk("MINIX-fs: block_to_path: " 37 printk("MINIX-fs: block_to_path: "
35 "block %ld too big on dev %s\n", 38 "block %ld too big on dev %s\n",
36 block, bdevname(sb->s_bdev, b)); 39 block, bdevname(sb->s_bdev, b));
37 } else if (block < 7) { 40 } else if (block < DIRCOUNT) {
38 offsets[n++] = block; 41 offsets[n++] = block;
39 } else if ((block -= 7) < 256) { 42 } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
40 offsets[n++] = 7; 43 offsets[n++] = DIRCOUNT;
41 offsets[n++] = block; 44 offsets[n++] = block;
42 } else if ((block -= 256) < 256*256) { 45 } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
43 offsets[n++] = 8; 46 offsets[n++] = DIRCOUNT + 1;
44 offsets[n++] = block>>8; 47 offsets[n++] = block / INDIRCOUNT(sb);
45 offsets[n++] = block & 255; 48 offsets[n++] = block % INDIRCOUNT(sb);
46 } else { 49 } else {
47 block -= 256*256; 50 block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
48 offsets[n++] = 9; 51 offsets[n++] = DIRCOUNT + 2;
49 offsets[n++] = block>>16; 52 offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
50 offsets[n++] = (block>>8) & 255; 53 offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
51 offsets[n++] = block & 255; 54 offsets[n++] = block % INDIRCOUNT(sb);
52 } 55 }
53 return n; 56 return n;
54} 57}
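
With 1K blocks, INDIRCOUNT(sb) is 1 << (10 - 2) = 256 and the new code reproduces the old constants exactly; a worked example for file block 300:

    /* 300 >= DIRCOUNT:         skip 7 direct slots,    300 - 7   = 293    */
    /* 293 >= INDIRCOUNT (256): skip single-indirect,   293 - 256 = 37     */
    /* double-indirect: offsets = { 8, 37 / 256, 37 % 256 } = { 8, 0, 37 } */

At larger block sizes INDIRCOUNT(sb) scales (1024 entries per indirect block at 4K), which the old hard-coded 256s, shifts, and masks silently got wrong.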
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..407b1c84911e 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,16 +46,14 @@ struct minix_sb_info {
46extern struct inode *minix_iget(struct super_block *, unsigned long); 46extern struct inode *minix_iget(struct super_block *, unsigned long);
47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); 47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode * dir, int * error); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
53extern void minix_free_block(struct inode *inode, unsigned long block); 53extern void minix_free_block(struct inode *inode, unsigned long block);
54extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); 54extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi);
55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); 55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
56extern int __minix_write_begin(struct file *file, struct address_space *mapping, 56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
57 loff_t pos, unsigned len, unsigned flags,
58 struct page **pagep, void **fsdata);
59 57
60extern void V1_minix_truncate(struct inode *); 58extern void V1_minix_truncate(struct inode *);
61extern void V2_minix_truncate(struct inode *); 59extern void V2_minix_truncate(struct inode *);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..f3f3578393a4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
46 if (!old_valid_dev(rdev)) 46 if (!old_valid_dev(rdev))
47 return -EINVAL; 47 return -EINVAL;
48 48
49 inode = minix_new_inode(dir, &error); 49 inode = minix_new_inode(dir, mode, &error);
50 50
51 if (inode) { 51 if (inode) {
52 inode->i_mode = mode;
53 minix_set_inode(inode, rdev); 52 minix_set_inode(inode, rdev);
54 mark_inode_dirty(inode); 53 mark_inode_dirty(inode);
55 error = add_nondir(dentry, inode); 54 error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
73 if (i > dir->i_sb->s_blocksize) 72 if (i > dir->i_sb->s_blocksize)
74 goto out; 73 goto out;
75 74
76 inode = minix_new_inode(dir, &err); 75 inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
77 if (!inode) 76 if (!inode)
78 goto out; 77 goto out;
79 78
80 inode->i_mode = S_IFLNK | 0777;
81 minix_set_inode(inode, 0); 79 minix_set_inode(inode, 0);
82 err = page_symlink(inode, symname, i); 80 err = page_symlink(inode, symname, i);
83 if (err) 81 if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
117 115
118 inode_inc_link_count(dir); 116 inode_inc_link_count(dir);
119 117
120 inode = minix_new_inode(dir, &err); 118 inode = minix_new_inode(dir, S_IFDIR | mode, &err);
121 if (!inode) 119 if (!inode)
122 goto out_dir; 120 goto out_dir;
123 121
124 inode->i_mode = S_IFDIR | mode;
125 if (dir->i_mode & S_ISGID)
126 inode->i_mode |= S_ISGID;
127 minix_set_inode(inode, 0); 122 minix_set_inode(inode, 0);
128 123
129 inode_inc_link_count(inode); 124 inode_inc_link_count(inode);
diff --git a/fs/namei.c b/fs/namei.c
index b86b96fe1dc3..24896e833565 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask)
282 if (retval) 282 if (retval)
283 return retval; 283 return retval;
284 284
285 return security_inode_permission(inode, 285 return security_inode_permission(inode, mask);
286 mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
287} 286}
288 287
289/** 288/**
@@ -484,13 +483,8 @@ ok:
484 483
485static __always_inline void set_root(struct nameidata *nd) 484static __always_inline void set_root(struct nameidata *nd)
486{ 485{
487 if (!nd->root.mnt) { 486 if (!nd->root.mnt)
488 struct fs_struct *fs = current->fs; 487 get_fs_root(current->fs, &nd->root);
489 read_lock(&fs->lock);
490 nd->root = fs->root;
491 path_get(&nd->root);
492 read_unlock(&fs->lock);
493 }
494} 488}
495 489
496static int link_path_walk(const char *, struct nameidata *); 490static int link_path_walk(const char *, struct nameidata *);
@@ -523,9 +517,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
523static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 517static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
524{ 518{
525 dput(nd->path.dentry); 519 dput(nd->path.dentry);
526 if (nd->path.mnt != path->mnt) 520 if (nd->path.mnt != path->mnt) {
527 mntput(nd->path.mnt); 521 mntput(nd->path.mnt);
528 nd->path.mnt = path->mnt; 522 nd->path.mnt = path->mnt;
523 }
529 nd->path.dentry = path->dentry; 524 nd->path.dentry = path->dentry;
530} 525}
531 526
@@ -600,15 +595,16 @@ int follow_up(struct path *path)
600{ 595{
601 struct vfsmount *parent; 596 struct vfsmount *parent;
602 struct dentry *mountpoint; 597 struct dentry *mountpoint;
603 spin_lock(&vfsmount_lock); 598
599 br_read_lock(vfsmount_lock);
604 parent = path->mnt->mnt_parent; 600 parent = path->mnt->mnt_parent;
605 if (parent == path->mnt) { 601 if (parent == path->mnt) {
606 spin_unlock(&vfsmount_lock); 602 br_read_unlock(vfsmount_lock);
607 return 0; 603 return 0;
608 } 604 }
609 mntget(parent); 605 mntget(parent);
610 mountpoint = dget(path->mnt->mnt_mountpoint); 606 mountpoint = dget(path->mnt->mnt_mountpoint);
611 spin_unlock(&vfsmount_lock); 607 br_read_unlock(vfsmount_lock);
612 dput(path->dentry); 608 dput(path->dentry);
613 path->dentry = mountpoint; 609 path->dentry = mountpoint;
614 mntput(path->mnt); 610 mntput(path->mnt);
@@ -691,6 +687,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
691} 687}
692 688
693/* 689/*
690 * Allocate a dentry with name and parent, and perform a parent
691 * directory ->lookup on it. Returns the new dentry, or ERR_PTR
692 * on error. parent->d_inode->i_mutex must be held. d_lookup must
693 * have verified that no child exists while under i_mutex.
694 */
695static struct dentry *d_alloc_and_lookup(struct dentry *parent,
696 struct qstr *name, struct nameidata *nd)
697{
698 struct inode *inode = parent->d_inode;
699 struct dentry *dentry;
700 struct dentry *old;
701
702 /* Don't create child dentry for a dead directory. */
703 if (unlikely(IS_DEADDIR(inode)))
704 return ERR_PTR(-ENOENT);
705
706 dentry = d_alloc(parent, name);
707 if (unlikely(!dentry))
708 return ERR_PTR(-ENOMEM);
709
710 old = inode->i_op->lookup(inode, dentry, nd);
711 if (unlikely(old)) {
712 dput(dentry);
713 dentry = old;
714 }
715 return dentry;
716}
717
718/*
694 * It's more convoluted than I'd like it to be, but... it's still fairly 719 * It's more convoluted than I'd like it to be, but... it's still fairly
695 * small and for now I'd prefer to have fast path as straight as possible. 720 * small and for now I'd prefer to have fast path as straight as possible.
696 * It _is_ time-critical. 721 * It _is_ time-critical.
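
Callers observe the contract spelled out in the comment; condensed from the do_lookup() rework below:

    mutex_lock(&dir->i_mutex);
    dentry = d_lookup(parent, name);  /* non-racy recheck under i_mutex */
    if (likely(!dentry))
        dentry = d_alloc_and_lookup(parent, name, nd);
    mutex_unlock(&dir->i_mutex);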
@@ -711,9 +736,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
711 return err; 736 return err;
712 } 737 }
713 738
739 /*
740 * Rename seqlock is not required here because in the off chance
741 * of a false negative due to a concurrent rename, we're going to
742 * do the non-racy lookup, below.
743 */
714 dentry = __d_lookup(nd->path.dentry, name); 744 dentry = __d_lookup(nd->path.dentry, name);
715 if (!dentry) 745 if (!dentry)
716 goto need_lookup; 746 goto need_lookup;
747found:
717 if (dentry->d_op && dentry->d_op->d_revalidate) 748 if (dentry->d_op && dentry->d_op->d_revalidate)
718 goto need_revalidate; 749 goto need_revalidate;
719done: 750done:
@@ -729,56 +760,28 @@ need_lookup:
729 mutex_lock(&dir->i_mutex); 760 mutex_lock(&dir->i_mutex);
730 /* 761 /*
731 * First re-do the cached lookup just in case it was created 762 * First re-do the cached lookup just in case it was created
732 * while we waited for the directory semaphore.. 763 * while we waited for the directory semaphore, or the first
733 * 764 * lookup failed due to an unrelated rename.
734 * FIXME! This could use version numbering or similar to
735 * avoid unnecessary cache lookups.
736 *
737 * The "dcache_lock" is purely to protect the RCU list walker
738 * from concurrent renames at this point (we mustn't get false
739 * negatives from the RCU list walk here, unlike the optimistic
740 * fast walk).
741 * 765 *
742 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup 766 * This could use version numbering or similar to avoid unnecessary
767 * cache lookups, but then we'd have to do the first lookup in the
768 * non-racy way. However in the common case here, everything should
769 * be hot in cache, so would it be a big win?
743 */ 770 */
744 dentry = d_lookup(parent, name); 771 dentry = d_lookup(parent, name);
745 if (!dentry) { 772 if (likely(!dentry)) {
746 struct dentry *new; 773 dentry = d_alloc_and_lookup(parent, name, nd);
747
748 /* Don't create child dentry for a dead directory. */
749 dentry = ERR_PTR(-ENOENT);
750 if (IS_DEADDIR(dir))
751 goto out_unlock;
752
753 new = d_alloc(parent, name);
754 dentry = ERR_PTR(-ENOMEM);
755 if (new) {
756 dentry = dir->i_op->lookup(dir, new, nd);
757 if (dentry)
758 dput(new);
759 else
760 dentry = new;
761 }
762out_unlock:
763 mutex_unlock(&dir->i_mutex); 774 mutex_unlock(&dir->i_mutex);
764 if (IS_ERR(dentry)) 775 if (IS_ERR(dentry))
765 goto fail; 776 goto fail;
766 goto done; 777 goto done;
767 } 778 }
768
769 /* 779 /*
770 * Uhhuh! Nasty case: the cache was re-populated while 780 * Uhhuh! Nasty case: the cache was re-populated while
771 * we waited on the semaphore. Need to revalidate. 781 * we waited on the semaphore. Need to revalidate.
772 */ 782 */
773 mutex_unlock(&dir->i_mutex); 783 mutex_unlock(&dir->i_mutex);
774 if (dentry->d_op && dentry->d_op->d_revalidate) { 784 goto found;
775 dentry = do_revalidate(dentry, nd);
776 if (!dentry)
777 dentry = ERR_PTR(-ENOENT);
778 }
779 if (IS_ERR(dentry))
780 goto fail;
781 goto done;
782 785
783need_revalidate: 786need_revalidate:
784 dentry = do_revalidate(dentry, nd); 787 dentry = do_revalidate(dentry, nd);
@@ -1015,11 +1018,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1015 nd->path = nd->root; 1018 nd->path = nd->root;
1016 path_get(&nd->root); 1019 path_get(&nd->root);
1017 } else if (dfd == AT_FDCWD) { 1020 } else if (dfd == AT_FDCWD) {
1018 struct fs_struct *fs = current->fs; 1021 get_fs_pwd(current->fs, &nd->path);
1019 read_lock(&fs->lock);
1020 nd->path = fs->pwd;
1021 path_get(&fs->pwd);
1022 read_unlock(&fs->lock);
1023 } else { 1022 } else {
1024 struct dentry *dentry; 1023 struct dentry *dentry;
1025 1024
@@ -1139,35 +1138,18 @@ static struct dentry *__lookup_hash(struct qstr *name,
1139 goto out; 1138 goto out;
1140 } 1139 }
1141 1140
1142 dentry = __d_lookup(base, name); 1141 /*
1143 1142 * Don't bother with __d_lookup: callers are for creat as
1144 /* lockess __d_lookup may fail due to concurrent d_move() 1143 * well as unlink, so a lot of the time it would cost
1145 * in some unrelated directory, so try with d_lookup 1144 * a double lookup.
1146 */ 1145 */
1147 if (!dentry) 1146 dentry = d_lookup(base, name);
1148 dentry = d_lookup(base, name);
1149 1147
1150 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 1148 if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1151 dentry = do_revalidate(dentry, nd); 1149 dentry = do_revalidate(dentry, nd);
1152 1150
1153 if (!dentry) { 1151 if (!dentry)
1154 struct dentry *new; 1152 dentry = d_alloc_and_lookup(base, name, nd);
1155
1156 /* Don't create child dentry for a dead directory. */
1157 dentry = ERR_PTR(-ENOENT);
1158 if (IS_DEADDIR(inode))
1159 goto out;
1160
1161 new = d_alloc(base, name);
1162 dentry = ERR_PTR(-ENOMEM);
1163 if (!new)
1164 goto out;
1165 dentry = inode->i_op->lookup(inode, new, nd);
1166 if (!dentry)
1167 dentry = new;
1168 else
1169 dput(new);
1170 }
1171out: 1153out:
1172 return dentry; 1154 return dentry;
1173} 1155}
@@ -1483,8 +1465,7 @@ static int handle_truncate(struct path *path)
1483 */ 1465 */
1484 error = locks_verify_locked(inode); 1466 error = locks_verify_locked(inode);
1485 if (!error) 1467 if (!error)
1486 error = security_path_truncate(path, 0, 1468 error = security_path_truncate(path);
1487 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1488 if (!error) { 1469 if (!error) {
1489 error = do_truncate(path->dentry, 0, 1470 error = do_truncate(path->dentry, 0,
1490 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 1471 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -1620,6 +1601,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1620 case LAST_DOTDOT: 1601 case LAST_DOTDOT:
1621 follow_dotdot(nd); 1602 follow_dotdot(nd);
1622 dir = nd->path.dentry; 1603 dir = nd->path.dentry;
1604 case LAST_DOT:
1623 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 1605 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1624 if (!dir->d_op->d_revalidate(dir, nd)) { 1606 if (!dir->d_op->d_revalidate(dir, nd)) {
1625 error = -ESTALE; 1607 error = -ESTALE;
@@ -1627,7 +1609,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1627 } 1609 }
1628 } 1610 }
1629 /* fallthrough */ 1611 /* fallthrough */
1630 case LAST_DOT:
1631 case LAST_ROOT: 1612 case LAST_ROOT:
1632 if (open_flag & O_CREAT) 1613 if (open_flag & O_CREAT)
1633 goto exit; 1614 goto exit;
@@ -2634,7 +2615,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2634{ 2615{
2635 int error; 2616 int error;
2636 int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); 2617 int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2637 const char *old_name; 2618 const unsigned char *old_name;
2638 2619
2639 if (old_dentry->d_inode == new_dentry->d_inode) 2620 if (old_dentry->d_inode == new_dentry->d_inode)
2640 return 0; 2621 return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index f20cb57d1067..a72eaabfe8f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
11#include <linux/syscalls.h> 11#include <linux/syscalls.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h>
15#include <linux/percpu.h>
14#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
15#include <linux/init.h> 17#include <linux/init.h>
16#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -29,6 +31,7 @@
29#include <linux/log2.h> 31#include <linux/log2.h>
30#include <linux/idr.h> 32#include <linux/idr.h>
31#include <linux/fs_struct.h> 33#include <linux/fs_struct.h>
34#include <linux/fsnotify.h>
32#include <asm/uaccess.h> 35#include <asm/uaccess.h>
33#include <asm/unistd.h> 36#include <asm/unistd.h>
34#include "pnode.h" 37#include "pnode.h"
@@ -37,12 +40,10 @@
37#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) 40#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
38#define HASH_SIZE (1UL << HASH_SHIFT) 41#define HASH_SIZE (1UL << HASH_SHIFT)
39 42
40/* spinlock for vfsmount related operations, inplace of dcache_lock */
41__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
42
43static int event; 43static int event;
44static DEFINE_IDA(mnt_id_ida); 44static DEFINE_IDA(mnt_id_ida);
45static DEFINE_IDA(mnt_group_ida); 45static DEFINE_IDA(mnt_group_ida);
46static DEFINE_SPINLOCK(mnt_id_lock);
46static int mnt_id_start = 0; 47static int mnt_id_start = 0;
47static int mnt_group_start = 1; 48static int mnt_group_start = 1;
48 49
@@ -54,6 +55,16 @@ static struct rw_semaphore namespace_sem;
54struct kobject *fs_kobj; 55struct kobject *fs_kobj;
55EXPORT_SYMBOL_GPL(fs_kobj); 56EXPORT_SYMBOL_GPL(fs_kobj);
56 57
58/*
59 * vfsmount lock may be taken for read to prevent changes to the
60 * vfsmount hash, ie. during mountpoint lookups or walking back
61 * up the tree.
62 *
63 * It should be taken for write in all cases where the vfsmount
64 * tree or hash is modified or when a vfsmount structure is modified.
65 */
66DEFINE_BRLOCK(vfsmount_lock);
67
57static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 68static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
58{ 69{
59 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 70 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
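
DEFINE_BRLOCK() builds a big-reader lock out of per-CPU locks: br_read_lock() touches only the local CPU's lock, while br_write_lock() acquires every CPU's lock, so lookups stay cheap and mount-tree modification pays the cost. The pattern used throughout the rest of this file:

    br_read_lock(vfsmount_lock);    /* per-CPU, no cross-CPU traffic */
    /* ... walk the mount hash or tree ... */
    br_read_unlock(vfsmount_lock);

    br_write_lock(vfsmount_lock);   /* sweeps all CPUs' locks        */
    /* ... modify the tree or a vfsmount ... */
    br_write_unlock(vfsmount_lock);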
@@ -64,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
64 75
65#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) 76#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
66 77
67/* allocation is serialized by namespace_sem */ 78/*
79 * allocation is serialized by namespace_sem, but we need the spinlock to
80 * serialize with freeing.
81 */
68static int mnt_alloc_id(struct vfsmount *mnt) 82static int mnt_alloc_id(struct vfsmount *mnt)
69{ 83{
70 int res; 84 int res;
71 85
72retry: 86retry:
73 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 87 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
74 spin_lock(&vfsmount_lock); 88 spin_lock(&mnt_id_lock);
75 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); 89 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
76 if (!res) 90 if (!res)
77 mnt_id_start = mnt->mnt_id + 1; 91 mnt_id_start = mnt->mnt_id + 1;
78 spin_unlock(&vfsmount_lock); 92 spin_unlock(&mnt_id_lock);
79 if (res == -EAGAIN) 93 if (res == -EAGAIN)
80 goto retry; 94 goto retry;
81 95
@@ -85,11 +99,11 @@ retry:
85static void mnt_free_id(struct vfsmount *mnt) 99static void mnt_free_id(struct vfsmount *mnt)
86{ 100{
87 int id = mnt->mnt_id; 101 int id = mnt->mnt_id;
88 spin_lock(&vfsmount_lock); 102 spin_lock(&mnt_id_lock);
89 ida_remove(&mnt_id_ida, id); 103 ida_remove(&mnt_id_ida, id);
90 if (mnt_id_start > id) 104 if (mnt_id_start > id)
91 mnt_id_start = id; 105 mnt_id_start = id;
92 spin_unlock(&vfsmount_lock); 106 spin_unlock(&mnt_id_lock);
93} 107}
94 108
95/* 109/*
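
mnt_id_lock takes over here because ida needs a plain spinlock around its updates and the new brlock's write side is too heavy for an ID allocator; only freeing can race allocation, hence the dedicated lock. The preload-and-retry idiom, which later kernels wrap as ida_simple_get(), has this shape:

    retry:
        ida_pre_get(&mnt_id_ida, GFP_KERNEL);  /* preload; may sleep */
        spin_lock(&mnt_id_lock);
        res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &id);
        spin_unlock(&mnt_id_lock);
        if (res == -EAGAIN)
            goto retry;   /* another CPU consumed the preload; redo */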
@@ -150,6 +164,9 @@ struct vfsmount *alloc_vfsmnt(const char *name)
150 INIT_LIST_HEAD(&mnt->mnt_share); 164 INIT_LIST_HEAD(&mnt->mnt_share);
151 INIT_LIST_HEAD(&mnt->mnt_slave_list); 165 INIT_LIST_HEAD(&mnt->mnt_slave_list);
152 INIT_LIST_HEAD(&mnt->mnt_slave); 166 INIT_LIST_HEAD(&mnt->mnt_slave);
167#ifdef CONFIG_FSNOTIFY
168 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
169#endif
153#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
154 mnt->mnt_writers = alloc_percpu(int); 171 mnt->mnt_writers = alloc_percpu(int);
155 if (!mnt->mnt_writers) 172 if (!mnt->mnt_writers)
@@ -344,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
344{ 361{
345 int ret = 0; 362 int ret = 0;
346 363
347 spin_lock(&vfsmount_lock); 364 br_write_lock(vfsmount_lock);
348 mnt->mnt_flags |= MNT_WRITE_HOLD; 365 mnt->mnt_flags |= MNT_WRITE_HOLD;
349 /* 366 /*
350 * After storing MNT_WRITE_HOLD, we'll read the counters. This store 367 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -378,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
378 */ 395 */
379 smp_wmb(); 396 smp_wmb();
380 mnt->mnt_flags &= ~MNT_WRITE_HOLD; 397 mnt->mnt_flags &= ~MNT_WRITE_HOLD;
381 spin_unlock(&vfsmount_lock); 398 br_write_unlock(vfsmount_lock);
382 return ret; 399 return ret;
383} 400}
384 401
385static void __mnt_unmake_readonly(struct vfsmount *mnt) 402static void __mnt_unmake_readonly(struct vfsmount *mnt)
386{ 403{
387 spin_lock(&vfsmount_lock); 404 br_write_lock(vfsmount_lock);
388 mnt->mnt_flags &= ~MNT_READONLY; 405 mnt->mnt_flags &= ~MNT_READONLY;
389 spin_unlock(&vfsmount_lock); 406 br_write_unlock(vfsmount_lock);
390} 407}
391 408
392void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 409void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -410,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
410/* 427/*
411 * find the first or last mount at @dentry on vfsmount @mnt depending on 428 * find the first or last mount at @dentry on vfsmount @mnt depending on
412 * @dir. If @dir is set return the first mount else return the last mount. 429 * @dir. If @dir is set return the first mount else return the last mount.
430 * vfsmount_lock must be held for read or write.
413 */ 431 */
414struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, 432struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
415 int dir) 433 int dir)
@@ -439,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
439struct vfsmount *lookup_mnt(struct path *path) 457struct vfsmount *lookup_mnt(struct path *path)
440{ 458{
441 struct vfsmount *child_mnt; 459 struct vfsmount *child_mnt;
442 spin_lock(&vfsmount_lock); 460
461 br_read_lock(vfsmount_lock);
443 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) 462 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
444 mntget(child_mnt); 463 mntget(child_mnt);
445 spin_unlock(&vfsmount_lock); 464 br_read_unlock(vfsmount_lock);
446 return child_mnt; 465 return child_mnt;
447} 466}
448 467
@@ -451,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
451 return mnt->mnt_ns == current->nsproxy->mnt_ns; 470 return mnt->mnt_ns == current->nsproxy->mnt_ns;
452} 471}
453 472
473/*
474 * vfsmount lock must be held for write
475 */
454static void touch_mnt_namespace(struct mnt_namespace *ns) 476static void touch_mnt_namespace(struct mnt_namespace *ns)
455{ 477{
456 if (ns) { 478 if (ns) {
@@ -459,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
459 } 481 }
460} 482}
461 483
484/*
485 * vfsmount lock must be held for write
486 */
462static void __touch_mnt_namespace(struct mnt_namespace *ns) 487static void __touch_mnt_namespace(struct mnt_namespace *ns)
463{ 488{
464 if (ns && ns->event != event) { 489 if (ns && ns->event != event) {
@@ -467,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
467 } 492 }
468} 493}
469 494
495/*
496 * vfsmount lock must be held for write
497 */
470static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 498static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
471{ 499{
472 old_path->dentry = mnt->mnt_mountpoint; 500 old_path->dentry = mnt->mnt_mountpoint;
@@ -478,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
478 old_path->dentry->d_mounted--; 506 old_path->dentry->d_mounted--;
479} 507}
480 508
509/*
510 * vfsmount lock must be held for write
511 */
481void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, 512void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
482 struct vfsmount *child_mnt) 513 struct vfsmount *child_mnt)
483{ 514{
@@ -486,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
486 dentry->d_mounted++; 517 dentry->d_mounted++;
487} 518}
488 519
520/*
521 * vfsmount lock must be held for write
522 */
489static void attach_mnt(struct vfsmount *mnt, struct path *path) 523static void attach_mnt(struct vfsmount *mnt, struct path *path)
490{ 524{
491 mnt_set_mountpoint(path->mnt, path->dentry, mnt); 525 mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -495,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
495} 529}
496 530
497/* 531/*
498 * the caller must hold vfsmount_lock 532 * vfsmount lock must be held for write
499 */ 533 */
500static void commit_tree(struct vfsmount *mnt) 534static void commit_tree(struct vfsmount *mnt)
501{ 535{
@@ -610,6 +644,7 @@ static inline void __mntput(struct vfsmount *mnt)
610 * provides barriers, so count_mnt_writers() below is safe. AV 644 * provides barriers, so count_mnt_writers() below is safe. AV
611 */ 645 */
612 WARN_ON(count_mnt_writers(mnt)); 646 WARN_ON(count_mnt_writers(mnt));
647 fsnotify_vfsmount_delete(mnt);
613 dput(mnt->mnt_root); 648 dput(mnt->mnt_root);
614 free_vfsmnt(mnt); 649 free_vfsmnt(mnt);
615 deactivate_super(sb); 650 deactivate_super(sb);
@@ -618,40 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
618void mntput_no_expire(struct vfsmount *mnt) 653void mntput_no_expire(struct vfsmount *mnt)
619{ 654{
620repeat: 655repeat:
621 if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { 656 if (atomic_add_unless(&mnt->mnt_count, -1, 1))
622 if (likely(!mnt->mnt_pinned)) { 657 return;
623 spin_unlock(&vfsmount_lock); 658 br_write_lock(vfsmount_lock);
624 __mntput(mnt); 659 if (!atomic_dec_and_test(&mnt->mnt_count)) {
625 return; 660 br_write_unlock(vfsmount_lock);
626 } 661 return;
627 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); 662 }
628 mnt->mnt_pinned = 0; 663 if (likely(!mnt->mnt_pinned)) {
629 spin_unlock(&vfsmount_lock); 664 br_write_unlock(vfsmount_lock);
630 acct_auto_close_mnt(mnt); 665 __mntput(mnt);
631 security_sb_umount_close(mnt); 666 return;
632 goto repeat;
633 } 667 }
668 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
669 mnt->mnt_pinned = 0;
670 br_write_unlock(vfsmount_lock);
671 acct_auto_close_mnt(mnt);
672 goto repeat;
634} 673}
635
636EXPORT_SYMBOL(mntput_no_expire); 674EXPORT_SYMBOL(mntput_no_expire);
637 675
638void mnt_pin(struct vfsmount *mnt) 676void mnt_pin(struct vfsmount *mnt)
639{ 677{
640 spin_lock(&vfsmount_lock); 678 br_write_lock(vfsmount_lock);
641 mnt->mnt_pinned++; 679 mnt->mnt_pinned++;
642 spin_unlock(&vfsmount_lock); 680 br_write_unlock(vfsmount_lock);
643} 681}
644 682
645EXPORT_SYMBOL(mnt_pin); 683EXPORT_SYMBOL(mnt_pin);
646 684
647void mnt_unpin(struct vfsmount *mnt) 685void mnt_unpin(struct vfsmount *mnt)
648{ 686{
649 spin_lock(&vfsmount_lock); 687 br_write_lock(vfsmount_lock);
650 if (mnt->mnt_pinned) { 688 if (mnt->mnt_pinned) {
651 atomic_inc(&mnt->mnt_count); 689 atomic_inc(&mnt->mnt_count);
652 mnt->mnt_pinned--; 690 mnt->mnt_pinned--;
653 } 691 }
654 spin_unlock(&vfsmount_lock); 692 br_write_unlock(vfsmount_lock);
655} 693}
656 694
657EXPORT_SYMBOL(mnt_unpin); 695EXPORT_SYMBOL(mnt_unpin);
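
The mntput_no_expire() rewrite above keeps the common case lock-free: atomic_add_unless() declines to drop the final reference, so only a potential last put takes the brlock write side, and the count is re-checked under the lock:

    if (atomic_add_unless(&mnt->mnt_count, -1, 1))
        return;                          /* count was > 1: no lock taken */
    br_write_lock(vfsmount_lock);        /* possibly the last reference  */
    if (!atomic_dec_and_test(&mnt->mnt_count)) {
        br_write_unlock(vfsmount_lock);  /* raced; someone else holds it */
        return;
    }
    /* ... pinned handling and __mntput() tear-down as above ... */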
@@ -742,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
742 struct mnt_namespace *ns = p->ns; 780 struct mnt_namespace *ns = p->ns;
743 int res = 0; 781 int res = 0;
744 782
745 spin_lock(&vfsmount_lock); 783 br_read_lock(vfsmount_lock);
746 if (p->event != ns->event) { 784 if (p->event != ns->event) {
747 p->event = ns->event; 785 p->event = ns->event;
748 res = 1; 786 res = 1;
749 } 787 }
750 spin_unlock(&vfsmount_lock); 788 br_read_unlock(vfsmount_lock);
751 789
752 return res; 790 return res;
753} 791}
@@ -784,7 +822,6 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
784 { MNT_NOATIME, ",noatime" }, 822 { MNT_NOATIME, ",noatime" },
785 { MNT_NODIRATIME, ",nodiratime" }, 823 { MNT_NODIRATIME, ",nodiratime" },
786 { MNT_RELATIME, ",relatime" }, 824 { MNT_RELATIME, ",relatime" },
787 { MNT_STRICTATIME, ",strictatime" },
788 { 0, NULL } 825 { 0, NULL }
789 }; 826 };
790 const struct proc_fs_info *fs_infop; 827 const struct proc_fs_info *fs_infop;
@@ -949,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
949 int minimum_refs = 0; 986 int minimum_refs = 0;
950 struct vfsmount *p; 987 struct vfsmount *p;
951 988
952 spin_lock(&vfsmount_lock); 989 br_read_lock(vfsmount_lock);
953 for (p = mnt; p; p = next_mnt(p, mnt)) { 990 for (p = mnt; p; p = next_mnt(p, mnt)) {
954 actual_refs += atomic_read(&p->mnt_count); 991 actual_refs += atomic_read(&p->mnt_count);
955 minimum_refs += 2; 992 minimum_refs += 2;
956 } 993 }
957 spin_unlock(&vfsmount_lock); 994 br_read_unlock(vfsmount_lock);
958 995
959 if (actual_refs > minimum_refs) 996 if (actual_refs > minimum_refs)
960 return 0; 997 return 0;
@@ -981,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
981{ 1018{
982 int ret = 1; 1019 int ret = 1;
983 down_read(&namespace_sem); 1020 down_read(&namespace_sem);
984 spin_lock(&vfsmount_lock); 1021 br_read_lock(vfsmount_lock);
985 if (propagate_mount_busy(mnt, 2)) 1022 if (propagate_mount_busy(mnt, 2))
986 ret = 0; 1023 ret = 0;
987 spin_unlock(&vfsmount_lock); 1024 br_read_unlock(vfsmount_lock);
988 up_read(&namespace_sem); 1025 up_read(&namespace_sem);
989 return ret; 1026 return ret;
990} 1027}
@@ -1000,13 +1037,14 @@ void release_mounts(struct list_head *head)
1000 if (mnt->mnt_parent != mnt) { 1037 if (mnt->mnt_parent != mnt) {
1001 struct dentry *dentry; 1038 struct dentry *dentry;
1002 struct vfsmount *m; 1039 struct vfsmount *m;
1003 spin_lock(&vfsmount_lock); 1040
1041 br_write_lock(vfsmount_lock);
1004 dentry = mnt->mnt_mountpoint; 1042 dentry = mnt->mnt_mountpoint;
1005 m = mnt->mnt_parent; 1043 m = mnt->mnt_parent;
1006 mnt->mnt_mountpoint = mnt->mnt_root; 1044 mnt->mnt_mountpoint = mnt->mnt_root;
1007 mnt->mnt_parent = mnt; 1045 mnt->mnt_parent = mnt;
1008 m->mnt_ghosts--; 1046 m->mnt_ghosts--;
1009 spin_unlock(&vfsmount_lock); 1047 br_write_unlock(vfsmount_lock);
1010 dput(dentry); 1048 dput(dentry);
1011 mntput(m); 1049 mntput(m);
1012 } 1050 }
@@ -1014,6 +1052,10 @@ void release_mounts(struct list_head *head)
1014 } 1052 }
1015} 1053}
1016 1054
1055/*
1056 * vfsmount lock must be held for write
1057 * namespace_sem must be held for write
1058 */
1017void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1059void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1018{ 1060{
1019 struct vfsmount *p; 1061 struct vfsmount *p;
@@ -1104,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1104 } 1146 }
1105 1147
1106 down_write(&namespace_sem); 1148 down_write(&namespace_sem);
1107 spin_lock(&vfsmount_lock); 1149 br_write_lock(vfsmount_lock);
1108 event++; 1150 event++;
1109 1151
1110 if (!(flags & MNT_DETACH)) 1152 if (!(flags & MNT_DETACH))
@@ -1116,9 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1116 umount_tree(mnt, 1, &umount_list); 1158 umount_tree(mnt, 1, &umount_list);
1117 retval = 0; 1159 retval = 0;
1118 } 1160 }
1119 spin_unlock(&vfsmount_lock); 1161 br_write_unlock(vfsmount_lock);
1120 if (retval)
1121 security_sb_umount_busy(mnt);
1122 up_write(&namespace_sem); 1162 up_write(&namespace_sem);
1123 release_mounts(&umount_list); 1163 release_mounts(&umount_list);
1124 return retval; 1164 return retval;
@@ -1230,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
1230 q = clone_mnt(p, p->mnt_root, flag); 1270 q = clone_mnt(p, p->mnt_root, flag);
1231 if (!q) 1271 if (!q)
1232 goto Enomem; 1272 goto Enomem;
1233 spin_lock(&vfsmount_lock); 1273 br_write_lock(vfsmount_lock);
1234 list_add_tail(&q->mnt_list, &res->mnt_list); 1274 list_add_tail(&q->mnt_list, &res->mnt_list);
1235 attach_mnt(q, &path); 1275 attach_mnt(q, &path);
1236 spin_unlock(&vfsmount_lock); 1276 br_write_unlock(vfsmount_lock);
1237 } 1277 }
1238 } 1278 }
1239 return res; 1279 return res;
1240Enomem: 1280Enomem:
1241 if (res) { 1281 if (res) {
1242 LIST_HEAD(umount_list); 1282 LIST_HEAD(umount_list);
1243 spin_lock(&vfsmount_lock); 1283 br_write_lock(vfsmount_lock);
1244 umount_tree(res, 0, &umount_list); 1284 umount_tree(res, 0, &umount_list);
1245 spin_unlock(&vfsmount_lock); 1285 br_write_unlock(vfsmount_lock);
1246 release_mounts(&umount_list); 1286 release_mounts(&umount_list);
1247 } 1287 }
1248 return NULL; 1288 return NULL;
@@ -1261,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
1261{ 1301{
1262 LIST_HEAD(umount_list); 1302 LIST_HEAD(umount_list);
1263 down_write(&namespace_sem); 1303 down_write(&namespace_sem);
1264 spin_lock(&vfsmount_lock); 1304 br_write_lock(vfsmount_lock);
1265 umount_tree(mnt, 0, &umount_list); 1305 umount_tree(mnt, 0, &umount_list);
1266 spin_unlock(&vfsmount_lock); 1306 br_write_unlock(vfsmount_lock);
1267 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1268 release_mounts(&umount_list); 1308 release_mounts(&umount_list);
1269} 1309}
@@ -1391,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1391 if (err) 1431 if (err)
1392 goto out_cleanup_ids; 1432 goto out_cleanup_ids;
1393 1433
1394 spin_lock(&vfsmount_lock); 1434 br_write_lock(vfsmount_lock);
1395 1435
1396 if (IS_MNT_SHARED(dest_mnt)) { 1436 if (IS_MNT_SHARED(dest_mnt)) {
1397 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1437 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1410,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1410 list_del_init(&child->mnt_hash); 1450 list_del_init(&child->mnt_hash);
1411 commit_tree(child); 1451 commit_tree(child);
1412 } 1452 }
1413 spin_unlock(&vfsmount_lock); 1453 br_write_unlock(vfsmount_lock);
1454
1414 return 0; 1455 return 0;
1415 1456
1416 out_cleanup_ids: 1457 out_cleanup_ids:
@@ -1435,28 +1476,38 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1435 if (cant_mount(path->dentry)) 1476 if (cant_mount(path->dentry))
1436 goto out_unlock; 1477 goto out_unlock;
1437 1478
1438 err = security_sb_check_sb(mnt, path);
1439 if (err)
1440 goto out_unlock;
1441
1442 err = -ENOENT;
1443 if (!d_unlinked(path->dentry)) 1479 if (!d_unlinked(path->dentry))
1444 err = attach_recursive_mnt(mnt, path, NULL); 1480 err = attach_recursive_mnt(mnt, path, NULL);
1445out_unlock: 1481out_unlock:
1446 mutex_unlock(&path->dentry->d_inode->i_mutex); 1482 mutex_unlock(&path->dentry->d_inode->i_mutex);
1447 if (!err)
1448 security_sb_post_addmount(mnt, path);
1449 return err; 1483 return err;
1450} 1484}
1451 1485
1452/* 1486/*
1487 * Sanity check the flags to change_mnt_propagation.
1488 */
1489
1490static int flags_to_propagation_type(int flags)
1491{
1492 int type = flags & ~MS_REC;
1493
1494 /* Fail if any non-propagation flags are set */
1495 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1496 return 0;
1497 /* Only one propagation flag should be set */
1498 if (!is_power_of_2(type))
1499 return 0;
1500 return type;
1501}
1502
1503/*
1453 * recursively change the type of the mountpoint. 1504 * recursively change the type of the mountpoint.
1454 */ 1505 */
1455static int do_change_type(struct path *path, int flag) 1506static int do_change_type(struct path *path, int flag)
1456{ 1507{
1457 struct vfsmount *m, *mnt = path->mnt; 1508 struct vfsmount *m, *mnt = path->mnt;
1458 int recurse = flag & MS_REC; 1509 int recurse = flag & MS_REC;
1459 int type = flag & ~MS_REC; 1510 int type;
1460 int err = 0; 1511 int err = 0;
1461 1512
1462 if (!capable(CAP_SYS_ADMIN)) 1513 if (!capable(CAP_SYS_ADMIN))
@@ -1465,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
1465 if (path->dentry != path->mnt->mnt_root) 1516 if (path->dentry != path->mnt->mnt_root)
1466 return -EINVAL; 1517 return -EINVAL;
1467 1518
1519 type = flags_to_propagation_type(flag);
1520 if (!type)
1521 return -EINVAL;
1522
1468 down_write(&namespace_sem); 1523 down_write(&namespace_sem);
1469 if (type == MS_SHARED) { 1524 if (type == MS_SHARED) {
1470 err = invent_group_ids(mnt, recurse); 1525 err = invent_group_ids(mnt, recurse);
@@ -1472,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
1472 goto out_unlock; 1527 goto out_unlock;
1473 } 1528 }
1474 1529
1475 spin_lock(&vfsmount_lock); 1530 br_write_lock(vfsmount_lock);
1476 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1531 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1477 change_mnt_propagation(m, type); 1532 change_mnt_propagation(m, type);
1478 spin_unlock(&vfsmount_lock); 1533 br_write_unlock(vfsmount_lock);
1479 1534
1480 out_unlock: 1535 out_unlock:
1481 up_write(&namespace_sem); 1536 up_write(&namespace_sem);
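
The new flags_to_propagation_type() helper above rejects any non-propagation flag and, via is_power_of_2(), insists that exactly one of MS_SHARED, MS_PRIVATE, MS_SLAVE and MS_UNBINDABLE is set once MS_REC is masked off. A standalone sketch of the same check; the flag values here are illustrative stand-ins, not the kernel's real MS_* constants:

#include <stdio.h>

/* Hypothetical stand-ins for the MS_* constants: one distinct bit each. */
#define MS_REC        (1u << 0)
#define MS_SHARED     (1u << 1)
#define MS_PRIVATE    (1u << 2)
#define MS_SLAVE      (1u << 3)
#define MS_UNBINDABLE (1u << 4)

/* Exactly one bit set, as the kernel's is_power_of_2() tests. */
static int is_power_of_2(unsigned int n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static unsigned int flags_to_propagation_type(unsigned int flags)
{
	unsigned int type = flags & ~MS_REC;

	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		return 0;		/* stray non-propagation flag */
	if (!is_power_of_2(type))
		return 0;		/* zero flags, or more than one */
	return type;
}

int main(void)
{
	printf("%u\n", flags_to_propagation_type(MS_SHARED | MS_REC));   /* accepted */
	printf("%u\n", flags_to_propagation_type(MS_SHARED | MS_SLAVE)); /* 0: ambiguous */
	return 0;
}

Returning 0 for "invalid" works because no valid propagation type is zero, which is what lets do_change_type() map the failure straight to -EINVAL.
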
@@ -1519,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
1519 err = graft_tree(mnt, path); 1574 err = graft_tree(mnt, path);
1520 if (err) { 1575 if (err) {
1521 LIST_HEAD(umount_list); 1576 LIST_HEAD(umount_list);
1522 spin_lock(&vfsmount_lock); 1577
1578 br_write_lock(vfsmount_lock);
1523 umount_tree(mnt, 0, &umount_list); 1579 umount_tree(mnt, 0, &umount_list);
1524 spin_unlock(&vfsmount_lock); 1580 br_write_unlock(vfsmount_lock);
1525 release_mounts(&umount_list); 1581 release_mounts(&umount_list);
1526 } 1582 }
1527 1583
@@ -1574,18 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1574 else 1630 else
1575 err = do_remount_sb(sb, flags, data, 0); 1631 err = do_remount_sb(sb, flags, data, 0);
1576 if (!err) { 1632 if (!err) {
1577 spin_lock(&vfsmount_lock); 1633 br_write_lock(vfsmount_lock);
1578 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; 1634 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1579 path->mnt->mnt_flags = mnt_flags; 1635 path->mnt->mnt_flags = mnt_flags;
1580 spin_unlock(&vfsmount_lock); 1636 br_write_unlock(vfsmount_lock);
1581 } 1637 }
1582 up_write(&sb->s_umount); 1638 up_write(&sb->s_umount);
1583 if (!err) { 1639 if (!err) {
1584 security_sb_post_remount(path->mnt, flags, data); 1640 br_write_lock(vfsmount_lock);
1585
1586 spin_lock(&vfsmount_lock);
1587 touch_mnt_namespace(path->mnt->mnt_ns); 1641 touch_mnt_namespace(path->mnt->mnt_ns);
1588 spin_unlock(&vfsmount_lock); 1642 br_write_unlock(vfsmount_lock);
1589 } 1643 }
1590 return err; 1644 return err;
1591} 1645}
@@ -1762,7 +1816,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1762 return; 1816 return;
1763 1817
1764 down_write(&namespace_sem); 1818 down_write(&namespace_sem);
1765 spin_lock(&vfsmount_lock); 1819 br_write_lock(vfsmount_lock);
1766 1820
1767 /* extract from the expiration list every vfsmount that matches the 1821 /* extract from the expiration list every vfsmount that matches the
1768 * following criteria: 1822 * following criteria:
@@ -1781,7 +1835,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1781 touch_mnt_namespace(mnt->mnt_ns); 1835 touch_mnt_namespace(mnt->mnt_ns);
1782 umount_tree(mnt, 1, &umounts); 1836 umount_tree(mnt, 1, &umounts);
1783 } 1837 }
1784 spin_unlock(&vfsmount_lock); 1838 br_write_unlock(vfsmount_lock);
1785 up_write(&namespace_sem); 1839 up_write(&namespace_sem);
1786 1840
1787 release_mounts(&umounts); 1841 release_mounts(&umounts);
@@ -1838,6 +1892,8 @@ resume:
1838/* 1892/*
1839 * process a list of expirable mountpoints with the intent of discarding any 1893 * process a list of expirable mountpoints with the intent of discarding any
1840 * submounts of a specific parent mountpoint 1894 * submounts of a specific parent mountpoint
1895 *
1896 * vfsmount_lock must be held for write
1841 */ 1897 */
1842static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) 1898static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
1843{ 1899{
@@ -1996,7 +2052,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1996 if (flags & MS_RDONLY) 2052 if (flags & MS_RDONLY)
1997 mnt_flags |= MNT_READONLY; 2053 mnt_flags |= MNT_READONLY;
1998 2054
1999 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | 2055 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2000 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2056 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2001 MS_STRICTATIME); 2057 MS_STRICTATIME);
2002 2058
@@ -2056,9 +2112,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2056 kfree(new_ns); 2112 kfree(new_ns);
2057 return ERR_PTR(-ENOMEM); 2113 return ERR_PTR(-ENOMEM);
2058 } 2114 }
2059 spin_lock(&vfsmount_lock); 2115 br_write_lock(vfsmount_lock);
2060 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 2116 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
2061 spin_unlock(&vfsmount_lock); 2117 br_write_unlock(vfsmount_lock);
2062 2118
2063 /* 2119 /*
2064 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 2120 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2220,10 +2276,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2220 goto out1; 2276 goto out1;
2221 } 2277 }
2222 2278
2223 read_lock(&current->fs->lock); 2279 get_fs_root(current->fs, &root);
2224 root = current->fs->root;
2225 path_get(&current->fs->root);
2226 read_unlock(&current->fs->lock);
2227 down_write(&namespace_sem); 2280 down_write(&namespace_sem);
2228 mutex_lock(&old.dentry->d_inode->i_mutex); 2281 mutex_lock(&old.dentry->d_inode->i_mutex);
2229 error = -EINVAL; 2282 error = -EINVAL;
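
get_fs_root() collapses the open-coded take-lock/copy/path_get/drop-lock sequence that pivot_root() used to carry. A kernel-context sketch of roughly what the 2.6.36 helper amounts to, assuming fs->lock is still the rwlock_t inside struct fs_struct at this point:

void get_fs_root(struct fs_struct *fs, struct path *root)
{
	read_lock(&fs->lock);
	*root = fs->root;	/* copy the path under the lock ... */
	path_get(root);		/* ... and pin it before dropping it */
	read_unlock(&fs->lock);
}

The caller owns a reference afterwards and must balance it with path_put(&root), which pivot_root() does on its exit paths.
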
@@ -2255,7 +2308,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2255 goto out2; /* not attached */ 2308 goto out2; /* not attached */
2256 /* make sure we can reach put_old from new_root */ 2309 /* make sure we can reach put_old from new_root */
2257 tmp = old.mnt; 2310 tmp = old.mnt;
2258 spin_lock(&vfsmount_lock); 2311 br_write_lock(vfsmount_lock);
2259 if (tmp != new.mnt) { 2312 if (tmp != new.mnt) {
2260 for (;;) { 2313 for (;;) {
2261 if (tmp->mnt_parent == tmp) 2314 if (tmp->mnt_parent == tmp)
@@ -2275,9 +2328,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2275 /* mount new_root on / */ 2328 /* mount new_root on / */
2276 attach_mnt(new.mnt, &root_parent); 2329 attach_mnt(new.mnt, &root_parent);
2277 touch_mnt_namespace(current->nsproxy->mnt_ns); 2330 touch_mnt_namespace(current->nsproxy->mnt_ns);
2278 spin_unlock(&vfsmount_lock); 2331 br_write_unlock(vfsmount_lock);
2279 chroot_fs_refs(&root, &new); 2332 chroot_fs_refs(&root, &new);
2280 security_sb_post_pivotroot(&root, &new);
2281 error = 0; 2333 error = 0;
2282 path_put(&root_parent); 2334 path_put(&root_parent);
2283 path_put(&parent_path); 2335 path_put(&parent_path);
@@ -2291,7 +2343,7 @@ out1:
2291out0: 2343out0:
2292 return error; 2344 return error;
2293out3: 2345out3:
2294 spin_unlock(&vfsmount_lock); 2346 br_write_unlock(vfsmount_lock);
2295 goto out2; 2347 goto out2;
2296} 2348}
2297 2349
@@ -2338,6 +2390,8 @@ void __init mnt_init(void)
2338 for (u = 0; u < HASH_SIZE; u++) 2390 for (u = 0; u < HASH_SIZE; u++)
2339 INIT_LIST_HEAD(&mount_hashtable[u]); 2391 INIT_LIST_HEAD(&mount_hashtable[u]);
2340 2392
2393 br_lock_init(vfsmount_lock);
2394
2341 err = sysfs_init(); 2395 err = sysfs_init();
2342 if (err) 2396 if (err)
2343 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 2397 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2356,9 +2410,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2356 if (!atomic_dec_and_test(&ns->count)) 2410 if (!atomic_dec_and_test(&ns->count))
2357 return; 2411 return;
2358 down_write(&namespace_sem); 2412 down_write(&namespace_sem);
2359 spin_lock(&vfsmount_lock); 2413 br_write_lock(vfsmount_lock);
2360 umount_tree(ns->root, 0, &umount_list); 2414 umount_tree(ns->root, 0, &umount_list);
2361 spin_unlock(&vfsmount_lock); 2415 br_write_unlock(vfsmount_lock);
2362 up_write(&namespace_sem); 2416 up_write(&namespace_sem);
2363 release_mounts(&umount_list); 2417 release_mounts(&umount_list);
2364 kfree(ns); 2418 kfree(ns);
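
Every spin_lock(&vfsmount_lock) pair in this file becomes br_write_lock()/br_write_unlock(): in 2.6.36 vfsmount_lock turns into a "big reader" lock (the lglock/brlock machinery behind the br_lock_init() call added above), where a reader takes only its own per-CPU lock while a writer must take all of them. A standalone sketch of the idea, substituting per-thread pthread mutexes for per-CPU spinlocks; NR_SLOTS and the names are illustrative:

#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 4	/* stand-in for the per-CPU dimension */

static pthread_mutex_t br_lock[NR_SLOTS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Readers touch one slot only: cheap, no cross-CPU cacheline traffic. */
static void br_read_lock(int slot)   { pthread_mutex_lock(&br_lock[slot]); }
static void br_read_unlock(int slot) { pthread_mutex_unlock(&br_lock[slot]); }

/* Writers take every slot, excluding all readers at once: expensive,
 * which is acceptable for rare events like mount and umount. */
static void br_write_lock(void)
{
	for (int i = 0; i < NR_SLOTS; i++)
		pthread_mutex_lock(&br_lock[i]);
}

static void br_write_unlock(void)
{
	for (int i = NR_SLOTS - 1; i >= 0; i--)
		pthread_mutex_unlock(&br_lock[i]);
}

int main(void)
{
	br_read_lock(0);	/* e.g. a path walk */
	br_read_unlock(0);
	br_write_lock();	/* e.g. umount_tree() */
	br_write_unlock();
	puts("ok");
	return 0;
}

Since this hunk series only shows write-side conversions, the payoff is on the read side (path lookup), which no longer bounces a global spinlock cacheline between CPUs.
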
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e52..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
49 49
50const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
51{ 51{
52 .llseek = generic_file_llseek,
52 .read = generic_read_dir, 53 .read = generic_read_dir,
53 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
54 .ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
55#ifdef CONFIG_COMPAT 56#ifdef CONFIG_COMPAT
56 .compat_ioctl = ncp_compat_ioctl, 57 .compat_ioctl = ncp_compat_ioctl,
57#endif 58#endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a5..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -22,7 +22,7 @@
22#include <linux/ncp_fs.h> 22#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 23#include "ncplib_kernel.h"
24 24
25static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) 25static int ncp_fsync(struct file *file, int datasync)
26{ 26{
27 return 0; 27 return 0;
28} 28}
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
295 .llseek = ncp_remote_llseek, 295 .llseek = ncp_remote_llseek,
296 .read = ncp_file_read, 296 .read = ncp_file_read,
297 .write = ncp_file_write, 297 .write = ncp_file_write,
298 .ioctl = ncp_ioctl, 298 .unlocked_ioctl = ncp_ioctl,
299#ifdef CONFIG_COMPAT 299#ifdef CONFIG_COMPAT
300 .compat_ioctl = ncp_compat_ioctl, 300 .compat_ioctl = ncp_compat_ioctl,
301#endif 301#endif
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index fa3385154023..b4de38cf49f5 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -43,7 +43,7 @@
43#define NCP_DEFAULT_TIME_OUT 10 43#define NCP_DEFAULT_TIME_OUT 10
44#define NCP_DEFAULT_RETRY_COUNT 20 44#define NCP_DEFAULT_RETRY_COUNT 20
45 45
46static void ncp_delete_inode(struct inode *); 46static void ncp_evict_inode(struct inode *);
47static void ncp_put_super(struct super_block *); 47static void ncp_put_super(struct super_block *);
48static int ncp_statfs(struct dentry *, struct kstatfs *); 48static int ncp_statfs(struct dentry *, struct kstatfs *);
49static int ncp_show_options(struct seq_file *, struct vfsmount *); 49static int ncp_show_options(struct seq_file *, struct vfsmount *);
@@ -100,7 +100,7 @@ static const struct super_operations ncp_sops =
100 .alloc_inode = ncp_alloc_inode, 100 .alloc_inode = ncp_alloc_inode,
101 .destroy_inode = ncp_destroy_inode, 101 .destroy_inode = ncp_destroy_inode,
102 .drop_inode = generic_delete_inode, 102 .drop_inode = generic_delete_inode,
103 .delete_inode = ncp_delete_inode, 103 .evict_inode = ncp_evict_inode,
104 .put_super = ncp_put_super, 104 .put_super = ncp_put_super,
105 .statfs = ncp_statfs, 105 .statfs = ncp_statfs,
106 .remount_fs = ncp_remount, 106 .remount_fs = ncp_remount,
@@ -282,19 +282,19 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
282} 282}
283 283
284static void 284static void
285ncp_delete_inode(struct inode *inode) 285ncp_evict_inode(struct inode *inode)
286{ 286{
287 truncate_inode_pages(&inode->i_data, 0); 287 truncate_inode_pages(&inode->i_data, 0);
288 end_writeback(inode);
288 289
289 if (S_ISDIR(inode->i_mode)) { 290 if (S_ISDIR(inode->i_mode)) {
290 DDPRINTK("ncp_delete_inode: put directory %ld\n", inode->i_ino); 291 DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
291 } 292 }
292 293
293 if (ncp_make_closed(inode) != 0) { 294 if (ncp_make_closed(inode) != 0) {
294 /* We can't do anything but complain. */ 295 /* We can't do anything but complain. */
295 printk(KERN_ERR "ncp_delete_inode: could not close\n"); 296 printk(KERN_ERR "ncp_evict_inode: could not close\n");
296 } 297 }
297 clear_inode(inode);
298} 298}
299 299
300static void ncp_stop_tasks(struct ncp_server *server) { 300static void ncp_stop_tasks(struct ncp_server *server) {
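
The delete_inode/clear_inode pair is folded into a single ->evict_inode() method in 2.6.36: truncate the page cache, call end_writeback() to finish the VFS-side teardown (replacing the trailing clear_inode() call), then do filesystem-private cleanup. A hedged kernel-context skeleton of the converted shape, following ncp_evict_inode() above:

static void example_evict_inode(struct inode *inode)
{
	/* Drop any cached pages belonging to this inode. */
	truncate_inode_pages(&inode->i_data, 0);
	/* Tell the VFS the inode is done with writeback; this is what
	 * previously happened implicitly via clear_inode(). */
	end_writeback(inode);
	/* Filesystem-private teardown comes last, e.g. closing the
	 * remote file as ncp_make_closed() does above. */
}
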
@@ -728,8 +728,8 @@ out_fput:
728out_bdi: 728out_bdi:
729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: 729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
730 * 730 *
731 * The previously used put_filp(ncp_filp); was bogous, since 731 * The previously used put_filp(ncp_filp); was bogus, since
732 * it doesn't proper unlocking. 732 * it doesn't perform proper unlocking.
733 */ 733 */
734 fput(ncp_filp); 734 fput(ncp_filp);
735out: 735out:
@@ -924,9 +924,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
924 tmpattr.ia_valid = ATTR_MODE; 924 tmpattr.ia_valid = ATTR_MODE;
925 tmpattr.ia_mode = attr->ia_mode; 925 tmpattr.ia_mode = attr->ia_mode;
926 926
927 result = inode_setattr(inode, &tmpattr); 927 setattr_copy(inode, &tmpattr);
928 if (result) 928 mark_inode_dirty(inode);
929 goto out;
930 } 929 }
931 } 930 }
932#endif 931#endif
@@ -954,15 +953,12 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
954 result = ncp_make_closed(inode); 953 result = ncp_make_closed(inode);
955 if (result) 954 if (result)
956 goto out; 955 goto out;
957 { 956
958 struct iattr tmpattr; 957 if (attr->ia_size != i_size_read(inode)) {
959 958 result = vmtruncate(inode, attr->ia_size);
960 tmpattr.ia_valid = ATTR_SIZE;
961 tmpattr.ia_size = attr->ia_size;
962
963 result = inode_setattr(inode, &tmpattr);
964 if (result) 959 if (result)
965 goto out; 960 goto out;
961 mark_inode_dirty(inode);
966 } 962 }
967 } 963 }
968 if ((attr->ia_valid & ATTR_CTIME) != 0) { 964 if ((attr->ia_valid & ATTR_CTIME) != 0) {
@@ -1002,8 +998,12 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
1002 NCP_FINFO(inode)->nwattr = info.attributes; 998 NCP_FINFO(inode)->nwattr = info.attributes;
1003#endif 999#endif
1004 } 1000 }
1005 if (!result) 1001 if (result)
1006 result = inode_setattr(inode, attr); 1002 goto out;
1003
1004 setattr_copy(inode, attr);
1005 mark_inode_dirty(inode);
1006
1007out: 1007out:
1008 unlock_kernel(); 1008 unlock_kernel();
1009 return result; 1009 return result;
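
inode_setattr() is gone in 2.6.36, and each ->setattr instance now spells out its two halves: an explicit size change (vmtruncate() here) and setattr_copy(), which copies the remaining uid/gid/mode/timestamps into the inode and cannot fail, followed by mark_inode_dirty(). A sketch of the resulting tail of a ->setattr method, under those assumptions:

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}
	setattr_copy(inode, attr);	/* void, cannot fail */
	mark_inode_dirty(inode);
	return 0;

Splitting the call is what lets ncp_notify_change() interleave its own server round-trips between the size change and the attribute copy.
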
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea8..84a8cfc4e38e 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -261,9 +261,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
261} 261}
262#endif /* CONFIG_NCPFS_NLS */ 262#endif /* CONFIG_NCPFS_NLS */
263 263
264static int __ncp_ioctl(struct inode *inode, struct file *filp, 264static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
265 unsigned int cmd, unsigned long arg)
266{ 265{
266 struct inode *inode = filp->f_dentry->d_inode;
267 struct ncp_server *server = NCP_SERVER(inode); 267 struct ncp_server *server = NCP_SERVER(inode);
268 int result; 268 int result;
269 struct ncp_ioctl_request request; 269 struct ncp_ioctl_request request;
@@ -841,11 +841,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
841 } 841 }
842} 842}
843 843
844int ncp_ioctl(struct inode *inode, struct file *filp, 844long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
845 unsigned int cmd, unsigned long arg)
846{ 845{
847 int ret; 846 long ret;
848 847
848 lock_kernel();
849 if (ncp_ioctl_need_write(cmd)) { 849 if (ncp_ioctl_need_write(cmd)) {
850 /* 850 /*
851 * inside the ioctl(), any failures which 851 * inside the ioctl(), any failures which
@@ -853,24 +853,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
853 * -EACCESS, so it seems consistent to keep 853 * -EACCESS, so it seems consistent to keep
854 * that here. 854 * that here.
855 */ 855 */
856 if (mnt_want_write(filp->f_path.mnt)) 856 if (mnt_want_write(filp->f_path.mnt)) {
857 return -EACCES; 857 ret = -EACCES;
858 goto out;
859 }
858 } 860 }
859 ret = __ncp_ioctl(inode, filp, cmd, arg); 861 ret = __ncp_ioctl(filp, cmd, arg);
860 if (ncp_ioctl_need_write(cmd)) 862 if (ncp_ioctl_need_write(cmd))
861 mnt_drop_write(filp->f_path.mnt); 863 mnt_drop_write(filp->f_path.mnt);
864
865out:
866 unlock_kernel();
862 return ret; 867 return ret;
863} 868}
864 869
865#ifdef CONFIG_COMPAT 870#ifdef CONFIG_COMPAT
866long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 871long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
867{ 872{
868 struct inode *inode = file->f_path.dentry->d_inode; 873 long ret;
869 int ret;
870 874
871 lock_kernel(); 875 lock_kernel();
872 arg = (unsigned long) compat_ptr(arg); 876 arg = (unsigned long) compat_ptr(arg);
873 ret = ncp_ioctl(inode, file, cmd, arg); 877 ret = ncp_ioctl(file, cmd, arg);
874 unlock_kernel(); 878 unlock_kernel();
875 return ret; 879 return ret;
876} 880}
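
This is the standard ->ioctl to ->unlocked_ioctl migration: the inode argument disappears from the prototype (recovered from the file when needed), the return type widens to long, and the Big Kernel Lock, which the VFS used to take around ->ioctl, must now be taken by the handler itself if it still depends on it. A minimal sketch of the converted entry point; do_example_ioctl() is a hypothetical worker, not an ncpfs function:

static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	long ret;

	lock_kernel();		/* the VFS no longer takes the BKL for us */
	ret = do_example_ioctl(inode, cmd, arg);
	unlock_kernel();
	return ret;
}

Keeping the BKL explicitly, as ncp_ioctl() does above, preserves the old serialization while making the dependency visible so it can later be removed driver by driver.
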
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index a43d07e7b924..f7e13db613cb 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -61,9 +61,9 @@ config NFS_V3_ACL
61 If unsure, say N. 61 If unsure, say N.
62 62
63config NFS_V4 63config NFS_V4
64 bool "NFS client support for NFS version 4 (EXPERIMENTAL)" 64 bool "NFS client support for NFS version 4"
65 depends on NFS_FS && EXPERIMENTAL 65 depends on NFS_FS
66 select RPCSEC_GSS_KRB5 66 select SUNRPC_GSS
67 help 67 help
68 This option enables support for version 4 of the NFS protocol 68 This option enables support for version 4 of the NFS protocol
69 (RFC 3530) in the kernel's NFS client. 69 (RFC 3530) in the kernel's NFS client.
@@ -72,16 +72,16 @@ config NFS_V4
72 space programs which can be found in the Linux nfs-utils package, 72 space programs which can be found in the Linux nfs-utils package,
73 available from http://linux-nfs.org/. 73 available from http://linux-nfs.org/.
74 74
75 If unsure, say N. 75 If unsure, say Y.
76 76
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_V4 && EXPERIMENTAL 79 depends on NFS_V4 && EXPERIMENTAL
80 help 80 help
81 This option enables support for minor version 1 of the NFSv4 protocol 81 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. 82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
83 83
84 Unless you're an NFS developer, say N. 84 If unsure, say N.
85 85
86config ROOT_NFS 86config ROOT_NFS
87 bool "Root file system on NFS" 87 bool "Root file system on NFS"
@@ -100,3 +100,20 @@ config NFS_FSCACHE
100 help 100 help
101 Say Y here if you want NFS data to be cached locally on disc through 101 Say Y here if you want NFS data to be cached locally on disc through
102 the general filesystem cache manager 102 the general filesystem cache manager
103
104config NFS_USE_LEGACY_DNS
105 bool "Use the legacy NFS DNS resolver"
106 depends on NFS_V4
107 help
108 The kernel now provides a method for translating a host name into an
109 IP address. Select Y here if you would rather use your own DNS
110 resolver script.
111
112 If unsure, say N
113
114config NFS_USE_KERNEL_DNS
115 bool
116 depends on NFS_V4 && !NFS_USE_LEGACY_DNS
117 select DNS_RESOLVER
118 select KEYS
119 default y
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 36dfdae95123..e17b49e2eabd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -45,7 +45,7 @@ unsigned short nfs_callback_tcpport;
45unsigned short nfs_callback_tcpport6; 45unsigned short nfs_callback_tcpport6;
46#define NFS_CALLBACK_MAXPORTNR (65535U) 46#define NFS_CALLBACK_MAXPORTNR (65535U)
47 47
48static int param_set_portnr(const char *val, struct kernel_param *kp) 48static int param_set_portnr(const char *val, const struct kernel_param *kp)
49{ 49{
50 unsigned long num; 50 unsigned long num;
51 int ret; 51 int ret;
@@ -58,11 +58,10 @@ static int param_set_portnr(const char *val, struct kernel_param *kp)
58 *((unsigned int *)kp->arg) = num; 58 *((unsigned int *)kp->arg) = num;
59 return 0; 59 return 0;
60} 60}
61 61static struct kernel_param_ops param_ops_portnr = {
62static int param_get_portnr(char *buffer, struct kernel_param *kp) 62 .set = param_set_portnr,
63{ 63 .get = param_get_uint,
64 return param_get_uint(buffer, kp); 64};
65}
66#define param_check_portnr(name, p) __param_check(name, p, unsigned int); 65#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
67 66
68module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); 67module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
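
2.6.36 moves module parameters from raw get/set function pointers to a struct kernel_param_ops, and the setter now takes a const struct kernel_param *. The naming is load-bearing: module_param_named(..., portnr, 0644) expands to references to param_ops_portnr and param_check_portnr(), which is why the ops table and the check macro keep those exact names. A hedged sketch of a range-checked unsigned parameter in the new style (the "bounded" names are illustrative):

static int param_set_bounded(const char *val, const struct kernel_param *kp)
{
	unsigned long num;

	if (!val || strict_strtoul(val, 0, &num) || num > 65535)
		return -EINVAL;
	*(unsigned int *)kp->arg = num;
	return 0;
}

static struct kernel_param_ops param_ops_bounded = {
	.set = param_set_bounded,
	.get = param_get_uint,	/* the stock getter slots straight in */
};
#define param_check_bounded(name, p) __param_check(name, p, unsigned int)

This also removes the param_get_portnr() wrapper above: with an ops table, param_get_uint can be used directly as the .get member.
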
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a08770a7e857..930d10fecdaf 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
37 if (inode == NULL) 37 if (inode == NULL)
38 goto out_putclient; 38 goto out_putclient;
39 nfsi = NFS_I(inode); 39 nfsi = NFS_I(inode);
40 down_read(&nfsi->rwsem); 40 rcu_read_lock();
41 delegation = nfsi->delegation; 41 delegation = rcu_dereference(nfsi->delegation);
42 if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) 42 if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
43 goto out_iput; 43 goto out_iput;
44 res->size = i_size_read(inode); 44 res->size = i_size_read(inode);
@@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
53 args->bitmap[1]; 53 args->bitmap[1];
54 res->status = 0; 54 res->status = 0;
55out_iput: 55out_iput:
56 up_read(&nfsi->rwsem); 56 rcu_read_unlock();
57 iput(inode); 57 iput(inode);
58out_putclient: 58out_putclient:
59 nfs_put_client(clp); 59 nfs_put_client(clp);
@@ -62,16 +62,6 @@ out:
62 return res->status; 62 return res->status;
63} 63}
64 64
65static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
66{
67#if defined(CONFIG_NFS_V4_1)
68 if (clp->cl_minorversion > 0)
69 return nfs41_validate_delegation_stateid;
70#endif
71 return nfs4_validate_delegation_stateid;
72}
73
74
75__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 65__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
76{ 66{
77 struct nfs_client *clp; 67 struct nfs_client *clp;
@@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
92 inode = nfs_delegation_find_inode(clp, &args->fh); 82 inode = nfs_delegation_find_inode(clp, &args->fh);
93 if (inode != NULL) { 83 if (inode != NULL) {
94 /* Set up a helper thread to actually return the delegation */ 84 /* Set up a helper thread to actually return the delegation */
95 switch (nfs_async_inode_return_delegation(inode, &args->stateid, 85 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
96 nfs_validate_delegation_stateid(clp))) {
97 case 0: 86 case 0:
98 res = 0; 87 res = 0;
99 break; 88 break;
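
The getattr callback drops nfsi->rwsem in favour of an RCU read section: nfsi->delegation is an RCU-managed pointer, so the reader brackets its access with rcu_read_lock()/rcu_read_unlock() and loads the pointer through rcu_dereference(). The shape of the pattern, as a sketch:

	rcu_read_lock();
	delegation = rcu_dereference(nfsi->delegation);
	if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
		goto out_unlock;
	/* read delegation fields; must not sleep, and must not keep
	 * the pointer once the read section ends */
out_unlock:
	rcu_read_unlock();

A concurrent delegation return frees the structure only after a grace period, so everything read inside the bracket stays valid without taking the semaphore.
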
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index acc9c4943b84..e7340729af89 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
150 clp->cl_boot_time = CURRENT_TIME; 150 clp->cl_boot_time = CURRENT_TIME;
151 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; 151 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
152 clp->cl_minorversion = cl_init->minorversion; 152 clp->cl_minorversion = cl_init->minorversion;
153 clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
153#endif 154#endif
154 cred = rpc_lookup_machine_cred(); 155 cred = rpc_lookup_machine_cred();
155 if (!IS_ERR(cred)) 156 if (!IS_ERR(cred))
@@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
178 clp->cl_session = NULL; 179 clp->cl_session = NULL;
179 } 180 }
180 181
181 clp->cl_call_sync = _nfs4_call_sync; 182 clp->cl_mvops = nfs_v4_minor_ops[0];
182#endif /* CONFIG_NFS_V4_1 */ 183#endif /* CONFIG_NFS_V4_1 */
183} 184}
184 185
@@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
188static void nfs4_destroy_callback(struct nfs_client *clp) 189static void nfs4_destroy_callback(struct nfs_client *clp)
189{ 190{
190 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) 191 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
191 nfs_callback_down(clp->cl_minorversion); 192 nfs_callback_down(clp->cl_mvops->minor_version);
192} 193}
193 194
194static void nfs4_shutdown_client(struct nfs_client *clp) 195static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -274,7 +275,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
274 sin1->sin6_scope_id != sin2->sin6_scope_id) 275 sin1->sin6_scope_id != sin2->sin6_scope_id)
275 return 0; 276 return 0;
276 277
277 return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); 278 return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
278} 279}
279#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ 280#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
280static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, 281static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -934,7 +935,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
934 } 935 }
935 936
936 fsinfo.fattr = fattr; 937 fsinfo.fattr = fattr;
937 nfs_fattr_init(fattr);
938 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 938 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
939 if (error < 0) 939 if (error < 0)
940 goto out_error; 940 goto out_error;
@@ -1047,13 +1047,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1047 struct nfs_fh *mntfh) 1047 struct nfs_fh *mntfh)
1048{ 1048{
1049 struct nfs_server *server; 1049 struct nfs_server *server;
1050 struct nfs_fattr fattr; 1050 struct nfs_fattr *fattr;
1051 int error; 1051 int error;
1052 1052
1053 server = nfs_alloc_server(); 1053 server = nfs_alloc_server();
1054 if (!server) 1054 if (!server)
1055 return ERR_PTR(-ENOMEM); 1055 return ERR_PTR(-ENOMEM);
1056 1056
1057 error = -ENOMEM;
1058 fattr = nfs_alloc_fattr();
1059 if (fattr == NULL)
1060 goto error;
1061
1057 /* Get a client representation */ 1062 /* Get a client representation */
1058 error = nfs_init_server(server, data); 1063 error = nfs_init_server(server, data);
1059 if (error < 0) 1064 if (error < 0)
@@ -1064,7 +1069,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1064 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1069 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1065 1070
1066 /* Probe the root fh to retrieve its FSID */ 1071 /* Probe the root fh to retrieve its FSID */
1067 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1072 error = nfs_probe_fsinfo(server, mntfh, fattr);
1068 if (error < 0) 1073 if (error < 0)
1069 goto error; 1074 goto error;
1070 if (server->nfs_client->rpc_ops->version == 3) { 1075 if (server->nfs_client->rpc_ops->version == 3) {
@@ -1077,14 +1082,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1077 server->namelen = NFS2_MAXNAMLEN; 1082 server->namelen = NFS2_MAXNAMLEN;
1078 } 1083 }
1079 1084
1080 if (!(fattr.valid & NFS_ATTR_FATTR)) { 1085 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1081 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 1086 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
1082 if (error < 0) { 1087 if (error < 0) {
1083 dprintk("nfs_create_server: getattr error = %d\n", -error); 1088 dprintk("nfs_create_server: getattr error = %d\n", -error);
1084 goto error; 1089 goto error;
1085 } 1090 }
1086 } 1091 }
1087 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); 1092 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
1088 1093
1089 dprintk("Server FSID: %llx:%llx\n", 1094 dprintk("Server FSID: %llx:%llx\n",
1090 (unsigned long long) server->fsid.major, 1095 (unsigned long long) server->fsid.major,
@@ -1096,9 +1101,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1096 spin_unlock(&nfs_client_lock); 1101 spin_unlock(&nfs_client_lock);
1097 1102
1098 server->mount_time = jiffies; 1103 server->mount_time = jiffies;
1104 nfs_free_fattr(fattr);
1099 return server; 1105 return server;
1100 1106
1101error: 1107error:
1108 nfs_free_fattr(fattr);
1102 nfs_free_server(server); 1109 nfs_free_server(server);
1103 return ERR_PTR(error); 1110 return ERR_PTR(error);
1104} 1111}
@@ -1120,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
1120 return error; 1127 return error;
1121 } 1128 }
1122 1129
1123 error = nfs_callback_up(clp->cl_minorversion, 1130 error = nfs_callback_up(clp->cl_mvops->minor_version,
1124 clp->cl_rpcclient->cl_xprt); 1131 clp->cl_rpcclient->cl_xprt);
1125 if (error < 0) { 1132 if (error < 0) {
1126 dprintk("%s: failed to start callback. Error = %d\n", 1133 dprintk("%s: failed to start callback. Error = %d\n",
@@ -1137,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp)
1137 */ 1144 */
1138static int nfs4_init_client_minor_version(struct nfs_client *clp) 1145static int nfs4_init_client_minor_version(struct nfs_client *clp)
1139{ 1146{
1140 clp->cl_call_sync = _nfs4_call_sync;
1141
1142#if defined(CONFIG_NFS_V4_1) 1147#if defined(CONFIG_NFS_V4_1)
1143 if (clp->cl_minorversion) { 1148 if (clp->cl_mvops->minor_version) {
1144 struct nfs4_session *session = NULL; 1149 struct nfs4_session *session = NULL;
1145 /* 1150 /*
1146 * Create the session and mark it expired. 1151 * Create the session and mark it expired.
@@ -1152,7 +1157,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1152 return -ENOMEM; 1157 return -ENOMEM;
1153 1158
1154 clp->cl_session = session; 1159 clp->cl_session = session;
1155 clp->cl_call_sync = _nfs4_call_sync_session; 1160 /*
1161 * The create session reply races with the server back
1162 * channel probe. Mark the client NFS_CS_SESSION_INITING
1163 * so that the client back channel can find the
1164 * nfs_client struct
1165 */
1166 clp->cl_cons_state = NFS_CS_SESSION_INITING;
1156 } 1167 }
1157#endif /* CONFIG_NFS_V4_1 */ 1168#endif /* CONFIG_NFS_V4_1 */
1158 1169
@@ -1280,6 +1291,55 @@ static void nfs4_session_set_rwsize(struct nfs_server *server)
1280#endif /* CONFIG_NFS_V4_1 */ 1291#endif /* CONFIG_NFS_V4_1 */
1281} 1292}
1282 1293
1294static int nfs4_server_common_setup(struct nfs_server *server,
1295 struct nfs_fh *mntfh)
1296{
1297 struct nfs_fattr *fattr;
1298 int error;
1299
1300 BUG_ON(!server->nfs_client);
1301 BUG_ON(!server->nfs_client->rpc_ops);
1302 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1303
1304 fattr = nfs_alloc_fattr();
1305 if (fattr == NULL)
1306 return -ENOMEM;
1307
1308 /* We must ensure the session is initialised first */
1309 error = nfs4_init_session(server);
1310 if (error < 0)
1311 goto out;
1312
1313 /* Probe the root fh to retrieve its FSID and filehandle */
1314 error = nfs4_get_rootfh(server, mntfh);
1315 if (error < 0)
1316 goto out;
1317
1318 dprintk("Server FSID: %llx:%llx\n",
1319 (unsigned long long) server->fsid.major,
1320 (unsigned long long) server->fsid.minor);
1321 dprintk("Mount FH: %d\n", mntfh->size);
1322
1323 nfs4_session_set_rwsize(server);
1324
1325 error = nfs_probe_fsinfo(server, mntfh, fattr);
1326 if (error < 0)
1327 goto out;
1328
1329 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1330 server->namelen = NFS4_MAXNAMLEN;
1331
1332 spin_lock(&nfs_client_lock);
1333 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1334 list_add_tail(&server->master_link, &nfs_volume_list);
1335 spin_unlock(&nfs_client_lock);
1336
1337 server->mount_time = jiffies;
1338out:
1339 nfs_free_fattr(fattr);
1340 return error;
1341}
1342
1283/* 1343/*
1284 * Create a version 4 volume record 1344 * Create a version 4 volume record
1285 */ 1345 */
@@ -1340,7 +1400,6 @@ error:
1340struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, 1400struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1341 struct nfs_fh *mntfh) 1401 struct nfs_fh *mntfh)
1342{ 1402{
1343 struct nfs_fattr fattr;
1344 struct nfs_server *server; 1403 struct nfs_server *server;
1345 int error; 1404 int error;
1346 1405
@@ -1355,39 +1414,10 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1355 if (error < 0) 1414 if (error < 0)
1356 goto error; 1415 goto error;
1357 1416
1358 BUG_ON(!server->nfs_client); 1417 error = nfs4_server_common_setup(server, mntfh);
1359 BUG_ON(!server->nfs_client->rpc_ops);
1360 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1361
1362 error = nfs4_init_session(server);
1363 if (error < 0)
1364 goto error;
1365
1366 /* Probe the root fh to retrieve its FSID */
1367 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
1368 if (error < 0)
1369 goto error;
1370
1371 dprintk("Server FSID: %llx:%llx\n",
1372 (unsigned long long) server->fsid.major,
1373 (unsigned long long) server->fsid.minor);
1374 dprintk("Mount FH: %d\n", mntfh->size);
1375
1376 nfs4_session_set_rwsize(server);
1377
1378 error = nfs_probe_fsinfo(server, mntfh, &fattr);
1379 if (error < 0) 1418 if (error < 0)
1380 goto error; 1419 goto error;
1381 1420
1382 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1383 server->namelen = NFS4_MAXNAMLEN;
1384
1385 spin_lock(&nfs_client_lock);
1386 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1387 list_add_tail(&server->master_link, &nfs_volume_list);
1388 spin_unlock(&nfs_client_lock);
1389
1390 server->mount_time = jiffies;
1391 dprintk("<-- nfs4_create_server() = %p\n", server); 1421 dprintk("<-- nfs4_create_server() = %p\n", server);
1392 return server; 1422 return server;
1393 1423
@@ -1405,7 +1435,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1405{ 1435{
1406 struct nfs_client *parent_client; 1436 struct nfs_client *parent_client;
1407 struct nfs_server *server, *parent_server; 1437 struct nfs_server *server, *parent_server;
1408 struct nfs_fattr fattr;
1409 int error; 1438 int error;
1410 1439
1411 dprintk("--> nfs4_create_referral_server()\n"); 1440 dprintk("--> nfs4_create_referral_server()\n");
@@ -1430,7 +1459,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1430 data->authflavor, 1459 data->authflavor,
1431 parent_server->client->cl_xprt->prot, 1460 parent_server->client->cl_xprt->prot,
1432 parent_server->client->cl_timeout, 1461 parent_server->client->cl_timeout,
1433 parent_client->cl_minorversion); 1462 parent_client->cl_mvops->minor_version);
1434 if (error < 0) 1463 if (error < 0)
1435 goto error; 1464 goto error;
1436 1465
@@ -1438,34 +1467,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1438 if (error < 0) 1467 if (error < 0)
1439 goto error; 1468 goto error;
1440 1469
1441 BUG_ON(!server->nfs_client); 1470 error = nfs4_server_common_setup(server, mntfh);
1442 BUG_ON(!server->nfs_client->rpc_ops);
1443 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1444
1445 /* Probe the root fh to retrieve its FSID and filehandle */
1446 error = nfs4_path_walk(server, mntfh, data->mnt_path);
1447 if (error < 0) 1471 if (error < 0)
1448 goto error; 1472 goto error;
1449 1473
1450 /* probe the filesystem info for this server filesystem */
1451 error = nfs_probe_fsinfo(server, mntfh, &fattr);
1452 if (error < 0)
1453 goto error;
1454
1455 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1456 server->namelen = NFS4_MAXNAMLEN;
1457
1458 dprintk("Referral FSID: %llx:%llx\n",
1459 (unsigned long long) server->fsid.major,
1460 (unsigned long long) server->fsid.minor);
1461
1462 spin_lock(&nfs_client_lock);
1463 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1464 list_add_tail(&server->master_link, &nfs_volume_list);
1465 spin_unlock(&nfs_client_lock);
1466
1467 server->mount_time = jiffies;
1468
1469 dprintk("<-- nfs_create_referral_server() = %p\n", server); 1474 dprintk("<-- nfs_create_referral_server() = %p\n", server);
1470 return server; 1475 return server;
1471 1476
@@ -1485,7 +1490,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1485 struct nfs_fattr *fattr) 1490 struct nfs_fattr *fattr)
1486{ 1491{
1487 struct nfs_server *server; 1492 struct nfs_server *server;
1488 struct nfs_fattr fattr_fsinfo; 1493 struct nfs_fattr *fattr_fsinfo;
1489 int error; 1494 int error;
1490 1495
1491 dprintk("--> nfs_clone_server(,%llx:%llx,)\n", 1496 dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
@@ -1496,6 +1501,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1496 if (!server) 1501 if (!server)
1497 return ERR_PTR(-ENOMEM); 1502 return ERR_PTR(-ENOMEM);
1498 1503
1504 error = -ENOMEM;
1505 fattr_fsinfo = nfs_alloc_fattr();
1506 if (fattr_fsinfo == NULL)
1507 goto out_free_server;
1508
1499 /* Copy data from the source */ 1509 /* Copy data from the source */
1500 server->nfs_client = source->nfs_client; 1510 server->nfs_client = source->nfs_client;
1501 atomic_inc(&server->nfs_client->cl_count); 1511 atomic_inc(&server->nfs_client->cl_count);
@@ -1512,7 +1522,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1512 nfs_init_server_aclclient(server); 1522 nfs_init_server_aclclient(server);
1513 1523
1514 /* probe the filesystem info for this server filesystem */ 1524 /* probe the filesystem info for this server filesystem */
1515 error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); 1525 error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
1516 if (error < 0) 1526 if (error < 0)
1517 goto out_free_server; 1527 goto out_free_server;
1518 1528
@@ -1534,10 +1544,12 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1534 1544
1535 server->mount_time = jiffies; 1545 server->mount_time = jiffies;
1536 1546
1547 nfs_free_fattr(fattr_fsinfo);
1537 dprintk("<-- nfs_clone_server() = %p\n", server); 1548 dprintk("<-- nfs_clone_server() = %p\n", server);
1538 return server; 1549 return server;
1539 1550
1540out_free_server: 1551out_free_server:
1552 nfs_free_fattr(fattr_fsinfo);
1541 nfs_free_server(server); 1553 nfs_free_server(server);
1542 dprintk("<-- nfs_clone_server() = error %d\n", error); 1554 dprintk("<-- nfs_clone_server() = error %d\n", error);
1543 return ERR_PTR(error); 1555 return ERR_PTR(error);
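
The recurring change in this file replaces on-stack struct nfs_fattr variables with heap allocations; the structure has grown large enough that keeping it off the kernel stack is worthwhile. nfs_alloc_fattr() also initialises the structure, which is why the explicit nfs_fattr_init() call above could be deleted. The conversion pattern, sketched:

	struct nfs_fattr *fattr;
	int error = -ENOMEM;

	fattr = nfs_alloc_fattr();	/* allocates and runs nfs_fattr_init() */
	if (fattr == NULL)
		goto out;
	error = nfs_probe_fsinfo(server, mntfh, fattr);
	/* ... */
out:
	nfs_free_fattr(fattr);	/* kfree-backed, so passing NULL is harmless */

nfs4_server_common_setup() above is the same idea applied once, replacing two near-identical copies of the mount-time probing sequence in nfs4_create_server() and nfs4_create_referral_server().
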
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ea61d26e7871..b9c3c43cea1d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -213,7 +213,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
213 struct nfs_delegation *freeme = NULL; 213 struct nfs_delegation *freeme = NULL;
214 int status = 0; 214 int status = 0;
215 215
216 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); 216 delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
217 if (delegation == NULL) 217 if (delegation == NULL)
218 return -ENOMEM; 218 return -ENOMEM;
219 memcpy(delegation->stateid.data, res->delegation.data, 219 memcpy(delegation->stateid.data, res->delegation.data,
@@ -268,14 +268,6 @@ out:
268 return status; 268 return status;
269} 269}
270 270
271/* Sync all data to disk upon delegation return */
272static void nfs_msync_inode(struct inode *inode)
273{
274 filemap_fdatawrite(inode->i_mapping);
275 nfs_wb_all(inode);
276 filemap_fdatawait(inode->i_mapping);
277}
278
279/* 271/*
280 * Basic procedure for returning a delegation to the server 272 * Basic procedure for returning a delegation to the server
281 */ 273 */
@@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode)
367 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); 359 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
368 spin_unlock(&clp->cl_lock); 360 spin_unlock(&clp->cl_lock);
369 if (delegation != NULL) { 361 if (delegation != NULL) {
370 nfs_msync_inode(inode); 362 nfs_wb_all(inode);
371 err = __nfs_inode_return_delegation(inode, delegation, 1); 363 err = __nfs_inode_return_delegation(inode, delegation, 1);
372 } 364 }
373 } 365 }
@@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
471/* 463/*
472 * Asynchronous delegation recall! 464 * Asynchronous delegation recall!
473 */ 465 */
474int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, 466int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
475 int (*validate_stateid)(struct nfs_delegation *delegation,
476 const nfs4_stateid *stateid))
477{ 467{
478 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 468 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
479 struct nfs_delegation *delegation; 469 struct nfs_delegation *delegation;
@@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
481 rcu_read_lock(); 471 rcu_read_lock();
482 delegation = rcu_dereference(NFS_I(inode)->delegation); 472 delegation = rcu_dereference(NFS_I(inode)->delegation);
483 473
484 if (!validate_stateid(delegation, stateid)) { 474 if (!clp->cl_mvops->validate_stateid(delegation, stateid)) {
485 rcu_read_unlock(); 475 rcu_read_unlock();
486 return -ENOENT; 476 return -ENOENT;
487 } 477 }
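
Switching the delegation allocation from GFP_KERNEL to GFP_NOFS is about reclaim recursion: this code can run in paths that memory reclaim itself may enter, and GFP_NOFS tells the allocator it may sleep but must not recurse back into filesystem code to free memory. The idiom is just the flag choice:

	/* reachable from reclaim/writeback: no __GFP_FS work allowed */
	delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
	if (delegation == NULL)
		return -ENOMEM;

The companion cleanup replaces the three-step nfs_msync_inode() with a single nfs_wb_all() call, which already writes back and waits on the inode's dirty pages.
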
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 69e7b8140122..2026304bda19 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,9 +34,7 @@ enum {
34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
36int nfs_inode_return_delegation(struct inode *inode); 36int nfs_inode_return_delegation(struct inode *inode);
37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, 37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
38 int (*validate_stateid)(struct nfs_delegation *delegation,
39 const nfs4_stateid *stateid));
40void nfs_inode_return_delegation_noreclaim(struct inode *inode); 38void nfs_inode_return_delegation_noreclaim(struct inode *inode);
41 39
42struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 40struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a7bb5c694aa3..e257172d438c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
54static int nfs_rename(struct inode *, struct dentry *, 54static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 55 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, struct dentry *, int); 56static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 57static loff_t nfs_llseek_dir(struct file *, loff_t, int);
58 58
59const struct file_operations nfs_dir_operations = { 59const struct file_operations nfs_dir_operations = {
@@ -140,6 +140,13 @@ nfs_opendir(struct inode *inode, struct file *filp)
140 140
141 /* Call generic open code in order to cache credentials */ 141 /* Call generic open code in order to cache credentials */
142 res = nfs_open(inode, filp); 142 res = nfs_open(inode, filp);
143 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
144 /* This is a mountpoint, so d_revalidate will never
145 * have been called, so we need to refresh the
146 * inode (for close-open consistency) ourselves.
147 */
148 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
149 }
143 return res; 150 return res;
144} 151}
145 152
@@ -530,9 +537,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
530 nfs_readdir_descriptor_t my_desc, 537 nfs_readdir_descriptor_t my_desc,
531 *desc = &my_desc; 538 *desc = &my_desc;
532 struct nfs_entry my_entry; 539 struct nfs_entry my_entry;
533 struct nfs_fh fh; 540 int res = -ENOMEM;
534 struct nfs_fattr fattr;
535 long res;
536 541
537 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 542 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
538 dentry->d_parent->d_name.name, dentry->d_name.name, 543 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -554,9 +559,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
554 559
555 my_entry.cookie = my_entry.prev_cookie = 0; 560 my_entry.cookie = my_entry.prev_cookie = 0;
556 my_entry.eof = 0; 561 my_entry.eof = 0;
557 my_entry.fh = &fh; 562 my_entry.fh = nfs_alloc_fhandle();
558 my_entry.fattr = &fattr; 563 my_entry.fattr = nfs_alloc_fattr();
559 nfs_fattr_init(&fattr); 564 if (my_entry.fh == NULL || my_entry.fattr == NULL)
565 goto out_alloc_failed;
566
560 desc->entry = &my_entry; 567 desc->entry = &my_entry;
561 568
562 nfs_block_sillyrename(dentry); 569 nfs_block_sillyrename(dentry);
@@ -598,7 +605,10 @@ out:
598 nfs_unblock_sillyrename(dentry); 605 nfs_unblock_sillyrename(dentry);
599 if (res > 0) 606 if (res > 0)
600 res = 0; 607 res = 0;
601 dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", 608out_alloc_failed:
609 nfs_free_fattr(my_entry.fattr);
610 nfs_free_fhandle(my_entry.fh);
611 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
602 dentry->d_parent->d_name.name, dentry->d_name.name, 612 dentry->d_parent->d_name.name, dentry->d_name.name,
603 res); 613 res);
604 return res; 614 return res;
@@ -638,8 +648,10 @@ out:
638 * All directory operations under NFS are synchronous, so fsync() 648 * All directory operations under NFS are synchronous, so fsync()
639 * is a dummy operation. 649 * is a dummy operation.
640 */ 650 */
641static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 651static int nfs_fsync_dir(struct file *filp, int datasync)
642{ 652{
653 struct dentry *dentry = filp->f_path.dentry;
654
643 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 655 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
644 dentry->d_parent->d_name.name, dentry->d_name.name, 656 dentry->d_parent->d_name.name, dentry->d_name.name,
645 datasync); 657 datasync);
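
The ->fsync prototype loses its dentry argument in 2.6.36; implementations that still want the dentry recover it from the file, exactly as nfs_fsync_dir() does above. A skeleton of the new shape, as a sketch:

static int example_fsync(struct file *filp, int datasync)
{
	struct dentry *dentry = filp->f_path.dentry;

	/* datasync still distinguishes data-only from data+metadata
	 * sync; the dentry is only needed here for logging. */
	return 0;
}
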
@@ -776,9 +788,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
776 struct inode *dir; 788 struct inode *dir;
777 struct inode *inode; 789 struct inode *inode;
778 struct dentry *parent; 790 struct dentry *parent;
791 struct nfs_fh *fhandle = NULL;
792 struct nfs_fattr *fattr = NULL;
779 int error; 793 int error;
780 struct nfs_fh fhandle;
781 struct nfs_fattr fattr;
782 794
783 parent = dget_parent(dentry); 795 parent = dget_parent(dentry);
784 dir = parent->d_inode; 796 dir = parent->d_inode;
@@ -811,14 +823,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
811 if (NFS_STALE(inode)) 823 if (NFS_STALE(inode))
812 goto out_bad; 824 goto out_bad;
813 825
814 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 826 error = -ENOMEM;
827 fhandle = nfs_alloc_fhandle();
828 fattr = nfs_alloc_fattr();
829 if (fhandle == NULL || fattr == NULL)
830 goto out_error;
831
832 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
815 if (error) 833 if (error)
816 goto out_bad; 834 goto out_bad;
817 if (nfs_compare_fh(NFS_FH(inode), &fhandle)) 835 if (nfs_compare_fh(NFS_FH(inode), fhandle))
818 goto out_bad; 836 goto out_bad;
819 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 837 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
820 goto out_bad; 838 goto out_bad;
821 839
840 nfs_free_fattr(fattr);
841 nfs_free_fhandle(fhandle);
822out_set_verifier: 842out_set_verifier:
823 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 843 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
824 out_valid: 844 out_valid:
@@ -842,11 +862,21 @@ out_zap_parent:
842 shrink_dcache_parent(dentry); 862 shrink_dcache_parent(dentry);
843 } 863 }
844 d_drop(dentry); 864 d_drop(dentry);
865 nfs_free_fattr(fattr);
866 nfs_free_fhandle(fhandle);
845 dput(parent); 867 dput(parent);
846 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 868 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
847 __func__, dentry->d_parent->d_name.name, 869 __func__, dentry->d_parent->d_name.name,
848 dentry->d_name.name); 870 dentry->d_name.name);
849 return 0; 871 return 0;
872out_error:
873 nfs_free_fattr(fattr);
874 nfs_free_fhandle(fhandle);
875 dput(parent);
876 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
877 __func__, dentry->d_parent->d_name.name,
878 dentry->d_name.name, error);
879 return error;
850} 880}
851 881
852/* 882/*
@@ -911,9 +941,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
911 struct dentry *res; 941 struct dentry *res;
912 struct dentry *parent; 942 struct dentry *parent;
913 struct inode *inode = NULL; 943 struct inode *inode = NULL;
944 struct nfs_fh *fhandle = NULL;
945 struct nfs_fattr *fattr = NULL;
914 int error; 946 int error;
915 struct nfs_fh fhandle;
916 struct nfs_fattr fattr;
917 947
918 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 948 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
919 dentry->d_parent->d_name.name, dentry->d_name.name); 949 dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -923,7 +953,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
923 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 953 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
924 goto out; 954 goto out;
925 955
926 res = ERR_PTR(-ENOMEM);
927 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 956 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
928 957
929 /* 958 /*
@@ -936,17 +965,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
936 goto out; 965 goto out;
937 } 966 }
938 967
968 res = ERR_PTR(-ENOMEM);
969 fhandle = nfs_alloc_fhandle();
970 fattr = nfs_alloc_fattr();
971 if (fhandle == NULL || fattr == NULL)
972 goto out;
973
939 parent = dentry->d_parent; 974 parent = dentry->d_parent;
940 /* Protect against concurrent sillydeletes */ 975 /* Protect against concurrent sillydeletes */
941 nfs_block_sillyrename(parent); 976 nfs_block_sillyrename(parent);
942 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 977 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
943 if (error == -ENOENT) 978 if (error == -ENOENT)
944 goto no_entry; 979 goto no_entry;
945 if (error < 0) { 980 if (error < 0) {
946 res = ERR_PTR(error); 981 res = ERR_PTR(error);
947 goto out_unblock_sillyrename; 982 goto out_unblock_sillyrename;
948 } 983 }
949 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 984 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
950 res = (struct dentry *)inode; 985 res = (struct dentry *)inode;
951 if (IS_ERR(res)) 986 if (IS_ERR(res))
952 goto out_unblock_sillyrename; 987 goto out_unblock_sillyrename;
@@ -962,6 +997,8 @@ no_entry:
962out_unblock_sillyrename: 997out_unblock_sillyrename:
963 nfs_unblock_sillyrename(parent); 998 nfs_unblock_sillyrename(parent);
964out: 999out:
1000 nfs_free_fattr(fattr);
1001 nfs_free_fhandle(fhandle);
965 return res; 1002 return res;
966} 1003}
967 1004
@@ -1073,7 +1110,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1073 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1110 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1074 goto no_open_dput; 1111 goto no_open_dput;
1075 /* We can't create new files, or truncate existing ones here */ 1112 /* We can't create new files, or truncate existing ones here */
1076 openflags &= ~(O_CREAT|O_TRUNC); 1113 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1077 1114
1078 /* 1115 /*
1079 * Note: we're not holding inode->i_mutex and so may be racing with 1116 * Note: we're not holding inode->i_mutex and so may be racing with
@@ -1622,16 +1659,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1622 } 1659 }
1623 } 1660 }
1624 1661
1625 /*
1626 * ... prune child dentries and writebacks if needed.
1627 */
1628 if (atomic_read(&old_dentry->d_count) > 1) {
1629 if (S_ISREG(old_inode->i_mode))
1630 nfs_wb_all(old_inode);
1631 shrink_dcache_parent(old_dentry);
1632 }
1633 nfs_inode_return_delegation(old_inode); 1662 nfs_inode_return_delegation(old_inode);
1634
1635 if (new_inode != NULL) 1663 if (new_inode != NULL)
1636 nfs_inode_return_delegation(new_inode); 1664 nfs_inode_return_delegation(new_inode);
1637 1665
@@ -1669,28 +1697,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
1669 smp_mb__after_atomic_dec(); 1697 smp_mb__after_atomic_dec();
1670} 1698}
1671 1699
1672int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) 1700static void nfs_access_free_list(struct list_head *head)
1701{
1702 struct nfs_access_entry *cache;
1703
1704 while (!list_empty(head)) {
1705 cache = list_entry(head->next, struct nfs_access_entry, lru);
1706 list_del(&cache->lru);
1707 nfs_access_free_entry(cache);
1708 }
1709}
1710
1711int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
1673{ 1712{
1674 LIST_HEAD(head); 1713 LIST_HEAD(head);
1675 struct nfs_inode *nfsi; 1714 struct nfs_inode *nfsi;
1676 struct nfs_access_entry *cache; 1715 struct nfs_access_entry *cache;
1677 1716
1678restart: 1717 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1718 return (nr_to_scan == 0) ? 0 : -1;
1719
1679 spin_lock(&nfs_access_lru_lock); 1720 spin_lock(&nfs_access_lru_lock);
1680 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1721 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1681 struct rw_semaphore *s_umount;
1682 struct inode *inode; 1722 struct inode *inode;
1683 1723
1684 if (nr_to_scan-- == 0) 1724 if (nr_to_scan-- == 0)
1685 break; 1725 break;
1686 s_umount = &nfsi->vfs_inode.i_sb->s_umount; 1726 inode = &nfsi->vfs_inode;
1687 if (!down_read_trylock(s_umount))
1688 continue;
1689 inode = igrab(&nfsi->vfs_inode);
1690 if (inode == NULL) {
1691 up_read(s_umount);
1692 continue;
1693 }
1694 spin_lock(&inode->i_lock); 1727 spin_lock(&inode->i_lock);
1695 if (list_empty(&nfsi->access_cache_entry_lru)) 1728 if (list_empty(&nfsi->access_cache_entry_lru))
1696 goto remove_lru_entry; 1729 goto remove_lru_entry;
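
Two things change in the shrinker: the callback gains a struct shrinker * argument, and the rewritten function answers constrained callers up front, returning -1 ("cannot shrink now") when the caller's gfp_mask does not allow full GFP_KERNEL work, unless this was only a count query (nr_to_scan == 0), for which it reports 0. It also drops the igrab/s_umount dance in favour of moving entries onto a private list that is freed after the locks are released. A hedged registration sketch; the example names and count helper are hypothetical:

static int example_shrink(struct shrinker *shrink, int nr_to_scan,
			  gfp_t gfp_mask)
{
	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
		return (nr_to_scan == 0) ? 0 : -1;
	/* ... detach up to nr_to_scan entries under the lock, free
	 * them after unlocking ... */
	return (example_object_count() / 100) * sysctl_vfs_cache_pressure;
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};
/* register_shrinker(&example_shrinker) at init time,
 * unregister_shrinker(&example_shrinker) at teardown. */
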
@@ -1704,61 +1737,48 @@ restart:
1704 else { 1737 else {
1705remove_lru_entry: 1738remove_lru_entry:
1706 list_del_init(&nfsi->access_cache_inode_lru); 1739 list_del_init(&nfsi->access_cache_inode_lru);
1740 smp_mb__before_clear_bit();
1707 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit();
1708 } 1743 }
1709 spin_unlock(&inode->i_lock); 1744 spin_unlock(&inode->i_lock);
1710 spin_unlock(&nfs_access_lru_lock);
1711 iput(inode);
1712 up_read(s_umount);
1713 goto restart;
1714 } 1745 }
1715 spin_unlock(&nfs_access_lru_lock); 1746 spin_unlock(&nfs_access_lru_lock);
1716 while (!list_empty(&head)) { 1747 nfs_access_free_list(&head);
1717 cache = list_entry(head.next, struct nfs_access_entry, lru);
1718 list_del(&cache->lru);
1719 nfs_access_free_entry(cache);
1720 }
1721 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 1748 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1722} 1749}
1723 1750
1724static void __nfs_access_zap_cache(struct inode *inode) 1751static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
1725{ 1752{
1726 struct nfs_inode *nfsi = NFS_I(inode);
1727 struct rb_root *root_node = &nfsi->access_cache; 1753 struct rb_root *root_node = &nfsi->access_cache;
1728 struct rb_node *n, *dispose = NULL; 1754 struct rb_node *n;
1729 struct nfs_access_entry *entry; 1755 struct nfs_access_entry *entry;
1730 1756
1731 /* Unhook entries from the cache */ 1757 /* Unhook entries from the cache */
1732 while ((n = rb_first(root_node)) != NULL) { 1758 while ((n = rb_first(root_node)) != NULL) {
1733 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1759 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1734 rb_erase(n, root_node); 1760 rb_erase(n, root_node);
1735 list_del(&entry->lru); 1761 list_move(&entry->lru, head);
1736 n->rb_left = dispose;
1737 dispose = n;
1738 } 1762 }
1739 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1763 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1740 spin_unlock(&inode->i_lock);
1741
1742 /* Now kill them all! */
1743 while (dispose != NULL) {
1744 n = dispose;
1745 dispose = n->rb_left;
1746 nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
1747 }
1748} 1764}
1749 1765
1750void nfs_access_zap_cache(struct inode *inode) 1766void nfs_access_zap_cache(struct inode *inode)
1751{ 1767{
1768 LIST_HEAD(head);
1769
1770 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
1771 return;
1752 /* Remove from global LRU init */ 1772 /* Remove from global LRU init */
1753 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1773 spin_lock(&nfs_access_lru_lock);
1754 spin_lock(&nfs_access_lru_lock); 1774 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1755 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1775 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1756 spin_unlock(&nfs_access_lru_lock);
1757 }
1758 1776
1759 spin_lock(&inode->i_lock); 1777 spin_lock(&inode->i_lock);
1760 /* This will release the spinlock */ 1778 __nfs_access_zap_cache(NFS_I(inode), &head);
1761 __nfs_access_zap_cache(inode); 1779 spin_unlock(&inode->i_lock);
1780 spin_unlock(&nfs_access_lru_lock);
1781 nfs_access_free_list(&head);
1762} 1782}
1763 1783
1764static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) 1784static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
@@ -1809,8 +1829,8 @@ out_stale:
1809 nfs_access_free_entry(cache); 1829 nfs_access_free_entry(cache);
1810 return -ENOENT; 1830 return -ENOENT;
1811out_zap: 1831out_zap:
1812 /* This will release the spinlock */ 1832 spin_unlock(&inode->i_lock);
1813 __nfs_access_zap_cache(inode); 1833 nfs_access_zap_cache(inode);
1814 return -ENOENT; 1834 return -ENOENT;
1815} 1835}
1816 1836
@@ -1865,9 +1885,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
1865 smp_mb__after_atomic_inc(); 1885 smp_mb__after_atomic_inc();
1866 1886
1867 /* Add inode to global LRU list */ 1887 /* Add inode to global LRU list */
1868 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1888 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
1869 spin_lock(&nfs_access_lru_lock); 1889 spin_lock(&nfs_access_lru_lock);
1870 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); 1890 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1891 list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
1892 &nfs_access_lru_list);
1871 spin_unlock(&nfs_access_lru_lock); 1893 spin_unlock(&nfs_access_lru_lock);
1872 } 1894 }
1873} 1895}
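
A note on the hunk above: the unlocked test_bit() in nfs_access_add_cache() is only a fast path; the authoritative test_and_set_bit() now happens under nfs_access_lru_lock, so the flag and the LRU insertion can no longer be observed out of step. A minimal sketch of the check-lock-recheck idiom (names hypothetical, not the NFS code itself):

#include <linux/bitops.h>
#include <linux/list.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lru_lock);
static LIST_HEAD(my_lru_list);

static void my_lru_insert(unsigned long *flags, struct list_head *entry)
{
        /* cheap unlocked probe: most callers see the bit already set */
        if (test_bit(0, flags))
                return;
        spin_lock(&my_lru_lock);
        /* recheck under the lock; exactly one caller wins the insert */
        if (!test_and_set_bit(0, flags))
                list_add_tail(entry, &my_lru_list);
        spin_unlock(&my_lru_lock);
}
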
@@ -1929,7 +1951,7 @@ int nfs_permission(struct inode *inode, int mask)
1929 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 1951 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
1930 goto out; 1952 goto out;
1931 /* Is this sys_access() ? */ 1953 /* Is this sys_access() ? */
1932 if (mask & MAY_ACCESS) 1954 if (mask & (MAY_ACCESS | MAY_CHDIR))
1933 goto force_lookup; 1955 goto force_lookup;
1934 1956
1935 switch (inode->i_mode & S_IFMT) { 1957 switch (inode->i_mode & S_IFMT) {
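
The nfs_access_cache_shrinker() hunks above track the 2.6.36 shrinker API: the callback gains a struct shrinker * argument, refuses to run in atomic reclaim contexts, and reports its cache size scaled by sysctl_vfs_cache_pressure. A sketch of the shape such a callback takes (hypothetical names, assuming the 2.6.36 struct shrinker):

#include <linux/dcache.h>       /* sysctl_vfs_cache_pressure */
#include <linux/mm.h>           /* struct shrinker, register_shrinker() */

static atomic_long_t my_nr_entries;     /* objects currently cached */

static int my_cache_shrink(struct shrinker *shrink, int nr_to_scan,
                           gfp_t gfp_mask)
{
        /* only reclaim when the caller tolerates GFP_KERNEL-style I/O */
        if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                return (nr_to_scan == 0) ? 0 : -1;
        /* ... drop up to nr_to_scan cache entries here ... */
        return (atomic_long_read(&my_nr_entries) / 100) *
                sysctl_vfs_cache_pressure;
}

static struct shrinker my_shrinker = {
        .shrink = my_cache_shrink,
        .seeks  = DEFAULT_SEEKS,
};
/* paired with register_shrinker(&my_shrinker) / unregister_shrinker() */
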
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ad4cd31d6050..064a80961677 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,6 +69,7 @@ struct nfs_direct_req {
69 69
70 /* I/O parameters */ 70 /* I/O parameters */
71 struct nfs_open_context *ctx; /* file open context info */ 71 struct nfs_open_context *ctx; /* file open context info */
72 struct nfs_lock_context *l_ctx; /* Lock context info */
72 struct kiocb * iocb; /* controlling i/o request */ 73 struct kiocb * iocb; /* controlling i/o request */
73 struct inode * inode; /* target file of i/o */ 74 struct inode * inode; /* target file of i/o */
74 75
@@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
160 INIT_LIST_HEAD(&dreq->rewrite_list); 161 INIT_LIST_HEAD(&dreq->rewrite_list);
161 dreq->iocb = NULL; 162 dreq->iocb = NULL;
162 dreq->ctx = NULL; 163 dreq->ctx = NULL;
164 dreq->l_ctx = NULL;
163 spin_lock_init(&dreq->lock); 165 spin_lock_init(&dreq->lock);
164 atomic_set(&dreq->io_count, 0); 166 atomic_set(&dreq->io_count, 0);
165 dreq->count = 0; 167 dreq->count = 0;
@@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref)
173{ 175{
174 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); 176 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
175 177
178 if (dreq->l_ctx != NULL)
179 nfs_put_lock_context(dreq->l_ctx);
176 if (dreq->ctx != NULL) 180 if (dreq->ctx != NULL)
177 put_nfs_open_context(dreq->ctx); 181 put_nfs_open_context(dreq->ctx);
178 kmem_cache_free(nfs_direct_cachep, dreq); 182 kmem_cache_free(nfs_direct_cachep, dreq);
@@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
336 data->cred = msg.rpc_cred; 340 data->cred = msg.rpc_cred;
337 data->args.fh = NFS_FH(inode); 341 data->args.fh = NFS_FH(inode);
338 data->args.context = ctx; 342 data->args.context = ctx;
343 data->args.lock_context = dreq->l_ctx;
339 data->args.offset = pos; 344 data->args.offset = pos;
340 data->args.pgbase = pgbase; 345 data->args.pgbase = pgbase;
341 data->args.pages = data->pagevec; 346 data->args.pages = data->pagevec;
@@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
416static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
417 unsigned long nr_segs, loff_t pos) 422 unsigned long nr_segs, loff_t pos)
418{ 423{
419 ssize_t result = 0; 424 ssize_t result = -ENOMEM;
420 struct inode *inode = iocb->ki_filp->f_mapping->host; 425 struct inode *inode = iocb->ki_filp->f_mapping->host;
421 struct nfs_direct_req *dreq; 426 struct nfs_direct_req *dreq;
422 427
423 dreq = nfs_direct_req_alloc(); 428 dreq = nfs_direct_req_alloc();
424 if (!dreq) 429 if (dreq == NULL)
425 return -ENOMEM; 430 goto out;
426 431
427 dreq->inode = inode; 432 dreq->inode = inode;
428 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 433 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
434 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
435 if (dreq->l_ctx == NULL)
436 goto out_release;
429 if (!is_sync_kiocb(iocb)) 437 if (!is_sync_kiocb(iocb))
430 dreq->iocb = iocb; 438 dreq->iocb = iocb;
431 439
432 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); 440 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
433 if (!result) 441 if (!result)
434 result = nfs_direct_wait(dreq); 442 result = nfs_direct_wait(dreq);
443out_release:
435 nfs_direct_req_release(dreq); 444 nfs_direct_req_release(dreq);
436 445out:
437 return result; 446 return result;
438} 447}
439 448
@@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
574 data->args.offset = 0; 583 data->args.offset = 0;
575 data->args.count = 0; 584 data->args.count = 0;
576 data->args.context = dreq->ctx; 585 data->args.context = dreq->ctx;
586 data->args.lock_context = dreq->l_ctx;
577 data->res.count = 0; 587 data->res.count = 0;
578 data->res.fattr = &data->fattr; 588 data->res.fattr = &data->fattr;
579 data->res.verf = &data->verf; 589 data->res.verf = &data->verf;
@@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
761 data->cred = msg.rpc_cred; 771 data->cred = msg.rpc_cred;
762 data->args.fh = NFS_FH(inode); 772 data->args.fh = NFS_FH(inode);
763 data->args.context = ctx; 773 data->args.context = ctx;
774 data->args.lock_context = dreq->l_ctx;
764 data->args.offset = pos; 775 data->args.offset = pos;
765 data->args.pgbase = pgbase; 776 data->args.pgbase = pgbase;
766 data->args.pages = data->pagevec; 777 data->args.pages = data->pagevec;
@@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
845 unsigned long nr_segs, loff_t pos, 856 unsigned long nr_segs, loff_t pos,
846 size_t count) 857 size_t count)
847{ 858{
848 ssize_t result = 0; 859 ssize_t result = -ENOMEM;
849 struct inode *inode = iocb->ki_filp->f_mapping->host; 860 struct inode *inode = iocb->ki_filp->f_mapping->host;
850 struct nfs_direct_req *dreq; 861 struct nfs_direct_req *dreq;
851 size_t wsize = NFS_SERVER(inode)->wsize; 862 size_t wsize = NFS_SERVER(inode)->wsize;
@@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
853 864
854 dreq = nfs_direct_req_alloc(); 865 dreq = nfs_direct_req_alloc();
855 if (!dreq) 866 if (!dreq)
856 return -ENOMEM; 867 goto out;
857 nfs_alloc_commit_data(dreq); 868 nfs_alloc_commit_data(dreq);
858 869
859 if (dreq->commit_data == NULL || count < wsize) 870 if (dreq->commit_data == NULL || count < wsize)
@@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
861 872
862 dreq->inode = inode; 873 dreq->inode = inode;
863 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 874 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
875 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
876 if (dreq->l_ctx == NULL)
877 goto out_release;
864 if (!is_sync_kiocb(iocb)) 878 if (!is_sync_kiocb(iocb))
865 dreq->iocb = iocb; 879 dreq->iocb = iocb;
866 880
867 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); 881 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
868 if (!result) 882 if (!result)
869 result = nfs_direct_wait(dreq); 883 result = nfs_direct_wait(dreq);
884out_release:
870 nfs_direct_req_release(dreq); 885 nfs_direct_req_release(dreq);
871 886out:
872 return result; 887 return result;
873} 888}
874 889
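
Condensed from the direct.c hunks above, the lifecycle of the new lock context: each nfs_direct_req pins one context for its whole lifetime, stamps it on every READ/WRITE/COMMIT argument block, and drops it from the kref release path. Sketch (fragments taken from the hunks, not a complete function):

dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
if (dreq->l_ctx == NULL)                /* allocation failed */
        goto out_release;               /* ends in -ENOMEM */

data->args.lock_context = dreq->l_ctx;  /* alongside data->args.context */

/* in nfs_direct_req_free(): */
if (dreq->l_ctx != NULL)
        nfs_put_lock_context(dreq->l_ctx);
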
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 76fd235d0024..dba50a5625db 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -6,6 +6,29 @@
6 * Resolves DNS hostnames into valid ip addresses 6 * Resolves DNS hostnames into valid ip addresses
7 */ 7 */
8 8
9#ifdef CONFIG_NFS_USE_KERNEL_DNS
10
11#include <linux/sunrpc/clnt.h>
12#include <linux/dns_resolver.h>
13
14ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
15 struct sockaddr *sa, size_t salen)
16{
17 ssize_t ret;
18 char *ip_addr = NULL;
19 int ip_len;
20
21 ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
22 if (ip_len > 0)
23 ret = rpc_pton(ip_addr, ip_len, sa, salen);
24 else
25 ret = -ESRCH;
26 kfree(ip_addr);
27 return ret;
28}
29
30#else
31
9#include <linux/hash.h> 32#include <linux/hash.h>
10#include <linux/string.h> 33#include <linux/string.h>
11#include <linux/kmod.h> 34#include <linux/kmod.h>
@@ -346,3 +369,4 @@ void nfs_dns_resolver_destroy(void)
346 nfs_cache_unregister(&nfs_dns_resolve); 369 nfs_cache_unregister(&nfs_dns_resolve);
347} 370}
348 371
372#endif
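
With CONFIG_NFS_USE_KERNEL_DNS the legacy upcall cache below this #ifdef is compiled out entirely and resolution goes through the generic dns_query()/rpc_pton() pair shown above. A hypothetical caller is backend-agnostic either way (fragment; the API takes a mutable char *):

char hostname[] = "server.example.com";
struct sockaddr_storage ss;
ssize_t salen;

salen = nfs_dns_resolve_name(hostname, strlen(hostname),
                             (struct sockaddr *)&ss, sizeof(ss));
if (salen < 0)
        return salen;   /* e.g. -ESRCH from the kernel-DNS path above */
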
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index a3f0938babf7..199bb5543a91 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -6,8 +6,20 @@
6 6
7#define NFS_DNS_HOSTNAME_MAXLEN (128) 7#define NFS_DNS_HOSTNAME_MAXLEN (128)
8 8
9
10#ifdef CONFIG_NFS_USE_KERNEL_DNS
11static inline int nfs_dns_resolver_init(void)
12{
13 return 0;
14}
15
16static inline void nfs_dns_resolver_destroy(void)
17{}
18#else
9extern int nfs_dns_resolver_init(void); 19extern int nfs_dns_resolver_init(void);
10extern void nfs_dns_resolver_destroy(void); 20extern void nfs_dns_resolver_destroy(void);
21#endif
22
11extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 23extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
12 struct sockaddr *sa, size_t salen); 24 struct sockaddr *sa, size_t salen);
13 25
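
The header change above is the usual config-stub idiom: under CONFIG_NFS_USE_KERNEL_DNS the init/destroy hooks collapse to inline no-ops, so call sites stay #ifdef-free. A hypothetical module-init caller:

static int __init my_nfs_init(void)
{
        int ret = nfs_dns_resolver_init();      /* inline no-op returning 0
                                                   on kernel-DNS builds */
        if (ret)
                return ret;
        /* ... */
        return 0;
}
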
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8d965bddb87e..05bf3c0dc751 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -27,6 +27,7 @@
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h> 29#include <linux/gfp.h>
30#include <linux/swap.h>
30 31
31#include <asm/uaccess.h> 32#include <asm/uaccess.h>
32#include <asm/system.h> 33#include <asm/system.h>
@@ -53,7 +54,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 54static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
54 unsigned long nr_segs, loff_t pos); 55 unsigned long nr_segs, loff_t pos);
55static int nfs_file_flush(struct file *, fl_owner_t id); 56static int nfs_file_flush(struct file *, fl_owner_t id);
56static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); 57static int nfs_file_fsync(struct file *, int datasync);
57static int nfs_check_flags(int flags); 58static int nfs_check_flags(int flags);
58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 60static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -161,14 +162,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
161 struct nfs_server *server = NFS_SERVER(inode); 162 struct nfs_server *server = NFS_SERVER(inode);
162 struct nfs_inode *nfsi = NFS_I(inode); 163 struct nfs_inode *nfsi = NFS_I(inode);
163 164
164 if (server->flags & NFS_MOUNT_NOAC) 165 if (nfs_have_delegated_attributes(inode))
165 goto force_reval; 166 goto out_noreval;
167
166 if (filp->f_flags & O_DIRECT) 168 if (filp->f_flags & O_DIRECT)
167 goto force_reval; 169 goto force_reval;
168 if (nfsi->npages != 0) 170 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
169 return 0; 171 goto force_reval;
170 if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) 172 if (nfs_attribute_timeout(inode))
171 return 0; 173 goto force_reval;
174out_noreval:
175 return 0;
172force_reval: 176force_reval:
173 return __nfs_revalidate_inode(server, inode); 177 return __nfs_revalidate_inode(server, inode);
174} 178}
@@ -199,37 +203,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
199} 203}
200 204
201/* 205/*
202 * Helper for nfs_file_flush() and nfs_file_fsync()
203 *
204 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
205 * disk, but it retrieves and clears ctx->error after synching, despite
206 * the two being set at the same time in nfs_context_set_write_error().
207 * This is because the former is used to notify the _next_ call to
208 * nfs_file_write() that a write error occurred, and hence cause it to
209 * fall back to doing a synchronous write.
210 */
211static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
212{
213 int have_error, status;
214 int ret = 0;
215
216 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
217 status = nfs_wb_all(inode);
218 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
219 if (have_error)
220 ret = xchg(&ctx->error, 0);
221 if (!ret)
222 ret = status;
223 return ret;
224}
225
226/*
227 * Flush all dirty pages, and check for write errors. 206 * Flush all dirty pages, and check for write errors.
228 */ 207 */
229static int 208static int
230nfs_file_flush(struct file *file, fl_owner_t id) 209nfs_file_flush(struct file *file, fl_owner_t id)
231{ 210{
232 struct nfs_open_context *ctx = nfs_file_open_context(file);
233 struct dentry *dentry = file->f_path.dentry; 211 struct dentry *dentry = file->f_path.dentry;
234 struct inode *inode = dentry->d_inode; 212 struct inode *inode = dentry->d_inode;
235 213
@@ -242,7 +220,7 @@ nfs_file_flush(struct file *file, fl_owner_t id)
242 return 0; 220 return 0;
243 221
244 /* Flush writes to the server and return any errors */ 222 /* Flush writes to the server and return any errors */
245 return nfs_do_fsync(ctx, inode); 223 return vfs_fsync(file, 0);
246} 224}
247 225
248static ssize_t 226static ssize_t
@@ -317,19 +295,37 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
317 * Flush any dirty pages for this process, and check for write errors. 295 * Flush any dirty pages for this process, and check for write errors.
318 * The return status from this call provides a reliable indication of 296 * The return status from this call provides a reliable indication of
319 * whether any write errors occurred for this process. 297 * whether any write errors occurred for this process.
298 *
299 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
300 * disk, but it retrieves and clears ctx->error after synching, despite
301 * the two being set at the same time in nfs_context_set_write_error().
302 * This is because the former is used to notify the _next_ call to
303 * nfs_file_write() that a write error occurred, and hence cause it to
304 * fall back to doing a synchronous write.
320 */ 305 */
321static int 306static int
322nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 307nfs_file_fsync(struct file *file, int datasync)
323{ 308{
309 struct dentry *dentry = file->f_path.dentry;
324 struct nfs_open_context *ctx = nfs_file_open_context(file); 310 struct nfs_open_context *ctx = nfs_file_open_context(file);
325 struct inode *inode = dentry->d_inode; 311 struct inode *inode = dentry->d_inode;
312 int have_error, status;
313 int ret = 0;
314
326 315
327 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 316 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
328 dentry->d_parent->d_name.name, dentry->d_name.name, 317 dentry->d_parent->d_name.name, dentry->d_name.name,
329 datasync); 318 datasync);
330 319
331 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 320 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
332 return nfs_do_fsync(ctx, inode); 321 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
322 status = nfs_commit_inode(inode, FLUSH_SYNC);
323 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
324 if (have_error)
325 ret = xchg(&ctx->error, 0);
326 if (!ret && status < 0)
327 ret = status;
328 return ret;
333} 329}
334 330
335/* 331/*
@@ -489,11 +485,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
489 */ 485 */
490static int nfs_release_page(struct page *page, gfp_t gfp) 486static int nfs_release_page(struct page *page, gfp_t gfp)
491{ 487{
488 struct address_space *mapping = page->mapping;
489
492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 490 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
493 491
494 /* Only do I/O if gfp is a superset of GFP_KERNEL */ 492 /* Only do I/O if gfp is a superset of GFP_KERNEL */
495 if ((gfp & GFP_KERNEL) == GFP_KERNEL) 493 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) {
496 nfs_wb_page(page->mapping->host, page); 494 int how = FLUSH_SYNC;
495
496 /* Don't let kswapd deadlock waiting for OOM RPC calls */
497 if (current_is_kswapd())
498 how = 0;
499 nfs_commit_inode(mapping->host, how);
500 }
497 /* If PagePrivate() is set, then the page is not freeable */ 501 /* If PagePrivate() is set, then the page is not freeable */
498 if (PagePrivate(page)) 502 if (PagePrivate(page))
499 return 0; 503 return 0;
@@ -635,7 +639,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
635 639
636 /* Return error values for O_DSYNC and IS_SYNC() */ 640 /* Return error values for O_DSYNC and IS_SYNC() */
637 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 641 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
638 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 642 int err = vfs_fsync(iocb->ki_filp, 0);
639 if (err < 0) 643 if (err < 0)
640 result = err; 644 result = err;
641 } 645 }
@@ -671,7 +675,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
671 written = ret; 675 written = ret;
672 676
673 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 677 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
674 int err = nfs_do_fsync(nfs_file_open_context(filp), inode); 678 int err = vfs_fsync(filp, 0);
675 if (err < 0) 679 if (err < 0)
676 ret = err; 680 ret = err;
677 } 681 }
@@ -719,10 +723,6 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
719 default: 723 default:
720 BUG(); 724 BUG();
721 } 725 }
722 if (res < 0)
723 dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager"
724 " - error %d!\n",
725 __func__, res);
726 return res; 726 return res;
727} 727}
728 728
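
Two threads run through the file.c hunks above: the 2.6.36 ->fsync() prototype drops its dentry argument, and the nfs_do_fsync() helper disappears, with the flush/write paths calling vfs_fsync() and the error-latching logic moving into nfs_file_fsync() itself. The resulting call shape, reduced to a sketch:

static int nfs_file_fsync(struct file *file, int datasync);    /* new prototype */

/* nfs_file_flush(), nfs_file_write() and nfs_file_splice_write() now do: */
err = vfs_fsync(file, 0);       /* reaches nfs_file_fsync() via f_op->fsync */
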
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index a6b16ed93229..ce153a6b3aec 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -467,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
467 struct list_head *pages, 467 struct list_head *pages,
468 unsigned *nr_pages) 468 unsigned *nr_pages)
469{ 469{
470 int ret, npages = *nr_pages; 470 unsigned npages = *nr_pages;
471 int ret;
471 472
472 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", 473 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
473 NFS_I(inode)->fscache, npages, inode); 474 NFS_I(inode)->fscache, npages, inode);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b35d2a616066..a70e446e1605 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
78{ 78{
79 struct nfs_server *server = NFS_SB(sb); 79 struct nfs_server *server = NFS_SB(sb);
80 struct nfs_fsinfo fsinfo; 80 struct nfs_fsinfo fsinfo;
81 struct nfs_fattr fattr; 81 struct dentry *ret;
82 struct dentry *mntroot;
83 struct inode *inode; 82 struct inode *inode;
84 int error; 83 int error;
85 84
86 /* get the actual root for this mount */ 85 /* get the actual root for this mount */
87 fsinfo.fattr = &fattr; 86 fsinfo.fattr = nfs_alloc_fattr();
87 if (fsinfo.fattr == NULL)
88 return ERR_PTR(-ENOMEM);
88 89
89 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 90 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
90 if (error < 0) { 91 if (error < 0) {
91 dprintk("nfs_get_root: getattr error = %d\n", -error); 92 dprintk("nfs_get_root: getattr error = %d\n", -error);
92 return ERR_PTR(error); 93 ret = ERR_PTR(error);
94 goto out;
93 } 95 }
94 96
95 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 97 inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
96 if (IS_ERR(inode)) { 98 if (IS_ERR(inode)) {
97 dprintk("nfs_get_root: get root inode failed\n"); 99 dprintk("nfs_get_root: get root inode failed\n");
98 return ERR_CAST(inode); 100 ret = ERR_CAST(inode);
101 goto out;
99 } 102 }
100 103
101 error = nfs_superblock_set_dummy_root(sb, inode); 104 error = nfs_superblock_set_dummy_root(sb, inode);
102 if (error != 0) 105 if (error != 0) {
103 return ERR_PTR(error); 106 ret = ERR_PTR(error);
107 goto out;
108 }
104 109
105 /* root dentries normally start off anonymous and get spliced in later 110 /* root dentries normally start off anonymous and get spliced in later
106 * if the dentry tree reaches them; however if the dentry already 111 * if the dentry tree reaches them; however if the dentry already
107 * exists, we'll pick it up at this point and use it as the root 112 * exists, we'll pick it up at this point and use it as the root
108 */ 113 */
109 mntroot = d_obtain_alias(inode); 114 ret = d_obtain_alias(inode);
110 if (IS_ERR(mntroot)) { 115 if (IS_ERR(ret)) {
111 dprintk("nfs_get_root: get root dentry failed\n"); 116 dprintk("nfs_get_root: get root dentry failed\n");
112 return mntroot; 117 goto out;
113 } 118 }
114 119
115 security_d_instantiate(mntroot, inode); 120 security_d_instantiate(ret, inode);
116
117 if (!mntroot->d_op)
118 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
119 121
120 return mntroot; 122 if (ret->d_op == NULL)
123 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
124out:
125 nfs_free_fattr(fsinfo.fattr);
126 return ret;
121} 127}
122 128
123#ifdef CONFIG_NFS_V4 129#ifdef CONFIG_NFS_V4
124 130
125/* 131int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
126 * Do a simple pathwalk from the root FH of the server to the nominated target
127 * of the mountpoint
128 * - give error on symlinks
129 * - give error on ".." occurring in the path
130 * - follow traversals
131 */
132int nfs4_path_walk(struct nfs_server *server,
133 struct nfs_fh *mntfh,
134 const char *path)
135{ 132{
136 struct nfs_fsinfo fsinfo; 133 struct nfs_fsinfo fsinfo;
137 struct nfs_fattr fattr; 134 int ret = -ENOMEM;
138 struct nfs_fh lastfh;
139 struct qstr name;
140 int ret;
141 135
142 dprintk("--> nfs4_path_walk(,,%s)\n", path); 136 dprintk("--> nfs4_get_rootfh()\n");
143 137
144 fsinfo.fattr = &fattr; 138 fsinfo.fattr = nfs_alloc_fattr();
145 nfs_fattr_init(&fattr); 139 if (fsinfo.fattr == NULL)
146 140 goto out;
147 /* Eat leading slashes */
148 while (*path == '/')
149 path++;
150 141
151 /* Start by getting the root filehandle from the server */ 142 /* Start by getting the root filehandle from the server */
152 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 143 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
153 if (ret < 0) { 144 if (ret < 0) {
154 dprintk("nfs4_get_root: getroot error = %d\n", -ret); 145 dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
155 return ret; 146 goto out;
156 } 147 }
157 148
158 if (!S_ISDIR(fattr.mode)) { 149 if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE)
159 printk(KERN_ERR "nfs4_get_root:" 150 || !S_ISDIR(fsinfo.fattr->mode)) {
151 printk(KERN_ERR "nfs4_get_rootfh:"
160 " getroot encountered non-directory\n"); 152 " getroot encountered non-directory\n");
161 return -ENOTDIR; 153 ret = -ENOTDIR;
154 goto out;
162 } 155 }
163 156
164 /* FIXME: It is quite valid for the server to return a referral here */ 157 if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
165 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { 158 printk(KERN_ERR "nfs4_get_rootfh:"
166 printk(KERN_ERR "nfs4_get_root:"
167 " getroot obtained referral\n"); 159 " getroot obtained referral\n");
168 return -EREMOTE; 160 ret = -EREMOTE;
169 } 161 goto out;
170
171next_component:
172 dprintk("Next: %s\n", path);
173
174 /* extract the next bit of the path */
175 if (!*path)
176 goto path_walk_complete;
177
178 name.name = path;
179 while (*path && *path != '/')
180 path++;
181 name.len = path - (const char *) name.name;
182
183 if (name.len > NFS4_MAXNAMLEN)
184 return -ENAMETOOLONG;
185
186eat_dot_dir:
187 while (*path == '/')
188 path++;
189
190 if (path[0] == '.' && (path[1] == '/' || !path[1])) {
191 path += 2;
192 goto eat_dot_dir;
193 }
194
195 /* FIXME: Why shouldn't the user be able to use ".." in the path? */
196 if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
197 ) {
198 printk(KERN_ERR "nfs4_get_root:"
199 " Mount path contains reference to \"..\"\n");
200 return -EINVAL;
201 } 162 }
202 163
203 /* lookup the next FH in the sequence */ 164 memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
204 memcpy(&lastfh, mntfh, sizeof(lastfh)); 165out:
205 166 nfs_free_fattr(fsinfo.fattr);
206 dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); 167 dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
207 168 return ret;
208 ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
209 mntfh, &fattr);
210 if (ret < 0) {
211 dprintk("nfs4_get_root: getroot error = %d\n", -ret);
212 return ret;
213 }
214
215 if (!S_ISDIR(fattr.mode)) {
216 printk(KERN_ERR "nfs4_get_root:"
217 " lookupfh encountered non-directory\n");
218 return -ENOTDIR;
219 }
220
221 /* FIXME: Referrals are quite valid here too */
222 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
223 printk(KERN_ERR "nfs4_get_root:"
224 " lookupfh obtained referral\n");
225 return -EREMOTE;
226 }
227
228 goto next_component;
229
230path_walk_complete:
231 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
232 dprintk("<-- nfs4_path_walk() = 0\n");
233 return 0;
234} 169}
235 170
236/* 171/*
@@ -239,8 +174,8 @@ path_walk_complete:
239struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 174struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
240{ 175{
241 struct nfs_server *server = NFS_SB(sb); 176 struct nfs_server *server = NFS_SB(sb);
242 struct nfs_fattr fattr; 177 struct nfs_fattr *fattr = NULL;
243 struct dentry *mntroot; 178 struct dentry *ret;
244 struct inode *inode; 179 struct inode *inode;
245 int error; 180 int error;
246 181
@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
254 return ERR_PTR(error); 189 return ERR_PTR(error);
255 } 190 }
256 191
192 fattr = nfs_alloc_fattr();
193 if (fattr == NULL)
194 return ERR_PTR(-ENOMEM);
195
257 /* get the actual root for this mount */ 196 /* get the actual root for this mount */
258 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 197 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
259 if (error < 0) { 198 if (error < 0) {
260 dprintk("nfs_get_root: getattr error = %d\n", -error); 199 dprintk("nfs_get_root: getattr error = %d\n", -error);
261 return ERR_PTR(error); 200 ret = ERR_PTR(error);
201 goto out;
262 } 202 }
263 203
264 inode = nfs_fhget(sb, mntfh, &fattr); 204 inode = nfs_fhget(sb, mntfh, fattr);
265 if (IS_ERR(inode)) { 205 if (IS_ERR(inode)) {
266 dprintk("nfs_get_root: get root inode failed\n"); 206 dprintk("nfs_get_root: get root inode failed\n");
267 return ERR_CAST(inode); 207 ret = ERR_CAST(inode);
208 goto out;
268 } 209 }
269 210
270 error = nfs_superblock_set_dummy_root(sb, inode); 211 error = nfs_superblock_set_dummy_root(sb, inode);
271 if (error != 0) 212 if (error != 0) {
272 return ERR_PTR(error); 213 ret = ERR_PTR(error);
214 goto out;
215 }
273 216
274 /* root dentries normally start off anonymous and get spliced in later 217 /* root dentries normally start off anonymous and get spliced in later
275 * if the dentry tree reaches them; however if the dentry already 218 * if the dentry tree reaches them; however if the dentry already
276 * exists, we'll pick it up at this point and use it as the root 219 * exists, we'll pick it up at this point and use it as the root
277 */ 220 */
278 mntroot = d_obtain_alias(inode); 221 ret = d_obtain_alias(inode);
279 if (IS_ERR(mntroot)) { 222 if (IS_ERR(ret)) {
280 dprintk("nfs_get_root: get root dentry failed\n"); 223 dprintk("nfs_get_root: get root dentry failed\n");
281 return mntroot; 224 goto out;
282 } 225 }
283 226
284 security_d_instantiate(mntroot, inode); 227 security_d_instantiate(ret, inode);
285 228
286 if (!mntroot->d_op) 229 if (ret->d_op == NULL)
287 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; 230 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
288 231
232out:
233 nfs_free_fattr(fattr);
289 dprintk("<-- nfs4_get_root()\n"); 234 dprintk("<-- nfs4_get_root()\n");
290 return mntroot; 235 return ret;
291} 236}
292 237
293#endif /* CONFIG_NFS_V4 */ 238#endif /* CONFIG_NFS_V4 */
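
getroot.c is one instance of a conversion applied throughout this series: on-stack struct nfs_fattr instances become nfs_alloc_fattr()/nfs_free_fattr() pairs with goto-based cleanup, presumably to trim kernel stack usage in these call chains. The skeleton, with a hypothetical RPC in the middle:

struct nfs_fattr *fattr;
int err = -ENOMEM;

fattr = nfs_alloc_fattr();      /* kmalloc(GFP_NOFS) + nfs_fattr_init() */
if (fattr == NULL)
        goto out;
err = do_getattr_rpc(fattr);    /* hypothetical call filling the buffer */
out:
        nfs_free_fattr(fattr);  /* kfree(); safe on NULL */
        return err;
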
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 50a56edca0b5..7d2d6c72aa78 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -98,7 +98,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
98 return ino; 98 return ino;
99} 99}
100 100
101void nfs_clear_inode(struct inode *inode) 101static void nfs_clear_inode(struct inode *inode)
102{ 102{
103 /* 103 /*
104 * The following should never happen... 104 * The following should never happen...
@@ -110,6 +110,13 @@ void nfs_clear_inode(struct inode *inode)
110 nfs_fscache_release_inode_cookie(inode); 110 nfs_fscache_release_inode_cookie(inode);
111} 111}
112 112
113void nfs_evict_inode(struct inode *inode)
114{
115 truncate_inode_pages(&inode->i_data, 0);
116 end_writeback(inode);
117 nfs_clear_inode(inode);
118}
119
113/** 120/**
114 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk 121 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
115 */ 122 */
@@ -393,8 +400,8 @@ int
393nfs_setattr(struct dentry *dentry, struct iattr *attr) 400nfs_setattr(struct dentry *dentry, struct iattr *attr)
394{ 401{
395 struct inode *inode = dentry->d_inode; 402 struct inode *inode = dentry->d_inode;
396 struct nfs_fattr fattr; 403 struct nfs_fattr *fattr;
397 int error; 404 int error = -ENOMEM;
398 405
399 nfs_inc_stats(inode, NFSIOS_VFSSETATTR); 406 nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
400 407
@@ -413,18 +420,22 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
413 return 0; 420 return 0;
414 421
415 /* Write all dirty data */ 422 /* Write all dirty data */
416 if (S_ISREG(inode->i_mode)) { 423 if (S_ISREG(inode->i_mode))
417 filemap_write_and_wait(inode->i_mapping);
418 nfs_wb_all(inode); 424 nfs_wb_all(inode);
419 } 425
426 fattr = nfs_alloc_fattr();
427 if (fattr == NULL)
428 goto out;
420 /* 429 /*
421 * Return any delegations if we're going to change ACLs 430 * Return any delegations if we're going to change ACLs
422 */ 431 */
423 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) 432 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
424 nfs_inode_return_delegation(inode); 433 nfs_inode_return_delegation(inode);
425 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 434 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
426 if (error == 0) 435 if (error == 0)
427 nfs_refresh_inode(inode, &fattr); 436 nfs_refresh_inode(inode, fattr);
437 nfs_free_fattr(fattr);
438out:
428 return error; 439 return error;
429} 440}
430 441
@@ -524,6 +535,68 @@ out:
524 return err; 535 return err;
525} 536}
526 537
538static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
539{
540 atomic_set(&l_ctx->count, 1);
541 l_ctx->lockowner = current->files;
542 l_ctx->pid = current->tgid;
543 INIT_LIST_HEAD(&l_ctx->list);
544}
545
546static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
547{
548 struct nfs_lock_context *pos;
549
550 list_for_each_entry(pos, &ctx->lock_context.list, list) {
551 if (pos->lockowner != current->files)
552 continue;
553 if (pos->pid != current->tgid)
554 continue;
555 atomic_inc(&pos->count);
556 return pos;
557 }
558 return NULL;
559}
560
561struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
562{
563 struct nfs_lock_context *res, *new = NULL;
564 struct inode *inode = ctx->path.dentry->d_inode;
565
566 spin_lock(&inode->i_lock);
567 res = __nfs_find_lock_context(ctx);
568 if (res == NULL) {
569 spin_unlock(&inode->i_lock);
570 new = kmalloc(sizeof(*new), GFP_KERNEL);
571 if (new == NULL)
572 return NULL;
573 nfs_init_lock_context(new);
574 spin_lock(&inode->i_lock);
575 res = __nfs_find_lock_context(ctx);
576 if (res == NULL) {
577 list_add_tail(&new->list, &ctx->lock_context.list);
578 new->open_context = ctx;
579 res = new;
580 new = NULL;
581 }
582 }
583 spin_unlock(&inode->i_lock);
584 kfree(new);
585 return res;
586}
587
588void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
589{
590 struct nfs_open_context *ctx = l_ctx->open_context;
591 struct inode *inode = ctx->path.dentry->d_inode;
592
593 if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
594 return;
595 list_del(&l_ctx->list);
596 spin_unlock(&inode->i_lock);
597 kfree(l_ctx);
598}
599
527/** 600/**
528 * nfs_close_context - Common close_context() routine NFSv2/v3 601 * nfs_close_context - Common close_context() routine NFSv2/v3
529 * @ctx: pointer to context 602 * @ctx: pointer to context
@@ -560,11 +633,11 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
560 path_get(&ctx->path); 633 path_get(&ctx->path);
561 ctx->cred = get_rpccred(cred); 634 ctx->cred = get_rpccred(cred);
562 ctx->state = NULL; 635 ctx->state = NULL;
563 ctx->lockowner = current->files;
564 ctx->flags = 0; 636 ctx->flags = 0;
565 ctx->error = 0; 637 ctx->error = 0;
566 ctx->dir_cookie = 0; 638 ctx->dir_cookie = 0;
567 atomic_set(&ctx->count, 1); 639 nfs_init_lock_context(&ctx->lock_context);
640 ctx->lock_context.open_context = ctx;
568 } 641 }
569 return ctx; 642 return ctx;
570} 643}
@@ -572,7 +645,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
572struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) 645struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
573{ 646{
574 if (ctx != NULL) 647 if (ctx != NULL)
575 atomic_inc(&ctx->count); 648 atomic_inc(&ctx->lock_context.count);
576 return ctx; 649 return ctx;
577} 650}
578 651
@@ -580,7 +653,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
580{ 653{
581 struct inode *inode = ctx->path.dentry->d_inode; 654 struct inode *inode = ctx->path.dentry->d_inode;
582 655
583 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) 656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
584 return; 657 return;
585 list_del(&ctx->list); 658 list_del(&ctx->list);
586 spin_unlock(&inode->i_lock); 659 spin_unlock(&inode->i_lock);
@@ -682,7 +755,7 @@ int
682__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 755__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
683{ 756{
684 int status = -ESTALE; 757 int status = -ESTALE;
685 struct nfs_fattr fattr; 758 struct nfs_fattr *fattr = NULL;
686 struct nfs_inode *nfsi = NFS_I(inode); 759 struct nfs_inode *nfsi = NFS_I(inode);
687 760
688 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 761 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
@@ -693,8 +766,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
693 if (NFS_STALE(inode)) 766 if (NFS_STALE(inode))
694 goto out; 767 goto out;
695 768
769 status = -ENOMEM;
770 fattr = nfs_alloc_fattr();
771 if (fattr == NULL)
772 goto out;
773
696 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 774 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
697 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 775 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
698 if (status != 0) { 776 if (status != 0) {
699 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 777 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
700 inode->i_sb->s_id, 778 inode->i_sb->s_id,
@@ -707,7 +785,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
707 goto out; 785 goto out;
708 } 786 }
709 787
710 status = nfs_refresh_inode(inode, &fattr); 788 status = nfs_refresh_inode(inode, fattr);
711 if (status) { 789 if (status) {
712 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 790 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
713 inode->i_sb->s_id, 791 inode->i_sb->s_id,
@@ -723,6 +801,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
723 (long long)NFS_FILEID(inode)); 801 (long long)NFS_FILEID(inode));
724 802
725 out: 803 out:
804 nfs_free_fattr(fattr);
726 return status; 805 return status;
727} 806}
728 807
@@ -730,9 +809,14 @@ int nfs_attribute_timeout(struct inode *inode)
730{ 809{
731 struct nfs_inode *nfsi = NFS_I(inode); 810 struct nfs_inode *nfsi = NFS_I(inode);
732 811
812 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
813}
814
815static int nfs_attribute_cache_expired(struct inode *inode)
816{
733 if (nfs_have_delegated_attributes(inode)) 817 if (nfs_have_delegated_attributes(inode))
734 return 0; 818 return 0;
735 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 819 return nfs_attribute_timeout(inode);
736} 820}
737 821
738/** 822/**
@@ -745,7 +829,7 @@ int nfs_attribute_timeout(struct inode *inode)
745int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 829int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
746{ 830{
747 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 831 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
748 && !nfs_attribute_timeout(inode)) 832 && !nfs_attribute_cache_expired(inode))
749 return NFS_STALE(inode) ? -ESTALE : 0; 833 return NFS_STALE(inode) ? -ESTALE : 0;
750 return __nfs_revalidate_inode(server, inode); 834 return __nfs_revalidate_inode(server, inode);
751} 835}
@@ -782,7 +866,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
782 int ret = 0; 866 int ret = 0;
783 867
784 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 868 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
785 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { 869 || nfs_attribute_cache_expired(inode)
870 || NFS_STALE(inode)) {
786 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 871 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
787 if (ret < 0) 872 if (ret < 0)
788 goto out; 873 goto out;
@@ -916,6 +1001,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
916 fattr->gencount = nfs_inc_attr_generation_counter(); 1001 fattr->gencount = nfs_inc_attr_generation_counter();
917} 1002}
918 1003
1004struct nfs_fattr *nfs_alloc_fattr(void)
1005{
1006 struct nfs_fattr *fattr;
1007
1008 fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
1009 if (fattr != NULL)
1010 nfs_fattr_init(fattr);
1011 return fattr;
1012}
1013
1014struct nfs_fh *nfs_alloc_fhandle(void)
1015{
1016 struct nfs_fh *fh;
1017
1018 fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
1019 if (fh != NULL)
1020 fh->size = 0;
1021 return fh;
1022}
1023
919/** 1024/**
920 * nfs_inode_attrs_need_update - check if the inode attributes need updating 1025 * nfs_inode_attrs_need_update - check if the inode attributes need updating
921 * @inode - pointer to inode 1026 * @inode - pointer to inode
@@ -1300,8 +1405,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1300 * to open() calls that passed nfs_atomic_lookup, but failed to call 1405 * to open() calls that passed nfs_atomic_lookup, but failed to call
1301 * nfs_open(). 1406 * nfs_open().
1302 */ 1407 */
1303void nfs4_clear_inode(struct inode *inode) 1408void nfs4_evict_inode(struct inode *inode)
1304{ 1409{
1410 truncate_inode_pages(&inode->i_data, 0);
1411 end_writeback(inode);
1305 /* If we are holding a delegation, return it! */ 1412 /* If we are holding a delegation, return it! */
1306 nfs_inode_return_delegation_noreclaim(inode); 1413 nfs_inode_return_delegation_noreclaim(inode);
1307 /* First call standard NFS clear_inode() code */ 1414 /* First call standard NFS clear_inode() code */
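
nfs_evict_inode()/nfs4_evict_inode() above are the 2.6.36 ->evict_inode() conversion: the callback now performs its own page-cache truncation and writeback teardown (truncate_inode_pages() + end_writeback()) before the filesystem-specific cleanup that generic code used to drive through ->clear_inode(). The assumed wiring, from the matching super.c change elsewhere in this merge:

static const struct super_operations nfs_sops = {
        /* ... */
        .evict_inode    = nfs_evict_inode,  /* was .clear_inode = nfs_clear_inode */
};
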
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 11f82f03c5de..c961bc92c107 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,16 +205,17 @@ extern struct rpc_procinfo nfs4_procedures[];
205void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 205void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
206 206
207/* dir.c */ 207/* dir.c */
208extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); 208extern int nfs_access_cache_shrinker(struct shrinker *shrink,
209 int nr_to_scan, gfp_t gfp_mask);
209 210
210/* inode.c */ 211/* inode.c */
211extern struct workqueue_struct *nfsiod_workqueue; 212extern struct workqueue_struct *nfsiod_workqueue;
212extern struct inode *nfs_alloc_inode(struct super_block *sb); 213extern struct inode *nfs_alloc_inode(struct super_block *sb);
213extern void nfs_destroy_inode(struct inode *); 214extern void nfs_destroy_inode(struct inode *);
214extern int nfs_write_inode(struct inode *, struct writeback_control *); 215extern int nfs_write_inode(struct inode *, struct writeback_control *);
215extern void nfs_clear_inode(struct inode *); 216extern void nfs_evict_inode(struct inode *);
216#ifdef CONFIG_NFS_V4 217#ifdef CONFIG_NFS_V4
217extern void nfs4_clear_inode(struct inode *); 218extern void nfs4_evict_inode(struct inode *);
218#endif 219#endif
219void nfs_zap_acl_cache(struct inode *inode); 220void nfs_zap_acl_cache(struct inode *inode);
220extern int nfs_wait_bit_killable(void *word); 221extern int nfs_wait_bit_killable(void *word);
@@ -244,9 +245,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
244#ifdef CONFIG_NFS_V4 245#ifdef CONFIG_NFS_V4
245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 246extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
246 247
247extern int nfs4_path_walk(struct nfs_server *server, 248extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
248 struct nfs_fh *mntfh,
249 const char *path);
250#endif 249#endif
251 250
252/* read.c */ 251/* read.c */
@@ -371,10 +370,9 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
371 * Helper for restarting RPC calls in the possible presence of NFSv4.1 370 * Helper for restarting RPC calls in the possible presence of NFSv4.1
372 * sessions. 371 * sessions.
373 */ 372 */
374static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) 373static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
375{ 374{
376 if (nfs4_has_session(clp)) 375 if (nfs4_has_session(clp))
377 rpc_restart_call_prepare(task); 376 return rpc_restart_call_prepare(task);
378 else 377 return rpc_restart_call(task);
379 rpc_restart_call(task);
380} 378}
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 1d8d5c813b01..c5832487c456 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode,
36 36
37static inline void nfs_add_server_stats(const struct nfs_server *server, 37static inline void nfs_add_server_stats(const struct nfs_server *server,
38 enum nfs_stat_bytecounters stat, 38 enum nfs_stat_bytecounters stat,
39 unsigned long addend) 39 long addend)
40{ 40{
41 this_cpu_add(server->io_stats->bytes[stat], addend); 41 this_cpu_add(server->io_stats->bytes[stat], addend);
42} 42}
43 43
44static inline void nfs_add_stats(const struct inode *inode, 44static inline void nfs_add_stats(const struct inode *inode,
45 enum nfs_stat_bytecounters stat, 45 enum nfs_stat_bytecounters stat,
46 unsigned long addend) 46 long addend)
47{ 47{
48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
49} 49}
@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const struct inode *inode,
51#ifdef CONFIG_NFS_FSCACHE 51#ifdef CONFIG_NFS_FSCACHE
52static inline void nfs_add_fscache_stats(struct inode *inode, 52static inline void nfs_add_fscache_stats(struct inode *inode,
53 enum nfs_stat_fscachecounters stat, 53 enum nfs_stat_fscachecounters stat,
54 unsigned long addend) 54 long addend)
55{ 55{
56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); 56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
57} 57}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 7888cf36022d..db6aa3673cf3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -105,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
105 struct vfsmount *mnt; 105 struct vfsmount *mnt;
106 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 106 struct nfs_server *server = NFS_SERVER(dentry->d_inode);
107 struct dentry *parent; 107 struct dentry *parent;
108 struct nfs_fh fh; 108 struct nfs_fh *fh = NULL;
109 struct nfs_fattr fattr; 109 struct nfs_fattr *fattr = NULL;
110 int err; 110 int err;
111 111
112 dprintk("--> nfs_follow_mountpoint()\n"); 112 dprintk("--> nfs_follow_mountpoint()\n");
@@ -115,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
115 if (IS_ROOT(dentry)) 115 if (IS_ROOT(dentry))
116 goto out_err; 116 goto out_err;
117 117
118 err = -ENOMEM;
119 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL)
122 goto out_err;
123
118 dprintk("%s: enter\n", __func__); 124 dprintk("%s: enter\n", __func__);
119 dput(nd->path.dentry); 125 dput(nd->path.dentry);
120 nd->path.dentry = dget(dentry); 126 nd->path.dentry = dget(dentry);
@@ -123,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
123 parent = dget_parent(nd->path.dentry); 129 parent = dget_parent(nd->path.dentry);
124 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
125 &nd->path.dentry->d_name, 131 &nd->path.dentry->d_name,
126 &fh, &fattr); 132 fh, fattr);
127 dput(parent); 133 dput(parent);
128 if (err != 0) 134 if (err != 0)
129 goto out_err; 135 goto out_err;
130 136
131 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) 137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
132 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
133 else 139 else
134 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, 140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
135 &fattr); 141 fattr);
136 err = PTR_ERR(mnt); 142 err = PTR_ERR(mnt);
137 if (IS_ERR(mnt)) 143 if (IS_ERR(mnt))
138 goto out_err; 144 goto out_err;
@@ -151,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
151 nd->path.dentry = dget(mnt->mnt_root); 157 nd->path.dentry = dget(mnt->mnt_root);
152 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
153out: 159out:
160 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh);
154 dprintk("%s: done, returned %d\n", __func__, err); 162 dprintk("%s: done, returned %d\n", __func__, err);
155 163
156 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 81cf14257916..db8846a0e82e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs
233static int 233static int
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
235{ 235{
236 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 236 struct rpc_auth *auth = req->rq_cred->cr_auth;
237 unsigned int replen; 237 unsigned int replen;
238 u32 offset = (u32)args->offset; 238 u32 offset = (u32)args->offset;
239 u32 count = args->count; 239 u32 count = args->count;
@@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg
393static int 393static int
394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) 394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
395{ 395{
396 struct rpc_task *task = req->rq_task; 396 struct rpc_auth *auth = req->rq_cred->cr_auth;
397 struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth;
398 unsigned int replen; 397 unsigned int replen;
399 u32 count = args->count; 398 u32 count = args->count;
400 399
@@ -575,7 +574,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
575static int 574static int
576nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) 575nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
577{ 576{
578 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 577 struct rpc_auth *auth = req->rq_cred->cr_auth;
579 unsigned int replen; 578 unsigned int replen;
580 579
581 p = xdr_encode_fhandle(p, args->fh); 580 p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index d150ae0c5ecd..9f88c5f4c7e2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -185,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) 185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
186{ 186{
187 struct nfs_server *server = NFS_SERVER(inode); 187 struct nfs_server *server = NFS_SERVER(inode);
188 struct nfs_fattr fattr;
189 struct page *pages[NFSACL_MAXPAGES] = { }; 188 struct page *pages[NFSACL_MAXPAGES] = { };
190 struct nfs3_getaclargs args = { 189 struct nfs3_getaclargs args = {
191 .fh = NFS_FH(inode), 190 .fh = NFS_FH(inode),
@@ -193,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
193 .pages = pages, 192 .pages = pages,
194 }; 193 };
195 struct nfs3_getaclres res = { 194 struct nfs3_getaclres res = {
196 .fattr = &fattr, 195 0
197 }; 196 };
198 struct rpc_message msg = { 197 struct rpc_message msg = {
199 .rpc_argp = &args, 198 .rpc_argp = &args,
@@ -228,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
228 227
229 dprintk("NFS call getacl\n"); 228 dprintk("NFS call getacl\n");
230 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; 229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
231 nfs_fattr_init(&fattr); 230 res.fattr = nfs_alloc_fattr();
231 if (res.fattr == NULL)
232 return ERR_PTR(-ENOMEM);
233
232 status = rpc_call_sync(server->client_acl, &msg, 0); 234 status = rpc_call_sync(server->client_acl, &msg, 0);
233 dprintk("NFS reply getacl: %d\n", status); 235 dprintk("NFS reply getacl: %d\n", status);
234 236
@@ -238,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
238 240
239 switch (status) { 241 switch (status) {
240 case 0: 242 case 0:
241 status = nfs_refresh_inode(inode, &fattr); 243 status = nfs_refresh_inode(inode, res.fattr);
242 break; 244 break;
243 case -EPFNOSUPPORT: 245 case -EPFNOSUPPORT:
244 case -EPROTONOSUPPORT: 246 case -EPROTONOSUPPORT:
@@ -278,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
278getout: 280getout:
279 posix_acl_release(res.acl_access); 281 posix_acl_release(res.acl_access);
280 posix_acl_release(res.acl_default); 282 posix_acl_release(res.acl_default);
283 nfs_free_fattr(res.fattr);
281 284
282 if (status != 0) { 285 if (status != 0) {
283 posix_acl_release(acl); 286 posix_acl_release(acl);
@@ -290,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
290 struct posix_acl *dfacl) 293 struct posix_acl *dfacl)
291{ 294{
292 struct nfs_server *server = NFS_SERVER(inode); 295 struct nfs_server *server = NFS_SERVER(inode);
293 struct nfs_fattr fattr; 296 struct nfs_fattr *fattr;
294 struct page *pages[NFSACL_MAXPAGES]; 297 struct page *pages[NFSACL_MAXPAGES];
295 struct nfs3_setaclargs args = { 298 struct nfs3_setaclargs args = {
296 .inode = inode, 299 .inode = inode,
@@ -335,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
335 } 338 }
336 339
337 dprintk("NFS call setacl\n"); 340 dprintk("NFS call setacl\n");
341 status = -ENOMEM;
342 fattr = nfs_alloc_fattr();
343 if (fattr == NULL)
344 goto out_freepages;
345
338 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 346 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
339 nfs_fattr_init(&fattr); 347 msg.rpc_resp = fattr;
340 status = rpc_call_sync(server->client_acl, &msg, 0); 348 status = rpc_call_sync(server->client_acl, &msg, 0);
341 nfs_access_zap_cache(inode); 349 nfs_access_zap_cache(inode);
342 nfs_zap_acl_cache(inode); 350 nfs_zap_acl_cache(inode);
@@ -344,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
344 352
345 switch (status) { 353 switch (status) {
346 case 0: 354 case 0:
347 status = nfs_refresh_inode(inode, &fattr); 355 status = nfs_refresh_inode(inode, fattr);
348 nfs3_cache_acls(inode, acl, dfacl); 356 nfs3_cache_acls(inode, acl, dfacl);
349 break; 357 break;
350 case -EPFNOSUPPORT: 358 case -EPFNOSUPPORT:
@@ -355,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
355 case -ENOTSUPP: 363 case -ENOTSUPP:
356 status = -EOPNOTSUPP; 364 status = -EOPNOTSUPP;
357 } 365 }
366 nfs_free_fattr(fattr);
358out_freepages: 367out_freepages:
359 while (args.npages != 0) { 368 while (args.npages != 0) {
360 args.npages--; 369 args.npages--;
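
A note on the nfs3acl.c hunks above: getacl/setacl are converted from an on-stack struct nfs_fattr to one obtained from nfs_alloc_fattr() and released with nfs_free_fattr() on every exit path, presumably to shrink the kernel stack footprint of these RPC routines. A minimal user-space sketch of the same allocate/check/free discipline; the names below are illustrative stand-ins, not the kernel helpers:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct fattr { unsigned valid; long size; };

    /* Stand-in for nfs_alloc_fattr(): allocate and pre-initialize. */
    static struct fattr *fattr_alloc(void)
    {
        struct fattr *f = malloc(sizeof(*f));
        if (f != NULL)
            memset(f, 0, sizeof(*f));   /* kernel: nfs_fattr_init() */
        return f;
    }

    static void fattr_free(struct fattr *f)
    {
        free(f);                        /* stand-in for nfs_free_fattr() */
    }

    static int do_getacl(void)
    {
        struct fattr *f = fattr_alloc();
        int status;

        if (f == NULL)
            return -12;                 /* kernel returns -ENOMEM */
        status = 0;                     /* the RPC would run and fill *f */
        fattr_free(f);                  /* released on every exit path */
        return status;
    }

    int main(void)
    {
        printf("getacl: %d\n", do_getacl());
        return 0;
    }
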
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index e701002694e5..fabb4f2849a1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -144,14 +144,12 @@ static int
144nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct inode *dir, struct qstr *name,
145 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
146{ 146{
147 struct nfs_fattr dir_attr;
148 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
149 .fh = NFS_FH(dir), 148 .fh = NFS_FH(dir),
150 .name = name->name, 149 .name = name->name,
151 .len = name->len 150 .len = name->len
152 }; 151 };
153 struct nfs3_diropres res = { 152 struct nfs3_diropres res = {
154 .dir_attr = &dir_attr,
155 .fh = fhandle, 153 .fh = fhandle,
156 .fattr = fattr 154 .fattr = fattr
157 }; 155 };
@@ -163,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
163 int status; 161 int status;
164 162
165 dprintk("NFS call lookup %s\n", name->name); 163 dprintk("NFS call lookup %s\n", name->name);
166 nfs_fattr_init(&dir_attr); 164 res.dir_attr = nfs_alloc_fattr();
165 if (res.dir_attr == NULL)
166 return -ENOMEM;
167
167 nfs_fattr_init(fattr); 168 nfs_fattr_init(fattr);
168 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 169 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
169 nfs_refresh_inode(dir, &dir_attr); 170 nfs_refresh_inode(dir, res.dir_attr);
170 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { 171 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
171 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; 172 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
172 msg.rpc_argp = fhandle; 173 msg.rpc_argp = fhandle;
173 msg.rpc_resp = fattr; 174 msg.rpc_resp = fattr;
174 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 175 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
175 } 176 }
177 nfs_free_fattr(res.dir_attr);
176 dprintk("NFS reply lookup: %d\n", status); 178 dprintk("NFS reply lookup: %d\n", status);
177 return status; 179 return status;
178} 180}
179 181
180static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) 182static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
181{ 183{
182 struct nfs_fattr fattr;
183 struct nfs3_accessargs arg = { 184 struct nfs3_accessargs arg = {
184 .fh = NFS_FH(inode), 185 .fh = NFS_FH(inode),
185 }; 186 };
186 struct nfs3_accessres res = { 187 struct nfs3_accessres res;
187 .fattr = &fattr,
188 };
189 struct rpc_message msg = { 188 struct rpc_message msg = {
190 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], 189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
191 .rpc_argp = &arg, 190 .rpc_argp = &arg,
@@ -193,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
193 .rpc_cred = entry->cred, 192 .rpc_cred = entry->cred,
194 }; 193 };
195 int mode = entry->mask; 194 int mode = entry->mask;
196 int status; 195 int status = -ENOMEM;
197 196
198 dprintk("NFS call access\n"); 197 dprintk("NFS call access\n");
199 198
@@ -210,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
210 if (mode & MAY_EXEC) 209 if (mode & MAY_EXEC)
211 arg.access |= NFS3_ACCESS_EXECUTE; 210 arg.access |= NFS3_ACCESS_EXECUTE;
212 } 211 }
213 nfs_fattr_init(&fattr); 212
213 res.fattr = nfs_alloc_fattr();
214 if (res.fattr == NULL)
215 goto out;
216
214 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 217 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
215 nfs_refresh_inode(inode, &fattr); 218 nfs_refresh_inode(inode, res.fattr);
216 if (status == 0) { 219 if (status == 0) {
217 entry->mask = 0; 220 entry->mask = 0;
218 if (res.access & NFS3_ACCESS_READ) 221 if (res.access & NFS3_ACCESS_READ)
@@ -222,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
222 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) 225 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
223 entry->mask |= MAY_EXEC; 226 entry->mask |= MAY_EXEC;
224 } 227 }
228 nfs_free_fattr(res.fattr);
229out:
225 dprintk("NFS reply access: %d\n", status); 230 dprintk("NFS reply access: %d\n", status);
226 return status; 231 return status;
227} 232}
@@ -229,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
229static int nfs3_proc_readlink(struct inode *inode, struct page *page, 234static int nfs3_proc_readlink(struct inode *inode, struct page *page,
230 unsigned int pgbase, unsigned int pglen) 235 unsigned int pgbase, unsigned int pglen)
231{ 236{
232 struct nfs_fattr fattr; 237 struct nfs_fattr *fattr;
233 struct nfs3_readlinkargs args = { 238 struct nfs3_readlinkargs args = {
234 .fh = NFS_FH(inode), 239 .fh = NFS_FH(inode),
235 .pgbase = pgbase, 240 .pgbase = pgbase,
@@ -239,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
239 struct rpc_message msg = { 244 struct rpc_message msg = {
240 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], 245 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK],
241 .rpc_argp = &args, 246 .rpc_argp = &args,
242 .rpc_resp = &fattr,
243 }; 247 };
244 int status; 248 int status = -ENOMEM;
245 249
246 dprintk("NFS call readlink\n"); 250 dprintk("NFS call readlink\n");
247 nfs_fattr_init(&fattr); 251 fattr = nfs_alloc_fattr();
252 if (fattr == NULL)
253 goto out;
254 msg.rpc_resp = fattr;
255
248 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 256 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
249 nfs_refresh_inode(inode, &fattr); 257 nfs_refresh_inode(inode, fattr);
258 nfs_free_fattr(fattr);
259out:
250 dprintk("NFS reply readlink: %d\n", status); 260 dprintk("NFS reply readlink: %d\n", status);
251 return status; 261 return status;
252} 262}
@@ -396,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
396 .rpc_argp = &arg, 406 .rpc_argp = &arg,
397 .rpc_resp = &res, 407 .rpc_resp = &res,
398 }; 408 };
399 int status; 409 int status = -ENOMEM;
400 410
401 dprintk("NFS call remove %s\n", name->name); 411 dprintk("NFS call remove %s\n", name->name);
402 nfs_fattr_init(&res.dir_attr); 412 res.dir_attr = nfs_alloc_fattr();
413 if (res.dir_attr == NULL)
414 goto out;
415
403 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 416 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
404 nfs_post_op_update_inode(dir, &res.dir_attr); 417 nfs_post_op_update_inode(dir, res.dir_attr);
418 nfs_free_fattr(res.dir_attr);
419out:
405 dprintk("NFS reply remove: %d\n", status); 420 dprintk("NFS reply remove: %d\n", status);
406 return status; 421 return status;
407} 422}
@@ -419,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
419 if (nfs3_async_handle_jukebox(task, dir)) 434 if (nfs3_async_handle_jukebox(task, dir))
420 return 0; 435 return 0;
421 res = task->tk_msg.rpc_resp; 436 res = task->tk_msg.rpc_resp;
422 nfs_post_op_update_inode(dir, &res->dir_attr); 437 nfs_post_op_update_inode(dir, res->dir_attr);
423 return 1; 438 return 1;
424} 439}
425 440
@@ -427,7 +442,6 @@ static int
427nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
428 struct inode *new_dir, struct qstr *new_name) 443 struct inode *new_dir, struct qstr *new_name)
429{ 444{
430 struct nfs_fattr old_dir_attr, new_dir_attr;
431 struct nfs3_renameargs arg = { 445 struct nfs3_renameargs arg = {
432 .fromfh = NFS_FH(old_dir), 446 .fromfh = NFS_FH(old_dir),
433 .fromname = old_name->name, 447 .fromname = old_name->name,
@@ -436,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
436 .toname = new_name->name, 450 .toname = new_name->name,
437 .tolen = new_name->len 451 .tolen = new_name->len
438 }; 452 };
439 struct nfs3_renameres res = { 453 struct nfs3_renameres res;
440 .fromattr = &old_dir_attr,
441 .toattr = &new_dir_attr
442 };
443 struct rpc_message msg = { 454 struct rpc_message msg = {
444 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
445 .rpc_argp = &arg, 456 .rpc_argp = &arg,
446 .rpc_resp = &res, 457 .rpc_resp = &res,
447 }; 458 };
448 int status; 459 int status = -ENOMEM;
449 460
450 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
451 nfs_fattr_init(&old_dir_attr); 462
452 nfs_fattr_init(&new_dir_attr); 463 res.fromattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL)
466 goto out;
467
453 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
454 nfs_post_op_update_inode(old_dir, &old_dir_attr); 469 nfs_post_op_update_inode(old_dir, res.fromattr);
455 nfs_post_op_update_inode(new_dir, &new_dir_attr); 470 nfs_post_op_update_inode(new_dir, res.toattr);
471out:
472 nfs_free_fattr(res.toattr);
473 nfs_free_fattr(res.fromattr);
456 dprintk("NFS reply rename: %d\n", status); 474 dprintk("NFS reply rename: %d\n", status);
457 return status; 475 return status;
458} 476}
@@ -460,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
460static int 478static int
461nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 479nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
462{ 480{
463 struct nfs_fattr dir_attr, fattr;
464 struct nfs3_linkargs arg = { 481 struct nfs3_linkargs arg = {
465 .fromfh = NFS_FH(inode), 482 .fromfh = NFS_FH(inode),
466 .tofh = NFS_FH(dir), 483 .tofh = NFS_FH(dir),
467 .toname = name->name, 484 .toname = name->name,
468 .tolen = name->len 485 .tolen = name->len
469 }; 486 };
470 struct nfs3_linkres res = { 487 struct nfs3_linkres res;
471 .dir_attr = &dir_attr,
472 .fattr = &fattr
473 };
474 struct rpc_message msg = { 488 struct rpc_message msg = {
475 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], 489 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK],
476 .rpc_argp = &arg, 490 .rpc_argp = &arg,
477 .rpc_resp = &res, 491 .rpc_resp = &res,
478 }; 492 };
479 int status; 493 int status = -ENOMEM;
480 494
481 dprintk("NFS call link %s\n", name->name); 495 dprintk("NFS call link %s\n", name->name);
482 nfs_fattr_init(&dir_attr); 496 res.fattr = nfs_alloc_fattr();
483 nfs_fattr_init(&fattr); 497 res.dir_attr = nfs_alloc_fattr();
498 if (res.fattr == NULL || res.dir_attr == NULL)
499 goto out;
500
484 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 501 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
485 nfs_post_op_update_inode(dir, &dir_attr); 502 nfs_post_op_update_inode(dir, res.dir_attr);
486 nfs_post_op_update_inode(inode, &fattr); 503 nfs_post_op_update_inode(inode, res.fattr);
504out:
505 nfs_free_fattr(res.dir_attr);
506 nfs_free_fattr(res.fattr);
487 dprintk("NFS reply link: %d\n", status); 507 dprintk("NFS reply link: %d\n", status);
488 return status; 508 return status;
489} 509}
@@ -554,7 +574,7 @@ out:
554static int 574static int
555nfs3_proc_rmdir(struct inode *dir, struct qstr *name) 575nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
556{ 576{
557 struct nfs_fattr dir_attr; 577 struct nfs_fattr *dir_attr;
558 struct nfs3_diropargs arg = { 578 struct nfs3_diropargs arg = {
559 .fh = NFS_FH(dir), 579 .fh = NFS_FH(dir),
560 .name = name->name, 580 .name = name->name,
@@ -563,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
563 struct rpc_message msg = { 583 struct rpc_message msg = {
564 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], 584 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
565 .rpc_argp = &arg, 585 .rpc_argp = &arg,
566 .rpc_resp = &dir_attr,
567 }; 586 };
568 int status; 587 int status = -ENOMEM;
569 588
570 dprintk("NFS call rmdir %s\n", name->name); 589 dprintk("NFS call rmdir %s\n", name->name);
571 nfs_fattr_init(&dir_attr); 590 dir_attr = nfs_alloc_fattr();
591 if (dir_attr == NULL)
592 goto out;
593
594 msg.rpc_resp = dir_attr;
572 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 595 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
573 nfs_post_op_update_inode(dir, &dir_attr); 596 nfs_post_op_update_inode(dir, dir_attr);
597 nfs_free_fattr(dir_attr);
598out:
574 dprintk("NFS reply rmdir: %d\n", status); 599 dprintk("NFS reply rmdir: %d\n", status);
575 return status; 600 return status;
576} 601}
@@ -589,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
589 u64 cookie, struct page *page, unsigned int count, int plus) 614 u64 cookie, struct page *page, unsigned int count, int plus)
590{ 615{
591 struct inode *dir = dentry->d_inode; 616 struct inode *dir = dentry->d_inode;
592 struct nfs_fattr dir_attr;
593 __be32 *verf = NFS_COOKIEVERF(dir); 617 __be32 *verf = NFS_COOKIEVERF(dir);
594 struct nfs3_readdirargs arg = { 618 struct nfs3_readdirargs arg = {
595 .fh = NFS_FH(dir), 619 .fh = NFS_FH(dir),
@@ -600,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
600 .pages = &page 624 .pages = &page
601 }; 625 };
602 struct nfs3_readdirres res = { 626 struct nfs3_readdirres res = {
603 .dir_attr = &dir_attr,
604 .verf = verf, 627 .verf = verf,
605 .plus = plus 628 .plus = plus
606 }; 629 };
@@ -610,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
610 .rpc_resp = &res, 633 .rpc_resp = &res,
611 .rpc_cred = cred 634 .rpc_cred = cred
612 }; 635 };
613 int status; 636 int status = -ENOMEM;
614 637
615 if (plus) 638 if (plus)
616 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; 639 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
@@ -618,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
618 dprintk("NFS call readdir%s %d\n", 641 dprintk("NFS call readdir%s %d\n",
619 plus? "plus" : "", (unsigned int) cookie); 642 plus? "plus" : "", (unsigned int) cookie);
620 643
621 nfs_fattr_init(&dir_attr); 644 res.dir_attr = nfs_alloc_fattr();
645 if (res.dir_attr == NULL)
646 goto out;
647
622 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 648 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
623 649
624 nfs_invalidate_atime(dir); 650 nfs_invalidate_atime(dir);
651 nfs_refresh_inode(dir, res.dir_attr);
625 652
626 nfs_refresh_inode(dir, &dir_attr); 653 nfs_free_fattr(res.dir_attr);
654out:
627 dprintk("NFS reply readdir: %d\n", status); 655 dprintk("NFS reply readdir: %d\n", status);
628 return status; 656 return status;
629} 657}
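
Several of the nfs3proc.c conversions above (rename and link in particular) allocate two fattrs, pre-set status to -ENOMEM, and jump to a shared label that frees both pointers unconditionally. That works because kfree(NULL) is a no-op, so a partially failed allocation needs no special-casing. Reduced to user-space C:

    #include <stdio.h>
    #include <stdlib.h>

    static int do_rename(void)
    {
        void *fromattr, *toattr;
        int status = -12;               /* -ENOMEM */

        fromattr = malloc(64);
        toattr = malloc(64);
        if (fromattr == NULL || toattr == NULL)
            goto out;                   /* partial failure falls through... */

        status = 0;                     /* ...after the RPC would run here */
    out:
        free(toattr);                   /* free(NULL) is a no-op */
        free(fromattr);
        return status;
    }

    int main(void)
    {
        printf("rename: %d\n", do_rename());
        return 0;
    }
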
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 56a86f6ac8b5..9769704f8ce6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -330,7 +330,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
330static int 330static int
331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
332{ 332{
333 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 333 struct rpc_auth *auth = req->rq_cred->cr_auth;
334 unsigned int replen; 334 unsigned int replen;
335 u32 count = args->count; 335 u32 count = args->count;
336 336
@@ -471,7 +471,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
471static int 471static int
472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) 472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
473{ 473{
474 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 474 struct rpc_auth *auth = req->rq_cred->cr_auth;
475 unsigned int replen; 475 unsigned int replen;
476 u32 count = args->count; 476 u32 count = args->count;
477 477
@@ -675,7 +675,7 @@ static int
675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, 675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
676 struct nfs3_getaclargs *args) 676 struct nfs3_getaclargs *args)
677{ 677{
678 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 678 struct rpc_auth *auth = req->rq_cred->cr_auth;
679 unsigned int replen; 679 unsigned int replen;
680 680
681 p = xdr_encode_fhandle(p, args->fh); 681 p = xdr_encode_fhandle(p, args->fh);
@@ -762,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
762static int 762static int
763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
764{ 764{
765 return nfs3_xdr_wccstat(req, p, &res->dir_attr); 765 return nfs3_xdr_wccstat(req, p, res->dir_attr);
766} 766}
767 767
768/* 768/*
@@ -802,7 +802,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
802static int 802static int
803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
804{ 804{
805 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 805 struct rpc_auth *auth = req->rq_cred->cr_auth;
806 unsigned int replen; 806 unsigned int replen;
807 807
808 p = xdr_encode_fhandle(p, args->fh); 808 p = xdr_encode_fhandle(p, args->fh);
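
The nfs3xdr.c hunks replace req->rq_task->tk_msg.rpc_cred with req->rq_cred, which appears to be a credential pointer cached directly on the rpc_rqst so the XDR encoders no longer have to reach through the task. A toy model of the shortened pointer chain (all types here are stand-ins):

    #include <stdio.h>

    struct auth { int flavor; };
    struct cred { struct auth *cr_auth; };
    struct msg  { struct cred *rpc_cred; };
    struct task { struct msg tk_msg; };

    /* The request now carries its own cred pointer, so the encoder no
     * longer depends on rq_task being reachable at encode time. */
    struct rqst {
        struct task *rq_task;
        struct cred *rq_cred;   /* cached rq_task->tk_msg.rpc_cred */
    };

    int main(void)
    {
        struct auth a = { 1 };
        struct cred c = { &a };
        struct task t = { { &c } };
        struct rqst r = { &t, t.tk_msg.rpc_cred };

        printf("%d %d\n",
               r.rq_task->tk_msg.rpc_cred->cr_auth->flavor, /* old chain */
               r.rq_cred->cr_auth->flavor);                 /* new chain */
        return 0;
    }
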
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a187200a7aac..311e15cc8af0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -45,10 +45,29 @@ enum nfs4_client_state {
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_RESET, 47 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_SESSION_DRAINING,
49 NFS4CLNT_RECALL_SLOT, 48 NFS4CLNT_RECALL_SLOT,
50}; 49};
51 50
51enum nfs4_session_state {
52 NFS4_SESSION_INITING,
53 NFS4_SESSION_DRAINING,
54};
55
56struct nfs4_minor_version_ops {
57 u32 minor_version;
58
59 int (*call_sync)(struct nfs_server *server,
60 struct rpc_message *msg,
61 struct nfs4_sequence_args *args,
62 struct nfs4_sequence_res *res,
63 int cache_reply);
64 int (*validate_stateid)(struct nfs_delegation *,
65 const nfs4_stateid *);
66 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
67 const struct nfs4_state_recovery_ops *nograce_recovery_ops;
68 const struct nfs4_state_maintenance_ops *state_renewal_ops;
69};
70
52/* 71/*
53 * struct rpc_sequence ensures that RPC calls are sent in the exact 72 * struct rpc_sequence ensures that RPC calls are sent in the exact
54 * order that they appear on the list. 73 * order that they appear on the list.
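
The new struct nfs4_minor_version_ops gathers per-minor-version behaviour behind function pointers (the sync-call path, stateid validation, recovery and renewal ops), replacing the per-version arrays such as nfs4_reboot_recovery_ops[] that this diff removes further down. The shape is the usual C ops table; a sketch with invented operations:

    #include <stdio.h>

    struct minor_version_ops {
        unsigned minor_version;
        int (*call_sync)(const char *proc);
    };

    static int v40_call_sync(const char *proc)
    { printf("v4.0 %s\n", proc); return 0; }

    static int v41_call_sync(const char *proc)
    { printf("v4.1 %s (sessions)\n", proc); return 0; }

    static const struct minor_version_ops v40 = { 0, v40_call_sync };
    static const struct minor_version_ops v41 = { 1, v41_call_sync };

    /* Indexed by mount-time minor version, as nfs_v4_minor_ops[]
     * appears to be. */
    static const struct minor_version_ops *minor_ops[] = { &v40, &v41 };

    int main(void)
    {
        minor_ops[0]->call_sync("GETATTR");
        minor_ops[1]->call_sync("GETATTR");
        return 0;
    }
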
@@ -89,7 +108,6 @@ struct nfs_unique_id {
89 */ 108 */
90struct nfs4_state_owner { 109struct nfs4_state_owner {
91 struct nfs_unique_id so_owner_id; 110 struct nfs_unique_id so_owner_id;
92 struct nfs_client *so_client;
93 struct nfs_server *so_server; 111 struct nfs_server *so_server;
94 struct rb_node so_client_node; 112 struct rb_node so_client_node;
95 113
@@ -99,7 +117,6 @@ struct nfs4_state_owner {
99 atomic_t so_count; 117 atomic_t so_count;
100 unsigned long so_flags; 118 unsigned long so_flags;
101 struct list_head so_states; 119 struct list_head so_states;
102 struct list_head so_delegations;
103 struct nfs_seqid_counter so_seqid; 120 struct nfs_seqid_counter so_seqid;
104 struct rpc_sequence so_sequence; 121 struct rpc_sequence so_sequence;
105}; 122};
@@ -125,10 +142,20 @@ enum {
125 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) 142 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
126 */ 143 */
127 144
145struct nfs4_lock_owner {
146 unsigned int lo_type;
147#define NFS4_ANY_LOCK_TYPE (0U)
148#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
149#define NFS4_POSIX_LOCK_TYPE (1U << 1)
150 union {
151 fl_owner_t posix_owner;
152 pid_t flock_owner;
153 } lo_u;
154};
155
128struct nfs4_lock_state { 156struct nfs4_lock_state {
129 struct list_head ls_locks; /* Other lock stateids */ 157 struct list_head ls_locks; /* Other lock stateids */
130 struct nfs4_state * ls_state; /* Pointer to open state */ 158 struct nfs4_state * ls_state; /* Pointer to open state */
131 fl_owner_t ls_owner; /* POSIX lock owner */
132#define NFS_LOCK_INITIALIZED 1 159#define NFS_LOCK_INITIALIZED 1
133 int ls_flags; 160 int ls_flags;
134 struct nfs_seqid_counter ls_seqid; 161 struct nfs_seqid_counter ls_seqid;
@@ -136,6 +163,7 @@ struct nfs4_lock_state {
136 struct nfs_unique_id ls_id; 163 struct nfs_unique_id ls_id;
137 nfs4_stateid ls_stateid; 164 nfs4_stateid ls_stateid;
138 atomic_t ls_count; 165 atomic_t ls_count;
166 struct nfs4_lock_owner ls_owner;
139}; 167};
140 168
141/* bits for nfs4_state->flags */ 169/* bits for nfs4_state->flags */
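
ls_owner changes from a bare fl_owner_t into the new struct nfs4_lock_owner, a small tagged union keyed by lo_type, so a lock state can belong to either a POSIX owner (an fl_owner_t) or a flock owner (a pid). The generic pattern:

    #include <stdio.h>
    #include <sys/types.h>

    #define ANY_LOCK_TYPE   (0U)
    #define FLOCK_LOCK_TYPE (1U << 0)
    #define POSIX_LOCK_TYPE (1U << 1)

    struct lock_owner {
        unsigned int lo_type;       /* says which union member is valid */
        union {
            void *posix_owner;      /* fl_owner_t in the kernel */
            pid_t flock_owner;
        } lo_u;
    };

    static int owners_match(const struct lock_owner *a,
                            const struct lock_owner *b)
    {
        if (a->lo_type != b->lo_type)
            return 0;
        if (a->lo_type == POSIX_LOCK_TYPE)
            return a->lo_u.posix_owner == b->lo_u.posix_owner;
        return a->lo_u.flock_owner == b->lo_u.flock_owner;
    }

    int main(void)
    {
        struct lock_owner x = { FLOCK_LOCK_TYPE, { .flock_owner = 42 } };
        struct lock_owner y = x;
        printf("match: %d\n", owners_match(&x, &y));
        return 0;
    }
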
@@ -206,24 +234,28 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
206 234
207 235
208/* nfs4proc.c */ 236/* nfs4proc.c */
209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); 237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 239extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 240extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); 244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 245extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 246extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 247extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
220extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 248extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
221 struct nfs4_fs_locations *fs_locations, struct page *page); 249 struct nfs4_fs_locations *fs_locations, struct page *page);
250extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
222 251
223extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
224extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
225#if defined(CONFIG_NFS_V4_1) 252#if defined(CONFIG_NFS_V4_1)
226extern int nfs4_setup_sequence(struct nfs_client *clp, 253static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
254{
255 return server->nfs_client->cl_session;
256}
257
258extern int nfs4_setup_sequence(const struct nfs_server *server,
227 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 259 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
228 int cache_reply, struct rpc_task *task); 260 int cache_reply, struct rpc_task *task);
229extern void nfs4_destroy_session(struct nfs4_session *session); 261extern void nfs4_destroy_session(struct nfs4_session *session);
@@ -234,7 +266,12 @@ extern int nfs4_init_session(struct nfs_server *server);
234extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 266extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
235 struct nfs_fsinfo *fsinfo); 267 struct nfs_fsinfo *fsinfo);
236#else /* CONFIG_NFS_v4_1 */ 268#else /* CONFIG_NFS_v4_1 */
237static inline int nfs4_setup_sequence(struct nfs_client *clp, 269static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
270{
271 return NULL;
272}
273
274static inline int nfs4_setup_sequence(const struct nfs_server *server,
238 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 275 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
239 int cache_reply, struct rpc_task *task) 276 int cache_reply, struct rpc_task *task)
240{ 277{
@@ -247,7 +284,7 @@ static inline int nfs4_init_session(struct nfs_server *server)
247} 284}
248#endif /* CONFIG_NFS_V4_1 */ 285#endif /* CONFIG_NFS_V4_1 */
249 286
250extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; 287extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
251 288
252extern const u32 nfs4_fattr_bitmap[2]; 289extern const u32 nfs4_fattr_bitmap[2];
253extern const u32 nfs4_statfs_bitmap[2]; 290extern const u32 nfs4_statfs_bitmap[2];
@@ -284,9 +321,9 @@ extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
284extern void nfs41_handle_recall_slot(struct nfs_client *clp); 321extern void nfs41_handle_recall_slot(struct nfs_client *clp);
285extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 322extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 323extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 324extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
288 325
289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); 326extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 327extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); 328extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); 329extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index f071d12c613b..3c2a1724fbd2 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -115,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
115 char *page, char *page2, 115 char *page, char *page2,
116 const struct nfs4_fs_location *location) 116 const struct nfs4_fs_location *location)
117{ 117{
118 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
118 struct vfsmount *mnt = ERR_PTR(-ENOENT); 119 struct vfsmount *mnt = ERR_PTR(-ENOENT);
119 char *mnt_path; 120 char *mnt_path;
120 unsigned int maxbuflen; 121 unsigned int maxbuflen;
@@ -126,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
126 mountdata->mnt_path = mnt_path; 127 mountdata->mnt_path = mnt_path;
127 maxbuflen = mnt_path - 1 - page2; 128 maxbuflen = mnt_path - 1 - page2;
128 129
130 mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
131 if (mountdata->addr == NULL)
132 return ERR_PTR(-ENOMEM);
133
129 for (s = 0; s < location->nservers; s++) { 134 for (s = 0; s < location->nservers; s++) {
130 const struct nfs4_string *buf = &location->servers[s]; 135 const struct nfs4_string *buf = &location->servers[s];
131 struct sockaddr_storage addr;
132 136
133 if (buf->len <= 0 || buf->len >= maxbuflen) 137 if (buf->len <= 0 || buf->len >= maxbuflen)
134 continue; 138 continue;
@@ -137,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
137 continue; 141 continue;
138 142
139 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, 143 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
140 (struct sockaddr *)&addr, sizeof(addr)); 144 mountdata->addr, addr_bufsize);
141 if (mountdata->addrlen == 0) 145 if (mountdata->addrlen == 0)
142 continue; 146 continue;
143 147
144 mountdata->addr = (struct sockaddr *)&addr;
145 rpc_set_port(mountdata->addr, NFS_PORT); 148 rpc_set_port(mountdata->addr, NFS_PORT);
146 149
147 memcpy(page2, buf->data, buf->len); 150 memcpy(page2, buf->data, buf->len);
@@ -156,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
156 if (!IS_ERR(mnt)) 159 if (!IS_ERR(mnt))
157 break; 160 break;
158 } 161 }
162 kfree(mountdata->addr);
159 return mnt; 163 return mnt;
160} 164}
161 165
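
In try_location() the old code pointed mountdata->addr at a struct sockaddr_storage that lived on the stack inside the loop; the replacement allocates one sockaddr-sized buffer up front, reuses it for every candidate server, and frees it once on the way out, so the pointer can no longer outlive its storage. A user-space reduction of the fix:

    #include <stdio.h>
    #include <stdlib.h>

    struct mountdata { char *addr; };

    static int try_location(struct mountdata *md)
    {
        const size_t addr_bufsize = 128;  /* sizeof(struct sockaddr_storage) */
        int s;

        /* One heap buffer reused for every candidate server, instead of
         * storing the address of a per-iteration stack variable. */
        md->addr = malloc(addr_bufsize);
        if (md->addr == NULL)
            return -12;                   /* -ENOMEM */

        for (s = 0; s < 3; s++)
            snprintf(md->addr, addr_bufsize, "server-%d", s);

        printf("last candidate: %s\n", md->addr);
        free(md->addr);
        return 0;
    }

    int main(void)
    {
        struct mountdata md;
        return try_location(&md) ? 1 : 0;
    }
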
@@ -221,8 +225,8 @@ out:
221 225
222/* 226/*
223 * nfs_do_refmount - handle crossing a referral on server 227 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
224 * @dentry - dentry of referral 229 * @dentry - dentry of referral
225 * @nd - nameidata info
226 * 230 *
227 */ 231 */
228struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 071fcedd517c..089da5b5d20a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -70,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
70static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 70static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
71static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 71static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
72static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 72static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
73static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
74 struct nfs_fattr *fattr, struct iattr *sattr,
75 struct nfs4_state *state);
73 76
74/* Prevent leaks of NFSv4 errors into userland */ 77/* Prevent leaks of NFSv4 errors into userland */
75static int nfs4_map_errors(int err) 78static int nfs4_map_errors(int err)
@@ -300,15 +303,19 @@ do_state_recovery:
300} 303}
301 304
302 305
303static void renew_lease(const struct nfs_server *server, unsigned long timestamp) 306static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
304{ 307{
305 struct nfs_client *clp = server->nfs_client;
306 spin_lock(&clp->cl_lock); 308 spin_lock(&clp->cl_lock);
307 if (time_before(clp->cl_last_renewal,timestamp)) 309 if (time_before(clp->cl_last_renewal,timestamp))
308 clp->cl_last_renewal = timestamp; 310 clp->cl_last_renewal = timestamp;
309 spin_unlock(&clp->cl_lock); 311 spin_unlock(&clp->cl_lock);
310} 312}
311 313
314static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
315{
316 do_renew_lease(server->nfs_client, timestamp);
317}
318
312#if defined(CONFIG_NFS_V4_1) 319#if defined(CONFIG_NFS_V4_1)
313 320
314/* 321/*
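
renew_lease() is split so its clp-based body, do_renew_lease(), can also be called from the v4.1 SEQUENCE completion path below, where only the session's nfs_client is at hand. The extraction, reduced (the kernel version takes clp->cl_lock and compares with time_before(), which is jiffies-wrap safe; the sketch does neither):

    #include <stdio.h>

    struct nfs_client { unsigned long cl_last_renewal; };
    struct nfs_server { struct nfs_client *nfs_client; };

    static void do_renew_lease(struct nfs_client *clp, unsigned long ts)
    {
        if (clp->cl_last_renewal < ts)
            clp->cl_last_renewal = ts;
    }

    static void renew_lease(const struct nfs_server *srv, unsigned long ts)
    {
        do_renew_lease(srv->nfs_client, ts);
    }

    int main(void)
    {
        struct nfs_client clp = { 0 };
        struct nfs_server srv = { &clp };

        renew_lease(&srv, 100);     /* per-server call sites */
        do_renew_lease(&clp, 50);   /* SEQUENCE path: older stamp, no-op */
        printf("%lu\n", clp.cl_last_renewal);
        return 0;
    }
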
@@ -353,7 +360,7 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
353{ 360{
354 struct rpc_task *task; 361 struct rpc_task *task;
355 362
356 if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { 363 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
357 task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); 364 task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
358 if (task) 365 if (task)
359 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 366 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
@@ -367,12 +374,11 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
367 complete(&ses->complete); 374 complete(&ses->complete);
368} 375}
369 376
370static void nfs41_sequence_free_slot(const struct nfs_client *clp, 377static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
371 struct nfs4_sequence_res *res)
372{ 378{
373 struct nfs4_slot_table *tbl; 379 struct nfs4_slot_table *tbl;
374 380
375 tbl = &clp->cl_session->fc_slot_table; 381 tbl = &res->sr_session->fc_slot_table;
376 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { 382 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
377 /* just wake up the next guy waiting since 383 /* just wake up the next guy waiting since
378 * we may have not consumed a slot after all */ 384 * we may have not consumed a slot after all */
@@ -382,18 +388,17 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp,
382 388
383 spin_lock(&tbl->slot_tbl_lock); 389 spin_lock(&tbl->slot_tbl_lock);
384 nfs4_free_slot(tbl, res->sr_slotid); 390 nfs4_free_slot(tbl, res->sr_slotid);
385 nfs41_check_drain_session_complete(clp->cl_session); 391 nfs41_check_drain_session_complete(res->sr_session);
386 spin_unlock(&tbl->slot_tbl_lock); 392 spin_unlock(&tbl->slot_tbl_lock);
387 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 393 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
388} 394}
389 395
390static void nfs41_sequence_done(struct nfs_client *clp, 396static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
391 struct nfs4_sequence_res *res,
392 int rpc_status)
393{ 397{
394 unsigned long timestamp; 398 unsigned long timestamp;
395 struct nfs4_slot_table *tbl; 399 struct nfs4_slot_table *tbl;
396 struct nfs4_slot *slot; 400 struct nfs4_slot *slot;
401 struct nfs_client *clp;
397 402
398 /* 403 /*
399 * sr_status remains 1 if an RPC level error occurred. The server 404 * sr_status remains 1 if an RPC level error occurred. The server
@@ -408,25 +413,51 @@ static void nfs41_sequence_done(struct nfs_client *clp,
408 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) 413 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
409 goto out; 414 goto out;
410 415
416 tbl = &res->sr_session->fc_slot_table;
417 slot = tbl->slots + res->sr_slotid;
418
411 /* Check the SEQUENCE operation status */ 419 /* Check the SEQUENCE operation status */
412 if (res->sr_status == 0) { 420 switch (res->sr_status) {
413 tbl = &clp->cl_session->fc_slot_table; 421 case 0:
414 slot = tbl->slots + res->sr_slotid;
415 /* Update the slot's sequence and clientid lease timer */ 422 /* Update the slot's sequence and clientid lease timer */
416 ++slot->seq_nr; 423 ++slot->seq_nr;
417 timestamp = res->sr_renewal_time; 424 timestamp = res->sr_renewal_time;
418 spin_lock(&clp->cl_lock); 425 clp = res->sr_session->clp;
419 if (time_before(clp->cl_last_renewal, timestamp)) 426 do_renew_lease(clp, timestamp);
420 clp->cl_last_renewal = timestamp;
421 spin_unlock(&clp->cl_lock);
422 /* Check sequence flags */ 427 /* Check sequence flags */
423 if (atomic_read(&clp->cl_count) > 1) 428 if (atomic_read(&clp->cl_count) > 1)
424 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 429 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
430 break;
431 case -NFS4ERR_DELAY:
432 /* The server detected a resend of the RPC call and
433 * returned NFS4ERR_DELAY as per Section 2.10.6.2
434 * of RFC5661.
435 */
436 dprintk("%s: slot=%d seq=%d: Operation in progress\n",
437 __func__, res->sr_slotid, slot->seq_nr);
438 goto out_retry;
439 default:
440 /* Just update the slot sequence no. */
441 ++slot->seq_nr;
425 } 442 }
426out: 443out:
427 /* The session may be reset by one of the error handlers. */ 444 /* The session may be reset by one of the error handlers. */
428 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); 445 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
429 nfs41_sequence_free_slot(clp, res); 446 nfs41_sequence_free_slot(res);
447 return 1;
448out_retry:
449 if (!rpc_restart_call(task))
450 goto out;
451 rpc_delay(task, NFS4_POLL_RETRY_MAX);
452 return 0;
453}
454
455static int nfs4_sequence_done(struct rpc_task *task,
456 struct nfs4_sequence_res *res)
457{
458 if (res->sr_session == NULL)
459 return 1;
460 return nfs41_sequence_done(task, res);
430} 461}
431 462
432/* 463/*
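
nfs41_sequence_done() now returns 1 when the caller may proceed and 0 when it has re-queued the task, retrying NFS4ERR_DELAY with rpc_restart_call() plus rpc_delay() per the RFC 5661 reference in the hunk; that return convention is why call sites later in this diff become "if (!nfs4_sequence_done(...)) return;". A compact sketch of the status switch (the restart machinery is simplified away):

    #include <stdio.h>

    #define NFS4ERR_DELAY 10008

    struct slot { unsigned seq_nr; };

    /* Returns 1 if the caller may continue, 0 if the call was re-queued. */
    static int sequence_done(struct slot *slot, int sr_status)
    {
        switch (sr_status) {
        case 0:
            ++slot->seq_nr;     /* success: advance the slot sequence */
            break;
        case -NFS4ERR_DELAY:
            printf("seq=%u: operation in progress, retrying\n",
                   slot->seq_nr);
            return 0;           /* slot kept; task restarted after a delay */
        default:
            ++slot->seq_nr;     /* other errors still consumed the seqid */
        }
        return 1;               /* free the slot, let the caller proceed */
    }

    int main(void)
    {
        struct slot s = { 1 };
        printf("%d\n", sequence_done(&s, 0));
        printf("%d\n", sequence_done(&s, -NFS4ERR_DELAY));
        return 0;
    }
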
@@ -477,12 +508,11 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
477 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) 508 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
478 return 0; 509 return 0;
479 510
480 memset(res, 0, sizeof(*res));
481 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 511 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
482 tbl = &session->fc_slot_table; 512 tbl = &session->fc_slot_table;
483 513
484 spin_lock(&tbl->slot_tbl_lock); 514 spin_lock(&tbl->slot_tbl_lock);
485 if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && 515 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
486 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { 516 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
487 /* 517 /*
488 * The state manager will wait until the slot table is empty. 518 * The state manager will wait until the slot table is empty.
@@ -522,6 +552,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
522 res->sr_session = session; 552 res->sr_session = session;
523 res->sr_slotid = slotid; 553 res->sr_slotid = slotid;
524 res->sr_renewal_time = jiffies; 554 res->sr_renewal_time = jiffies;
555 res->sr_status_flags = 0;
525 /* 556 /*
526 * sr_status is only set in decode_sequence, and so will remain 557 * sr_status is only set in decode_sequence, and so will remain
527 * set to 1 if an rpc level failure occurs. 558 * set to 1 if an rpc level failure occurs.
@@ -530,33 +561,33 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
530 return 0; 561 return 0;
531} 562}
532 563
533int nfs4_setup_sequence(struct nfs_client *clp, 564int nfs4_setup_sequence(const struct nfs_server *server,
534 struct nfs4_sequence_args *args, 565 struct nfs4_sequence_args *args,
535 struct nfs4_sequence_res *res, 566 struct nfs4_sequence_res *res,
536 int cache_reply, 567 int cache_reply,
537 struct rpc_task *task) 568 struct rpc_task *task)
538{ 569{
570 struct nfs4_session *session = nfs4_get_session(server);
539 int ret = 0; 571 int ret = 0;
540 572
573 if (session == NULL) {
574 args->sa_session = NULL;
575 res->sr_session = NULL;
576 goto out;
577 }
578
541 dprintk("--> %s clp %p session %p sr_slotid %d\n", 579 dprintk("--> %s clp %p session %p sr_slotid %d\n",
542 __func__, clp, clp->cl_session, res->sr_slotid); 580 __func__, session->clp, session, res->sr_slotid);
543 581
544 if (!nfs4_has_session(clp)) 582 ret = nfs41_setup_sequence(session, args, res, cache_reply,
545 goto out;
546 ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
547 task); 583 task);
548 if (ret && ret != -EAGAIN) {
549 /* terminate rpc task */
550 task->tk_status = ret;
551 task->tk_action = NULL;
552 }
553out: 584out:
554 dprintk("<-- %s status=%d\n", __func__, ret); 585 dprintk("<-- %s status=%d\n", __func__, ret);
555 return ret; 586 return ret;
556} 587}
557 588
558struct nfs41_call_sync_data { 589struct nfs41_call_sync_data {
559 struct nfs_client *clp; 590 const struct nfs_server *seq_server;
560 struct nfs4_sequence_args *seq_args; 591 struct nfs4_sequence_args *seq_args;
561 struct nfs4_sequence_res *seq_res; 592 struct nfs4_sequence_res *seq_res;
562 int cache_reply; 593 int cache_reply;
@@ -566,9 +597,9 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
566{ 597{
567 struct nfs41_call_sync_data *data = calldata; 598 struct nfs41_call_sync_data *data = calldata;
568 599
569 dprintk("--> %s data->clp->cl_session %p\n", __func__, 600 dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
570 data->clp->cl_session); 601
571 if (nfs4_setup_sequence(data->clp, data->seq_args, 602 if (nfs4_setup_sequence(data->seq_server, data->seq_args,
572 data->seq_res, data->cache_reply, task)) 603 data->seq_res, data->cache_reply, task))
573 return; 604 return;
574 rpc_call_start(task); 605 rpc_call_start(task);
@@ -584,7 +615,7 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
584{ 615{
585 struct nfs41_call_sync_data *data = calldata; 616 struct nfs41_call_sync_data *data = calldata;
586 617
587 nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); 618 nfs41_sequence_done(task, data->seq_res);
588} 619}
589 620
590struct rpc_call_ops nfs41_call_sync_ops = { 621struct rpc_call_ops nfs41_call_sync_ops = {
@@ -597,8 +628,7 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
597 .rpc_call_done = nfs41_call_sync_done, 628 .rpc_call_done = nfs41_call_sync_done,
598}; 629};
599 630
600static int nfs4_call_sync_sequence(struct nfs_client *clp, 631static int nfs4_call_sync_sequence(struct nfs_server *server,
601 struct rpc_clnt *clnt,
602 struct rpc_message *msg, 632 struct rpc_message *msg,
603 struct nfs4_sequence_args *args, 633 struct nfs4_sequence_args *args,
604 struct nfs4_sequence_res *res, 634 struct nfs4_sequence_res *res,
@@ -608,13 +638,13 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
608 int ret; 638 int ret;
609 struct rpc_task *task; 639 struct rpc_task *task;
610 struct nfs41_call_sync_data data = { 640 struct nfs41_call_sync_data data = {
611 .clp = clp, 641 .seq_server = server,
612 .seq_args = args, 642 .seq_args = args,
613 .seq_res = res, 643 .seq_res = res,
614 .cache_reply = cache_reply, 644 .cache_reply = cache_reply,
615 }; 645 };
616 struct rpc_task_setup task_setup = { 646 struct rpc_task_setup task_setup = {
617 .rpc_client = clnt, 647 .rpc_client = server->client,
618 .rpc_message = msg, 648 .rpc_message = msg,
619 .callback_ops = &nfs41_call_sync_ops, 649 .callback_ops = &nfs41_call_sync_ops,
620 .callback_data = &data 650 .callback_data = &data
@@ -639,10 +669,15 @@ int _nfs4_call_sync_session(struct nfs_server *server,
639 struct nfs4_sequence_res *res, 669 struct nfs4_sequence_res *res,
640 int cache_reply) 670 int cache_reply)
641{ 671{
642 return nfs4_call_sync_sequence(server->nfs_client, server->client, 672 return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0);
643 msg, args, res, cache_reply, 0);
644} 673}
645 674
675#else
676static int nfs4_sequence_done(struct rpc_task *task,
677 struct nfs4_sequence_res *res)
678{
679 return 1;
680}
646#endif /* CONFIG_NFS_V4_1 */ 681#endif /* CONFIG_NFS_V4_1 */
647 682
648int _nfs4_call_sync(struct nfs_server *server, 683int _nfs4_call_sync(struct nfs_server *server,
@@ -656,18 +691,9 @@ int _nfs4_call_sync(struct nfs_server *server,
656} 691}
657 692
658#define nfs4_call_sync(server, msg, args, res, cache_reply) \ 693#define nfs4_call_sync(server, msg, args, res, cache_reply) \
659 (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ 694 (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \
660 &(res)->seq_res, (cache_reply)) 695 &(res)->seq_res, (cache_reply))
661 696
662static void nfs4_sequence_done(const struct nfs_server *server,
663 struct nfs4_sequence_res *res, int rpc_status)
664{
665#ifdef CONFIG_NFS_V4_1
666 if (nfs4_has_session(server->nfs_client))
667 nfs41_sequence_done(server->nfs_client, res, rpc_status);
668#endif /* CONFIG_NFS_V4_1 */
669}
670
671static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 697static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
672{ 698{
673 struct nfs_inode *nfsi = NFS_I(dir); 699 struct nfs_inode *nfsi = NFS_I(dir);
@@ -714,17 +740,18 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
714 740
715static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 741static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
716 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 742 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
717 const struct iattr *attrs) 743 const struct iattr *attrs,
744 gfp_t gfp_mask)
718{ 745{
719 struct dentry *parent = dget_parent(path->dentry); 746 struct dentry *parent = dget_parent(path->dentry);
720 struct inode *dir = parent->d_inode; 747 struct inode *dir = parent->d_inode;
721 struct nfs_server *server = NFS_SERVER(dir); 748 struct nfs_server *server = NFS_SERVER(dir);
722 struct nfs4_opendata *p; 749 struct nfs4_opendata *p;
723 750
724 p = kzalloc(sizeof(*p), GFP_KERNEL); 751 p = kzalloc(sizeof(*p), gfp_mask);
725 if (p == NULL) 752 if (p == NULL)
726 goto err; 753 goto err;
727 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 754 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
728 if (p->o_arg.seqid == NULL) 755 if (p->o_arg.seqid == NULL)
729 goto err_free; 756 goto err_free;
730 path_get(path); 757 path_get(path);
@@ -741,19 +768,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
741 p->o_arg.server = server; 768 p->o_arg.server = server;
742 p->o_arg.bitmask = server->attr_bitmask; 769 p->o_arg.bitmask = server->attr_bitmask;
743 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; 770 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
744 if (flags & O_EXCL) { 771 if (flags & O_CREAT) {
745 if (nfs4_has_persistent_session(server->nfs_client)) { 772 u32 *s;
746 /* GUARDED */ 773
747 p->o_arg.u.attrs = &p->attrs;
748 memcpy(&p->attrs, attrs, sizeof(p->attrs));
749 } else { /* EXCLUSIVE4_1 */
750 u32 *s = (u32 *) p->o_arg.u.verifier.data;
751 s[0] = jiffies;
752 s[1] = current->pid;
753 }
754 } else if (flags & O_CREAT) {
755 p->o_arg.u.attrs = &p->attrs; 774 p->o_arg.u.attrs = &p->attrs;
756 memcpy(&p->attrs, attrs, sizeof(p->attrs)); 775 memcpy(&p->attrs, attrs, sizeof(p->attrs));
776 s = (u32 *) p->o_arg.u.verifier.data;
777 s[0] = jiffies;
778 s[1] = current->pid;
757 } 779 }
758 p->c_arg.fh = &p->o_res.fh; 780 p->c_arg.fh = &p->o_res.fh;
759 p->c_arg.stateid = &p->o_res.stateid; 781 p->c_arg.stateid = &p->o_res.stateid;
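
nfs4_opendata_alloc() now handles every O_CREAT open uniformly: copy the attributes and always fill the 8-byte create verifier from jiffies and current->pid, dropping the old persistent-session GUARDED special case. The attribute fixup for O_EXCL moves into _nfs4_do_open() later in this diff, which issues a SETATTR once the open succeeds. The verifier construction, as a user-space equivalent with hypothetical names:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    /* Builds the two 32-bit words of an EXCLUSIVE4 create verifier the
     * way the hunk above does; the kernel uses jiffies and current->pid. */
    static void fill_create_verifier(uint32_t verf[2])
    {
        verf[0] = (uint32_t)time(NULL);
        verf[1] = (uint32_t)getpid();
    }

    int main(void)
    {
        uint32_t verf[2];

        fill_create_verifier(verf);
        printf("verifier: %08x%08x\n", (unsigned)verf[0], (unsigned)verf[1]);
        return 0;
    }
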
@@ -1060,7 +1082,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1060{ 1082{
1061 struct nfs4_opendata *opendata; 1083 struct nfs4_opendata *opendata;
1062 1084
1063 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); 1085 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS);
1064 if (opendata == NULL) 1086 if (opendata == NULL)
1065 return ERR_PTR(-ENOMEM); 1087 return ERR_PTR(-ENOMEM);
1066 opendata->state = state; 1088 opendata->state = state;
@@ -1251,8 +1273,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
1251 struct nfs4_opendata *data = calldata; 1273 struct nfs4_opendata *data = calldata;
1252 1274
1253 data->rpc_status = task->tk_status; 1275 data->rpc_status = task->tk_status;
1254 if (RPC_ASSASSINATED(task))
1255 return;
1256 if (data->rpc_status == 0) { 1276 if (data->rpc_status == 0) {
1257 memcpy(data->o_res.stateid.data, data->c_res.stateid.data, 1277 memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
1258 sizeof(data->o_res.stateid.data)); 1278 sizeof(data->o_res.stateid.data));
@@ -1352,13 +1372,13 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
1352 } 1372 }
1353 /* Update sequence id. */ 1373 /* Update sequence id. */
1354 data->o_arg.id = sp->so_owner_id.id; 1374 data->o_arg.id = sp->so_owner_id.id;
1355 data->o_arg.clientid = sp->so_client->cl_clientid; 1375 data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
1356 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { 1376 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
1357 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; 1377 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
1358 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 1378 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
1359 } 1379 }
1360 data->timestamp = jiffies; 1380 data->timestamp = jiffies;
1361 if (nfs4_setup_sequence(data->o_arg.server->nfs_client, 1381 if (nfs4_setup_sequence(data->o_arg.server,
1362 &data->o_arg.seq_args, 1382 &data->o_arg.seq_args,
1363 &data->o_res.seq_res, 1, task)) 1383 &data->o_res.seq_res, 1, task))
1364 return; 1384 return;
@@ -1381,11 +1401,9 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
1381 1401
1382 data->rpc_status = task->tk_status; 1402 data->rpc_status = task->tk_status;
1383 1403
1384 nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, 1404 if (!nfs4_sequence_done(task, &data->o_res.seq_res))
1385 task->tk_status);
1386
1387 if (RPC_ASSASSINATED(task))
1388 return; 1405 return;
1406
1389 if (task->tk_status == 0) { 1407 if (task->tk_status == 0) {
1390 switch (data->o_res.f_attr->mode & S_IFMT) { 1408 switch (data->o_res.f_attr->mode & S_IFMT) {
1391 case S_IFREG: 1409 case S_IFREG:
@@ -1648,7 +1666,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1648 if (path->dentry->d_inode != NULL) 1666 if (path->dentry->d_inode != NULL)
1649 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); 1667 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
1650 status = -ENOMEM; 1668 status = -ENOMEM;
1651 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); 1669 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL);
1652 if (opendata == NULL) 1670 if (opendata == NULL)
1653 goto err_put_state_owner; 1671 goto err_put_state_owner;
1654 1672
@@ -1659,15 +1677,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1659 if (status != 0) 1677 if (status != 0)
1660 goto err_opendata_put; 1678 goto err_opendata_put;
1661 1679
1662 if (opendata->o_arg.open_flags & O_EXCL)
1663 nfs4_exclusive_attrset(opendata, sattr);
1664
1665 state = nfs4_opendata_to_nfs4_state(opendata); 1680 state = nfs4_opendata_to_nfs4_state(opendata);
1666 status = PTR_ERR(state); 1681 status = PTR_ERR(state);
1667 if (IS_ERR(state)) 1682 if (IS_ERR(state))
1668 goto err_opendata_put; 1683 goto err_opendata_put;
1669 if (server->caps & NFS_CAP_POSIX_LOCK) 1684 if (server->caps & NFS_CAP_POSIX_LOCK)
1670 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1685 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1686
1687 if (opendata->o_arg.open_flags & O_EXCL) {
1688 nfs4_exclusive_attrset(opendata, sattr);
1689
1690 nfs_fattr_init(opendata->o_res.f_attr);
1691 status = nfs4_do_setattr(state->inode, cred,
1692 opendata->o_res.f_attr, sattr,
1693 state);
1694 if (status == 0)
1695 nfs_setattr_update_inode(state->inode, sattr);
1696 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
1697 }
1671 nfs4_opendata_put(opendata); 1698 nfs4_opendata_put(opendata);
1672 nfs4_put_state_owner(sp); 1699 nfs4_put_state_owner(sp);
1673 *res = state; 1700 *res = state;
@@ -1760,7 +1787,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1760 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { 1787 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
1761 /* Use that stateid */ 1788 /* Use that stateid */
1762 } else if (state != NULL) { 1789 } else if (state != NULL) {
1763 nfs4_copy_stateid(&arg.stateid, state, current->files); 1790 nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
1764 } else 1791 } else
1765 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1792 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1766 1793
@@ -1825,8 +1852,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1825 struct nfs4_state *state = calldata->state; 1852 struct nfs4_state *state = calldata->state;
1826 struct nfs_server *server = NFS_SERVER(calldata->inode); 1853 struct nfs_server *server = NFS_SERVER(calldata->inode);
1827 1854
1828 nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); 1855 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
1829 if (RPC_ASSASSINATED(task))
1830 return; 1856 return;
1831 /* hmm. we are done with the inode, and in the process of freeing 1857 /* hmm. we are done with the inode, and in the process of freeing
1832 * the state_owner. we keep this around to process errors 1858 * the state_owner. we keep this around to process errors
@@ -1890,7 +1916,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1890 1916
1891 nfs_fattr_init(calldata->res.fattr); 1917 nfs_fattr_init(calldata->res.fattr);
1892 calldata->timestamp = jiffies; 1918 calldata->timestamp = jiffies;
1893 if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, 1919 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
1894 &calldata->arg.seq_args, &calldata->res.seq_res, 1920 &calldata->arg.seq_args, &calldata->res.seq_res,
1895 1, task)) 1921 1, task))
1896 return; 1922 return;
@@ -1914,7 +1940,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1914 * 1940 *
1915 * NOTE: Caller must be holding the sp->so_owner semaphore! 1941 * NOTE: Caller must be holding the sp->so_owner semaphore!
1916 */ 1942 */
1917int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) 1943int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
1918{ 1944{
1919 struct nfs_server *server = NFS_SERVER(state->inode); 1945 struct nfs_server *server = NFS_SERVER(state->inode);
1920 struct nfs4_closedata *calldata; 1946 struct nfs4_closedata *calldata;
@@ -1933,7 +1959,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1933 }; 1959 };
1934 int status = -ENOMEM; 1960 int status = -ENOMEM;
1935 1961
1936 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 1962 calldata = kzalloc(sizeof(*calldata), gfp_mask);
1937 if (calldata == NULL) 1963 if (calldata == NULL)
1938 goto out; 1964 goto out;
1939 calldata->inode = state->inode; 1965 calldata->inode = state->inode;
@@ -1941,7 +1967,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1941 calldata->arg.fh = NFS_FH(state->inode); 1967 calldata->arg.fh = NFS_FH(state->inode);
1942 calldata->arg.stateid = &state->open_stateid; 1968 calldata->arg.stateid = &state->open_stateid;
1943 /* Serialization for the sequence id */ 1969 /* Serialization for the sequence id */
1944 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1970 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
1945 if (calldata->arg.seqid == NULL) 1971 if (calldata->arg.seqid == NULL)
1946 goto out_free_calldata; 1972 goto out_free_calldata;
1947 calldata->arg.fmode = 0; 1973 calldata->arg.fmode = 0;
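
nfs4_do_close(), nfs4_opendata_alloc() and nfs_alloc_seqid() all grow a gfp_t parameter so the caller's context picks the allocation mode; going by the GFP_NOFS used in nfs4_open_recoverdata_alloc() above versus the GFP_KERNEL in _nfs4_do_open(), the likely intent is to keep state-recovery and close paths from recursing into filesystem writeback. Threading an allocation-flags parameter, in miniature (the gfp_t definitions below are stand-ins, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned gfp_t;                /* stand-in, not the kernel type */
    #define GFP_KERNEL 0x1
    #define GFP_NOFS   0x2

    /* Models nfs_alloc_seqid(counter, gfp_mask): the mask is supplied by
     * the caller rather than hard-coded at the allocation site. */
    static void *alloc_seqid(gfp_t gfp_mask)
    {
        (void)gfp_mask;                    /* kernel: kmalloc(..., gfp_mask) */
        return malloc(16);
    }

    static int do_close(gfp_t gfp_mask)
    {
        void *calldata = malloc(32);       /* kernel: kzalloc(..., gfp_mask) */
        void *seqid;

        if (calldata == NULL)
            return -12;                    /* -ENOMEM */
        seqid = alloc_seqid(gfp_mask);     /* same mask threaded down */
        if (seqid == NULL) {
            free(calldata);
            return -12;
        }
        free(seqid);
        free(calldata);
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", do_close(GFP_NOFS), do_close(GFP_KERNEL));
        return 0;
    }
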
@@ -2010,7 +2036,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2010 struct rpc_cred *cred; 2036 struct rpc_cred *cred;
2011 struct nfs4_state *state; 2037 struct nfs4_state *state;
2012 struct dentry *res; 2038 struct dentry *res;
2013 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); 2039 int open_flags = nd->intent.open.flags;
2040 fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
2014 2041
2015 if (nd->flags & LOOKUP_CREATE) { 2042 if (nd->flags & LOOKUP_CREATE) {
2016 attr.ia_mode = nd->intent.open.create_mode; 2043 attr.ia_mode = nd->intent.open.create_mode;
@@ -2018,8 +2045,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2018 if (!IS_POSIXACL(dir)) 2045 if (!IS_POSIXACL(dir))
2019 attr.ia_mode &= ~current_umask(); 2046 attr.ia_mode &= ~current_umask();
2020 } else { 2047 } else {
2048 open_flags &= ~O_EXCL;
2021 attr.ia_valid = 0; 2049 attr.ia_valid = 0;
2022 BUG_ON(nd->intent.open.flags & O_CREAT); 2050 BUG_ON(open_flags & O_CREAT);
2023 } 2051 }
2024 2052
2025 cred = rpc_lookup_cred(); 2053 cred = rpc_lookup_cred();
@@ -2028,7 +2056,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2028 parent = dentry->d_parent; 2056 parent = dentry->d_parent;
2029 /* Protect against concurrent sillydeletes */ 2057 /* Protect against concurrent sillydeletes */
2030 nfs_block_sillyrename(parent); 2058 nfs_block_sillyrename(parent);
2031 state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred); 2059 state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
2032 put_rpccred(cred); 2060 put_rpccred(cred);
2033 if (IS_ERR(state)) { 2061 if (IS_ERR(state)) {
2034 if (PTR_ERR(state) == -ENOENT) { 2062 if (PTR_ERR(state) == -ENOENT) {
@@ -2247,8 +2275,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
2247out: 2275out:
2248 if (page) 2276 if (page)
2249 __free_page(page); 2277 __free_page(page);
2250 if (locations) 2278 kfree(locations);
2251 kfree(locations);
2252 return status; 2279 return status;
2253} 2280}
2254 2281
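
The hunk above drops an "if (locations)" guard before kfree(): kfree(NULL), like free(NULL), is defined to do nothing, so the test was redundant:

    #include <stdlib.h>

    int main(void)
    {
        char *locations = NULL;
        free(locations);   /* guaranteed no-op; kfree(NULL) behaves the same */
        return 0;
    }
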
@@ -2404,14 +2431,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs_fattr fattr;
 	struct nfs4_accessargs args = {
 		.fh = NFS_FH(inode),
 		.bitmask = server->attr_bitmask,
 	};
 	struct nfs4_accessres res = {
 		.server = server,
-		.fattr = &fattr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
@@ -2438,7 +2463,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 		if (mode & MAY_EXEC)
 			args.access |= NFS4_ACCESS_EXECUTE;
 	}
-	nfs_fattr_init(&fattr);
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		return -ENOMEM;
+
 	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	if (!status) {
 		entry->mask = 0;
@@ -2448,8 +2477,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 			entry->mask |= MAY_WRITE;
 		if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
 			entry->mask |= MAY_EXEC;
-		nfs_refresh_inode(inode, &fattr);
+		nfs_refresh_inode(inode, res.fattr);
 	}
+	nfs_free_fattr(res.fattr);
 	return status;
 }
 
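
struct nfs_fattr is a sizable structure to keep on the kernel stack, so this change (and the matching ones in _nfs4_proc_remove(), _nfs4_proc_rename() and _nfs4_proc_link() below) moves it to the heap with nfs_alloc_fattr() and frees it on every exit path. The resulting shape, sketched with a hypothetical demo_rpc() standing in for nfs4_call_sync():

	struct nfs_fattr *fattr;
	int status = -ENOMEM;

	fattr = nfs_alloc_fattr();	/* kmalloc-backed, may fail */
	if (fattr == NULL)
		goto out;
	status = demo_rpc(fattr);	/* placeholder for the real RPC */
	nfs_free_fattr(fattr);		/* NULL-safe kfree() wrapper */
out:
	return status;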
@@ -2562,13 +2592,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	}
 	d_add(dentry, igrab(state->inode));
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	if (flags & O_EXCL) {
-		struct nfs_fattr fattr;
-		status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
-		if (status == 0)
-			nfs_setattr_update_inode(state->inode, sattr);
-		nfs_post_op_update_inode(state->inode, &fattr);
-	}
 	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
 		status = nfs4_intent_set_file(nd, &path, state, fmode);
 	else
@@ -2596,14 +2619,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
+
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		goto out;
 
-	nfs_fattr_init(&res.dir_attr);
 	status = nfs4_call_sync(server, &msg, &args, &res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(dir, &res.dir_attr);
+		nfs_post_op_update_inode(dir, res.dir_attr);
 	}
+	nfs_free_fattr(res.dir_attr);
+out:
 	return status;
 }
 
@@ -2634,11 +2662,12 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
 	struct nfs_removeres *res = task->tk_msg.rpc_resp;
 
-	nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
-	nfs_post_op_update_inode(dir, &res->dir_attr);
+	nfs_post_op_update_inode(dir, res->dir_attr);
 	return 1;
 }
 
@@ -2653,29 +2682,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.new_name = new_name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs_fattr old_fattr, new_fattr;
 	struct nfs4_rename_res res = {
 		.server = server,
-		.old_fattr = &old_fattr,
-		.new_fattr = &new_fattr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
 
-	nfs_fattr_init(res.old_fattr);
-	nfs_fattr_init(res.new_fattr);
-	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
+	res.old_fattr = nfs_alloc_fattr();
+	res.new_fattr = nfs_alloc_fattr();
+	if (res.old_fattr == NULL || res.new_fattr == NULL)
+		goto out;
 
+	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
 		nfs_post_op_update_inode(old_dir, res.old_fattr);
 		update_changeattr(new_dir, &res.new_cinfo);
 		nfs_post_op_update_inode(new_dir, res.new_fattr);
 	}
+out:
+	nfs_free_fattr(res.new_fattr);
+	nfs_free_fattr(res.old_fattr);
 	return status;
 }
 
@@ -2702,28 +2733,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 		.name = name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs_fattr fattr, dir_attr;
 	struct nfs4_link_res res = {
 		.server = server,
-		.fattr = &fattr,
-		.dir_attr = &dir_attr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.fattr == NULL || res.dir_attr == NULL)
+		goto out;
 
-	nfs_fattr_init(res.fattr);
-	nfs_fattr_init(res.dir_attr);
 	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
 		nfs_post_op_update_inode(dir, res.dir_attr);
 		nfs_post_op_update_inode(inode, res.fattr);
 	}
-
+out:
+	nfs_free_fattr(res.dir_attr);
+	nfs_free_fattr(res.fattr);
 	return status;
 }
 
@@ -3075,7 +3108,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 
 	dprintk("--> %s\n", __func__);
 
-	nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, server->nfs_client);
@@ -3098,8 +3132,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
 
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3127,8 +3161,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
+
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
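
Across this file the pattern "nfs4_sequence_done(server, seq_res, status)" becomes "if (!nfs4_sequence_done(task, seq_res)) return". The helper now takes the rpc_task, and its boolean result says whether the session sequencing code fully consumed the reply; a false return means the call is being replayed on another slot, so the completion handler must not interpret the result. The calling convention, in outline:

	static int demo_done(struct rpc_task *task, struct nfs_read_data *data)
	{
		if (!nfs4_sequence_done(task, &data->res.seq_res))
			return -EAGAIN;	/* reply not consumed; RPC is resent */
		/* normal result processing runs only past this point */
		return 0;
	}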
@@ -3146,23 +3181,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
 
+struct nfs4_renewdata {
+	struct nfs_client *client;
+	unsigned long timestamp;
+};
+
 /*
  * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
  * standalone procedure for queueing an asynchronous RENEW.
  */
-static void nfs4_renew_release(void *data)
+static void nfs4_renew_release(void *calldata)
 {
-	struct nfs_client *clp = data;
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
 
 	if (atomic_read(&clp->cl_count) > 1)
 		nfs4_schedule_state_renewal(clp);
 	nfs_put_client(clp);
+	kfree(data);
 }
 
-static void nfs4_renew_done(struct rpc_task *task, void *data)
+static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_client *clp = data;
-	unsigned long timestamp = task->tk_start;
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
+	unsigned long timestamp = data->timestamp;
 
 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
@@ -3170,10 +3213,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
 		nfs4_schedule_state_recovery(clp);
 		return;
 	}
-	spin_lock(&clp->cl_lock);
-	if (time_before(clp->cl_last_renewal,timestamp))
-		clp->cl_last_renewal = timestamp;
-	spin_unlock(&clp->cl_lock);
+	do_renew_lease(clp, timestamp);
 }
 
 static const struct rpc_call_ops nfs4_renew_ops = {
@@ -3188,11 +3228,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 		.rpc_argp = clp,
 		.rpc_cred = cred,
 	};
+	struct nfs4_renewdata *data;
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
 		return -EIO;
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+	data->client = clp;
+	data->timestamp = jiffies;
 	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			&nfs4_renew_ops, clp);
+			&nfs4_renew_ops, data);
 }
 
 int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3208,10 +3254,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
 	if (status < 0)
 		return status;
-	spin_lock(&clp->cl_lock);
-	if (time_before(clp->cl_last_renewal,now))
-		clp->cl_last_renewal = now;
-	spin_unlock(&clp->cl_lock);
+	do_renew_lease(clp, now);
 	return 0;
 }
 
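
Both RENEW paths now funnel through do_renew_lease(), and the asynchronous one records the timestamp when the request is queued (in struct nfs4_renewdata) instead of reading task->tk_start afterwards, which could over-credit the lease if the task sat in the queue. The helper itself is outside this hunk; judging by the open-coded sequence it replaces, it is presumably equivalent to:

	static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
	{
		spin_lock(&clp->cl_lock);
		if (time_before(clp->cl_last_renewal, timestamp))
			clp->cl_last_renewal = timestamp;
		spin_unlock(&clp->cl_lock);
	}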
@@ -3432,9 +3475,11 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 }
 
 static int
-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
-	if (!clp || task->tk_status >= 0)
+	struct nfs_client *clp = server->nfs_client;
+
+	if (task->tk_status >= 0)
 		return 0;
 	switch(task->tk_status) {
 		case -NFS4ERR_ADMIN_REVOKED:
@@ -3466,8 +3511,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
-			if (server)
-				nfs_inc_server_stats(server, NFSIOS_DELAY);
+			nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
 		case -EKEYEXPIRED:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
@@ -3488,13 +3532,9 @@ do_state_recovery:
 	return -EAGAIN;
 }
 
-static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
-{
-	return _nfs4_async_handle_error(task, server, server->nfs_client, state);
-}
-
-int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
+int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
+		unsigned short port, struct rpc_cred *cred,
+		struct nfs4_setclientid_res *res)
 {
 	nfs4_verifier sc_verifier;
 	struct nfs4_setclientid setclientid = {
@@ -3504,7 +3544,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
 		.rpc_argp = &setclientid,
-		.rpc_resp = clp,
+		.rpc_resp = res,
 		.rpc_cred = cred,
 	};
 	__be32 *p;
@@ -3547,12 +3587,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 	return status;
 }
 
-static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
+static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+		struct nfs4_setclientid_res *arg,
+		struct rpc_cred *cred)
 {
 	struct nfs_fsinfo fsinfo;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
-		.rpc_argp = clp,
+		.rpc_argp = arg,
 		.rpc_resp = &fsinfo,
 		.rpc_cred = cred,
 	};
@@ -3570,12 +3612,14 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
 	return status;
 }
 
-int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+		struct nfs4_setclientid_res *arg,
+		struct rpc_cred *cred)
 {
 	long timeout = 0;
 	int err;
 	do {
-		err = _nfs4_proc_setclientid_confirm(clp, cred);
+		err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
 		switch (err) {
 			case 0:
 				return err;
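
SETCLIENTID replies used to be parsed straight into struct nfs_client; they now go into a separate nfs4_setclientid_res that is passed on to SETCLIENTID_CONFIRM, and the clientid is only committed to clp->cl_clientid after the confirm succeeds (see the nfs4_init_clientid() hunk in fs/nfs/nfs4state.c below). The result struct is defined in the nfs_xdr headers, presumably along these lines:

	struct nfs4_setclientid_res {
		u64		clientid;	/* returned by SETCLIENTID */
		nfs4_verifier	confirm;	/* echoed in SETCLIENTID_CONFIRM */
	};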
@@ -3603,8 +3647,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegreturndata *data = calldata;
 
-	nfs4_sequence_done(data->res.server, &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
 
 	switch (task->tk_status) {
 	case -NFS4ERR_STALE_STATEID:
@@ -3634,7 +3678,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
 	d_data = (struct nfs4_delegreturndata *)data;
 
-	if (nfs4_setup_sequence(d_data->res.server->nfs_client,
+	if (nfs4_setup_sequence(d_data->res.server,
 			&d_data->args.seq_args,
 			&d_data->res.seq_res, 1, task))
 		return;
@@ -3667,7 +3711,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	};
 	int status = 0;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
 	data->args.fhandle = &data->fh;
@@ -3823,7 +3867,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	struct nfs4_unlockdata *p;
 	struct inode *inode = lsp->ls_state->inode;
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_NOFS);
 	if (p == NULL)
 		return NULL;
 	p->arg.fh = NFS_FH(inode);
@@ -3854,9 +3898,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_unlockdata *calldata = data;
 
-	nfs4_sequence_done(calldata->server, &calldata->res.seq_res,
-			task->tk_status);
-	if (RPC_ASSASSINATED(task))
+	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
 		return;
 	switch (task->tk_status) {
 		case 0:
@@ -3889,7 +3931,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 		return;
 	}
 	calldata->timestamp = jiffies;
-	if (nfs4_setup_sequence(calldata->server->nfs_client,
+	if (nfs4_setup_sequence(calldata->server,
 				&calldata->arg.seq_args,
 				&calldata->res.seq_res, 1, task))
 		return;
@@ -3961,7 +4003,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags))
 		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
-	seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+	seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
 	status = -ENOMEM;
 	if (seqid == NULL)
 		goto out;
@@ -3989,22 +4031,23 @@ struct nfs4_lockdata {
 };
 
 static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
-		struct nfs_open_context *ctx, struct nfs4_lock_state *lsp)
+		struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
+		gfp_t gfp_mask)
 {
 	struct nfs4_lockdata *p;
 	struct inode *inode = lsp->ls_state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
 		return NULL;
 
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
-	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
+	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
 	if (p->arg.open_seqid == NULL)
 		goto out_free;
-	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
 	if (p->arg.lock_seqid == NULL)
 		goto out_free_seqid;
 	p->arg.lock_stateid = &lsp->ls_stateid;
@@ -4043,7 +4086,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	} else
 		data->arg.new_lock_owner = 0;
 	data->timestamp = jiffies;
-	if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args,
+	if (nfs4_setup_sequence(data->server,
+				&data->arg.seq_args,
 				&data->res.seq_res, 1, task))
 		return;
 	rpc_call_start(task);
@@ -4062,12 +4106,10 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 
 	dprintk("%s: begin!\n", __func__);
 
-	nfs4_sequence_done(data->server, &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
 
 	data->rpc_status = task->tk_status;
-	if (RPC_ASSASSINATED(task))
-		goto out;
 	if (data->arg.new_lock_owner != 0) {
 		if (data->rpc_status == 0)
 			nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
@@ -4158,7 +4200,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 
 	dprintk("%s: begin!\n", __func__);
 	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
-			fl->fl_u.nfs4_fl.owner);
+			fl->fl_u.nfs4_fl.owner,
+			recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
 	if (IS_SETLKW(cmd))
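
The allocation mask for lock setup now depends on why the lock is being taken: a new request from userspace may block in the allocator with GFP_KERNEL, but lock reclaim after a server reboot runs inside state recovery, where re-entering the filesystem from the allocator could deadlock, hence GFP_NOFS. The decision in the ternary above, spelled out:

	gfp_t gfp_mask;

	if (recovery_type == NFS_LOCK_NEW)
		gfp_mask = GFP_KERNEL;	/* ordinary fcntl()/flock() caller */
	else
		gfp_mask = GFP_NOFS;	/* reclaim/expired-lock recovery path */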
@@ -4384,6 +4427,34 @@ out:
 	return err;
 }
 
+static void nfs4_release_lockowner_release(void *calldata)
+{
+	kfree(calldata);
+}
+
+const struct rpc_call_ops nfs4_release_lockowner_ops = {
+	.rpc_release = nfs4_release_lockowner_release,
+};
+
+void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
+{
+	struct nfs_server *server = lsp->ls_state->owner->so_server;
+	struct nfs_release_lockowner_args *args;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
+	};
+
+	if (server->nfs_client->cl_mvops->minor_version != 0)
+		return;
+	args = kmalloc(sizeof(*args), GFP_NOFS);
+	if (!args)
+		return;
+	args->lock_owner.clientid = server->nfs_client->cl_clientid;
+	args->lock_owner.id = lsp->ls_id.id;
+	msg.rpc_argp = args;
+	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
+}
+
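
nfs4_release_lockowner() is new: when the last reference to a lock state goes away (see the nfs4_put_lock_state() hunk in fs/nfs/nfs4state.c below), the client tells the server it may forget that lockowner. RELEASE_LOCKOWNER only exists in minor version 0, hence the cl_mvops check, and the call is fire-and-forget: nothing waits for the result, and the argument buffer doubles as callback data so the .rpc_release hook frees it however the task ends. The shape of such a one-way call, with demo_* names standing in for the real ones:

	static void demo_release(void *calldata)
	{
		kfree(calldata);	/* runs on success, failure or task kill */
	}

	static const struct rpc_call_ops demo_ops = {
		.rpc_release = demo_release,
	};

	static void demo_notify(struct rpc_clnt *client, struct rpc_message *msg,
				void *args)
	{
		msg->rpc_argp = args;
		/* status is ignored; cleanup rides on the release hook */
		rpc_call_async(client, msg, 0, &demo_ops, args);
	}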
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
@@ -4571,7 +4642,8 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 		(struct nfs4_get_lease_time_data *)calldata;
 
 	dprintk("--> %s\n", __func__);
-	nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
+	if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
+		return;
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
 	case -NFS4ERR_GRACE:
@@ -4647,7 +4719,7 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
 	if (max_reqs != tbl->max_slots) {
 		ret = -ENOMEM;
 		new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
-				GFP_KERNEL);
+				GFP_NOFS);
 		if (!new)
 			goto out;
 		ret = 0;
@@ -4712,7 +4784,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
 
 	dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
 
-	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
+	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
 	if (!slot)
 		goto out;
 	ret = 0;
@@ -4761,17 +4833,10 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	struct nfs4_session *session;
 	struct nfs4_slot_table *tbl;
 
-	session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
+	session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
 	if (!session)
 		return NULL;
 
-	/*
-	 * The create session reply races with the server back
-	 * channel probe. Mark the client NFS_CS_SESSION_INITING
-	 * so that the client back channel can find the
-	 * nfs_client struct
-	 */
-	clp->cl_cons_state = NFS_CS_SESSION_INITING;
 	init_completion(&session->complete);
 
 	tbl = &session->fc_slot_table;
@@ -4784,6 +4849,8 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
 
+	session->session_state = 1<<NFS4_SESSION_INITING;
+
 	session->clp = clp;
 	return session;
 }
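
Session-initialization tracking moves from the nfs_client (cl_cons_state) to the session itself: nfs4_alloc_session() seeds session->session_state with the NFS4_SESSION_INITING bit, and nfs4_init_session() in the next hunk consumes it with an atomic test_and_clear_bit(), so the slot-table sizing runs exactly once per session without any extra locking:

	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
		return 0;	/* someone else already initialized this session */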
@@ -5000,6 +5067,10 @@ int nfs4_init_session(struct nfs_server *server)
 	if (!nfs4_has_session(clp))
 		return 0;
 
+	session = clp->cl_session;
+	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
+		return 0;
+
 	rsize = server->rsize;
 	if (rsize == 0)
 		rsize = NFS_MAX_FILE_IO_SIZE;
@@ -5007,7 +5078,6 @@ int nfs4_init_session(struct nfs_server *server)
 	if (wsize == 0)
 		wsize = NFS_MAX_FILE_IO_SIZE;
 
-	session = clp->cl_session;
 	session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
 	session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
 
@@ -5020,69 +5090,70 @@ int nfs4_init_session(struct nfs_server *server)
 /*
  * Renew the cl_session lease.
  */
-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
-{
+struct nfs4_sequence_data {
+	struct nfs_client *clp;
 	struct nfs4_sequence_args args;
 	struct nfs4_sequence_res res;
-
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
-		.rpc_cred = cred,
-	};
-
-	args.sa_cache_this = 0;
-
-	return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
-			&res, args.sa_cache_this, 1);
-}
+};
 
 static void nfs41_sequence_release(void *data)
 {
-	struct nfs_client *clp = (struct nfs_client *)data;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 
 	if (atomic_read(&clp->cl_count) > 1)
 		nfs4_schedule_state_renewal(clp);
 	nfs_put_client(clp);
+	kfree(calldata);
+}
+
+static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case -NFS4ERR_DELAY:
+	case -EKEYEXPIRED:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	default:
+		nfs4_schedule_state_recovery(clp);
+	}
+	return 0;
 }
 
 static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_client *clp = (struct nfs_client *)data;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 
-	nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status);
+	if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
+		return;
 
 	if (task->tk_status < 0) {
 		dprintk("%s ERROR %d\n", __func__, task->tk_status);
 		if (atomic_read(&clp->cl_count) == 1)
 			goto out;
 
-		if (_nfs4_async_handle_error(task, NULL, clp, NULL)
-				== -EAGAIN) {
-			nfs_restart_rpc(task, clp);
+		if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
 			return;
 		}
 	}
 	dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
 out:
-	kfree(task->tk_msg.rpc_argp);
-	kfree(task->tk_msg.rpc_resp);
-
 	dprintk("<-- %s\n", __func__);
 }
 
 static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_client *clp;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 	struct nfs4_sequence_args *args;
 	struct nfs4_sequence_res *res;
 
-	clp = (struct nfs_client *)data;
 	args = task->tk_msg.rpc_argp;
 	res = task->tk_msg.rpc_resp;
 
-	if (nfs4_setup_sequence(clp, args, res, 0, task))
+	if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5093,32 +5164,67 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
 	.rpc_release = nfs41_sequence_release,
 };
 
-static int nfs41_proc_async_sequence(struct nfs_client *clp,
-		struct rpc_cred *cred)
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
 {
-	struct nfs4_sequence_args *args;
-	struct nfs4_sequence_res *res;
+	struct nfs4_sequence_data *calldata;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
 		.rpc_cred = cred,
 	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs41_sequence_ops,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
+	};
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
-		return -EIO;
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	res = kzalloc(sizeof(*res), GFP_KERNEL);
-	if (!args || !res) {
-		kfree(args);
-		kfree(res);
+		return ERR_PTR(-EIO);
+	calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
+	if (calldata == NULL) {
 		nfs_put_client(clp);
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
-	msg.rpc_argp = args;
-	msg.rpc_resp = res;
+	calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	msg.rpc_argp = &calldata->args;
+	msg.rpc_resp = &calldata->res;
+	calldata->clp = clp;
+	task_setup_data.callback_data = calldata;
 
-	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			&nfs41_sequence_ops, (void *)clp);
+	return rpc_run_task(&task_setup_data);
+}
+
+static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct rpc_task *task;
+	int ret = 0;
+
+	task = _nfs41_proc_sequence(clp, cred);
+	if (IS_ERR(task))
+		ret = PTR_ERR(task);
+	else
+		rpc_put_task(task);
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
+}
+
+static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct rpc_task *task;
+	int ret;
+
+	task = _nfs41_proc_sequence(clp, cred);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	ret = rpc_wait_for_completion_task(task);
+	if (!ret)
+		ret = task->tk_status;
+	rpc_put_task(task);
+out:
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
 }
 
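
The SEQUENCE lease-renewal machinery is rebuilt around a single _nfs41_proc_sequence() that returns the rpc_task itself instead of a status. The async wrapper simply drops its task reference, while the new synchronous nfs4_proc_sequence() waits for completion and then reads tk_status; both share the same callback ops, and the per-call nfs4_sequence_data is now freed in .rpc_release rather than in the completion callback, which keeps the argument and result buffers alive across a restarted task. The sync-over-async wrapper pattern in isolation:

	static int demo_call_sync(struct rpc_task *task)
	{
		int ret;

		if (IS_ERR(task))
			return PTR_ERR(task);
		ret = rpc_wait_for_completion_task(task); /* interruptible */
		if (ret == 0)
			ret = task->tk_status;	/* final RPC status */
		rpc_put_task(task);		/* drop our reference */
		return ret;
	}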
 struct nfs4_reclaim_complete_data {
@@ -5132,13 +5238,31 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
 	struct nfs4_reclaim_complete_data *calldata = data;
 
 	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-	if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
+	if (nfs41_setup_sequence(calldata->clp->cl_session,
+			&calldata->arg.seq_args,
 			&calldata->res.seq_res, 0, task))
 		return;
 
 	rpc_call_start(task);
 }
 
+static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case 0:
+	case -NFS4ERR_COMPLETE_ALREADY:
+	case -NFS4ERR_WRONG_CRED: /* What to do here? */
+		break;
+	case -NFS4ERR_DELAY:
+	case -EKEYEXPIRED:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	default:
+		nfs4_schedule_state_recovery(clp);
+	}
+	return 0;
+}
+
 static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_reclaim_complete_data *calldata = data;
@@ -5146,32 +5270,13 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
 	struct nfs4_sequence_res *res = &calldata->res.seq_res;
 
 	dprintk("--> %s\n", __func__);
-	nfs41_sequence_done(clp, res, task->tk_status);
-	switch (task->tk_status) {
-	case 0:
-	case -NFS4ERR_COMPLETE_ALREADY:
-		break;
-	case -NFS4ERR_BADSESSION:
-	case -NFS4ERR_DEADSESSION:
-		/*
-		 * Handle the session error, but do not retry the operation, as
-		 * we have no way of telling whether the clientid had to be
-		 * reset before we got our reply. If reset, a new wave of
-		 * reclaim operations will follow, containing their own reclaim
-		 * complete. We don't want our retry to get on the way of
-		 * recovery by incorrectly indicating to the server that we're
-		 * done reclaiming state since the process had to be restarted.
-		 */
-		_nfs4_async_handle_error(task, NULL, clp, NULL);
-		break;
-	default:
-		if (_nfs4_async_handle_error(
-				task, NULL, clp, NULL) == -EAGAIN) {
-			rpc_restart_call_prepare(task);
-			return;
-		}
-	}
+	if (!nfs41_sequence_done(task, res))
+		return;
 
+	if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return;
+	}
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -5207,7 +5312,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 	int status = -ENOMEM;
 
 	dprintk("--> %s\n", __func__);
-	calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
 	if (calldata == NULL)
 		goto out;
 	calldata->clp = clp;
@@ -5285,28 +5390,30 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 };
 #endif
 
-/*
- * Per minor version reboot and network partition recovery ops
- */
-
-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
-	&nfs40_reboot_recovery_ops,
-#if defined(CONFIG_NFS_V4_1)
-	&nfs41_reboot_recovery_ops,
-#endif
-};
-
-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
-	&nfs40_nograce_recovery_ops,
-#if defined(CONFIG_NFS_V4_1)
-	&nfs41_nograce_recovery_ops,
-#endif
-};
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
+	.minor_version = 0,
+	.call_sync = _nfs4_call_sync,
+	.validate_stateid = nfs4_validate_delegation_stateid,
+	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
+	.state_renewal_ops = &nfs40_state_renewal_ops,
+};
 
-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
-	&nfs40_state_renewal_ops,
 #if defined(CONFIG_NFS_V4_1)
-	&nfs41_state_renewal_ops,
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
+	.minor_version = 1,
+	.call_sync = _nfs4_call_sync_session,
+	.validate_stateid = nfs41_validate_delegation_stateid,
+	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+	.state_renewal_ops = &nfs41_state_renewal_ops,
+};
 #endif
+
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
+	[0] = &nfs_v4_0_minor_ops,
+#if defined(CONFIG_NFS_V4_1)
+	[1] = &nfs_v4_1_minor_ops,
+#endif
 };
 
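
The three parallel per-minor-version arrays (reboot recovery, no-grace recovery, state renewal), each indexed by clp->cl_minorversion, collapse into a single nfs4_minor_version_ops table that is looked up once and cached on the client as clp->cl_mvops. Consumers change from an array lookup per ops family to one pointer chase, as the nfs4renewd.c and nfs4state.c hunks below show:

	/* before: one array per ops family, indexed at every use */
	ops = nfs4_state_renewal_ops[clp->cl_minorversion];

	/* after: a single ops struct hangs off the client */
	ops = clp->cl_mvops->state_renewal_ops;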
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index d87f10327b72..72b6c580af13 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -54,14 +54,14 @@
 void
 nfs4_renew_state(struct work_struct *work)
 {
-	struct nfs4_state_maintenance_ops *ops;
+	const struct nfs4_state_maintenance_ops *ops;
 	struct nfs_client *clp =
 		container_of(work, struct nfs_client, cl_renewd.work);
 	struct rpc_cred *cred;
 	long lease;
 	unsigned long last, now;
 
-	ops = nfs4_state_renewal_ops[clp->cl_minorversion];
+	ops = clp->cl_mvops->state_renewal_ops;
 	dprintk("%s: start\n", __func__);
 	/* Are there any active superblocks? */
 	if (list_empty(&clp->cl_superblocks))
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6c5ed51f105e..3e2f19b04c06 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list);
 
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
+	struct nfs4_setclientid_res clid;
 	unsigned short port;
 	int status;
 
@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 	if (clp->cl_addr.ss_family == AF_INET6)
 		port = nfs_callback_tcpport6;
 
-	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
-	if (status == 0)
-		status = nfs4_proc_setclientid_confirm(clp, cred);
-	if (status == 0)
-		nfs4_schedule_state_renewal(clp);
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+	if (status != 0)
+		goto out;
+	status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
+	if (status != 0)
+		goto out;
+	clp->cl_clientid = clid.clientid;
+	nfs4_schedule_state_renewal(clp);
+out:
	return status;
 }
 
@@ -140,7 +145,9 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
 	struct nfs4_session *ses = clp->cl_session;
 	int max_slots;
 
-	if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
+	if (ses == NULL)
+		return;
+	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
 		spin_lock(&ses->fc_slot_table.slot_tbl_lock);
 		max_slots = ses->fc_slot_table.max_slots;
 		while (max_slots--) {
@@ -162,7 +169,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
 	struct nfs4_slot_table *tbl = &ses->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
+	set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
 	if (tbl->highest_used_slotid != -1) {
 		INIT_COMPLETION(ses->complete);
 		spin_unlock(&tbl->slot_tbl_lock);
@@ -361,12 +368,11 @@ nfs4_alloc_state_owner(void)
 {
 	struct nfs4_state_owner *sp;
 
-	sp = kzalloc(sizeof(*sp),GFP_KERNEL);
+	sp = kzalloc(sizeof(*sp),GFP_NOFS);
 	if (!sp)
 		return NULL;
 	spin_lock_init(&sp->so_lock);
 	INIT_LIST_HEAD(&sp->so_states);
-	INIT_LIST_HEAD(&sp->so_delegations);
 	rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
 	sp->so_seqid.sequence = &sp->so_sequence;
 	spin_lock_init(&sp->so_sequence.lock);
@@ -379,7 +385,7 @@ static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
 	if (!RB_EMPTY_NODE(&sp->so_client_node)) {
-		struct nfs_client *clp = sp->so_client;
+		struct nfs_client *clp = sp->so_server->nfs_client;
 
 		spin_lock(&clp->cl_lock);
 		rb_erase(&sp->so_client_node, &clp->cl_state_owners);
@@ -401,7 +407,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	new = nfs4_alloc_state_owner();
 	if (new == NULL)
 		return NULL;
-	new->so_client = clp;
 	new->so_server = server;
 	new->so_cred = cred;
 	spin_lock(&clp->cl_lock);
@@ -418,7 +423,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
-	struct nfs_client *clp = sp->so_client;
+	struct nfs_client *clp = sp->so_server->nfs_client;
 	struct rpc_cred *cred = sp->so_cred;
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void)
 {
 	struct nfs4_state *state;
 
-	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	state = kzalloc(sizeof(*state), GFP_NOFS);
 	if (!state)
 		return NULL;
 	atomic_set(&state->count, 1);
@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state)
 /*
  * Close the current file.
  */
-static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
+static void __nfs4_close(struct path *path, struct nfs4_state *state,
+		fmode_t fmode, gfp_t gfp_mask, int wait)
 {
 	struct nfs4_state_owner *owner = state->owner;
 	int call_close = 0;
@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
 		nfs4_put_open_state(state);
 		nfs4_put_state_owner(owner);
 	} else
-		nfs4_do_close(path, state, wait);
+		nfs4_do_close(path, state, gfp_mask, wait);
 }
 
 void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, fmode, 0);
+	__nfs4_close(path, state, fmode, GFP_NOFS, 0);
 }
 
 void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, fmode, 1);
+	__nfs4_close(path, state, fmode, GFP_KERNEL, 1);
 }
 
 /*
@@ -596,12 +602,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
  * that is compatible with current->files
  */
 static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *pos;
 	list_for_each_entry(pos, &state->lock_states, ls_locks) {
-		if (pos->ls_owner != fl_owner)
+		if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
 			continue;
+		switch (pos->ls_owner.lo_type) {
+		case NFS4_POSIX_LOCK_TYPE:
+			if (pos->ls_owner.lo_u.posix_owner != fl_owner)
+				continue;
+			break;
+		case NFS4_FLOCK_LOCK_TYPE:
+			if (pos->ls_owner.lo_u.flock_owner != fl_pid)
+				continue;
+		}
 		atomic_inc(&pos->ls_count);
 		return pos;
 	}
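
Lock owners are no longer a bare fl_owner_t: ls_owner becomes a tagged union so that POSIX locks (keyed by fl_owner_t) and BSD flock() locks (keyed by process id) can coexist on the same nfs4_state, with NFS4_ANY_LOCK_TYPE letting stateid lookup match either kind. The owner type is defined in the NFS headers; reconstructed from its uses in this hunk, it presumably looks like:

	#define NFS4_ANY_LOCK_TYPE	(0U)
	#define NFS4_FLOCK_LOCK_TYPE	(1U << 0)
	#define NFS4_POSIX_LOCK_TYPE	(1U << 1)

	struct nfs4_lock_owner {
		unsigned int lo_type;		/* which lo_u member is valid */
		union {
			fl_owner_t posix_owner;	/* POSIX: per open file */
			pid_t flock_owner;	/* flock(): per process */
		} lo_u;
	};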
@@ -613,12 +628,12 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp;
-	struct nfs_client *clp = state->owner->so_client;
+	struct nfs_client *clp = state->owner->so_server->nfs_client;
 
-	lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
+	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
 	if (lsp == NULL)
 		return NULL;
 	rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
@@ -627,7 +642,18 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	lsp->ls_seqid.sequence = &lsp->ls_sequence;
 	atomic_set(&lsp->ls_count, 1);
 	lsp->ls_state = state;
-	lsp->ls_owner = fl_owner;
+	lsp->ls_owner.lo_type = type;
+	switch (lsp->ls_owner.lo_type) {
+	case NFS4_FLOCK_LOCK_TYPE:
+		lsp->ls_owner.lo_u.flock_owner = fl_pid;
+		break;
+	case NFS4_POSIX_LOCK_TYPE:
+		lsp->ls_owner.lo_u.posix_owner = fl_owner;
+		break;
+	default:
+		kfree(lsp);
+		return NULL;
+	}
 	spin_lock(&clp->cl_lock);
 	nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
 	spin_unlock(&clp->cl_lock);
@@ -637,7 +663,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 
 static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 {
-	struct nfs_client *clp = lsp->ls_state->owner->so_client;
+	struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
 
 	spin_lock(&clp->cl_lock);
 	nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
@@ -651,13 +677,13 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
  * exists, return an uninitialized one.
 *
 */
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp, *new = NULL;
 
 	for(;;) {
 		spin_lock(&state->state_lock);
-		lsp = __nfs4_find_lock_state(state, owner);
+		lsp = __nfs4_find_lock_state(state, owner, pid, type);
 		if (lsp != NULL)
 			break;
 		if (new != NULL) {
@@ -668,7 +694,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
 			break;
 		}
 		spin_unlock(&state->state_lock);
-		new = nfs4_alloc_lock_state(state, owner);
+		new = nfs4_alloc_lock_state(state, owner, pid, type);
 		if (new == NULL)
 			return NULL;
 	}
@@ -695,6 +721,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
+	if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
+		nfs4_release_lockowner(lsp);
 	nfs4_free_lock_state(lsp);
 }
 
@@ -722,7 +750,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 
 	if (fl->fl_ops != NULL)
 		return 0;
-	lsp = nfs4_get_lock_state(state, fl->fl_owner);
+	if (fl->fl_flags & FL_POSIX)
+		lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
+	else if (fl->fl_flags & FL_FLOCK)
+		lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE);
+	else
+		return -EINVAL;
 	if (lsp == NULL)
 		return -ENOMEM;
 	fl->fl_u.nfs4_fl.owner = lsp;
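
nfs4_set_lock_state() now dispatches on the lock flavor: FL_POSIX locks key the lock state on fl->fl_owner, FL_FLOCK locks on fl->fl_pid, and anything else is rejected with -EINVAL. This is what lets flock() map onto NFSv4 byte-range locking. Seen from userspace, both flavors reach the same ->lock() path on an NFSv4 mount (a hedged illustration; error handling omitted):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/file.h>

	void demo_lock(int fd)
	{
		/* l_start = 0, l_len = 0 covers the whole file */
		struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };

		fcntl(fd, F_SETLK, &fl);	/* POSIX lock: FL_POSIX, keyed on owner */
		flock(fd, LOCK_EX);		/* BSD lock: FL_FLOCK, keyed on pid */
	}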
@@ -734,7 +767,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
  * Byte-range lock aware utility to initialize the stateid of read/write
  * requests.
  */
-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
+void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
 {
 	struct nfs4_lock_state *lsp;
 	int seq;
@@ -747,18 +780,18 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
 		return;
 
 	spin_lock(&state->state_lock);
-	lsp = __nfs4_find_lock_state(state, fl_owner);
+	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
 	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
 		memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
 	spin_unlock(&state->state_lock);
 	nfs4_put_lock_state(lsp);
 }
 
-struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
+struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
 {
 	struct nfs_seqid *new;
 
-	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	new = kmalloc(sizeof(*new), gfp_mask);
 	if (new != NULL) {
 		new->sequence = counter;
 		INIT_LIST_HEAD(&new->list);
@@ -1035,11 +1068,11 @@ restart:
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_RECLAIM_BAD:
 		case -NFS4ERR_RECLAIM_CONFLICT:
-			nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+			nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
 			break;
 		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_NO_GRACE:
-			nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+			nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -1114,8 +1147,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
 		return;
 
-	nfs4_reclaim_complete(clp,
-		nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+	nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
 
 	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1205,8 +1237,8 @@ restart:
 static int nfs4_check_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
-	struct nfs4_state_maintenance_ops *ops =
-		nfs4_state_renewal_ops[clp->cl_minorversion];
+	const struct nfs4_state_maintenance_ops *ops =
+		clp->cl_mvops->state_renewal_ops;
 	int status = -NFS4ERR_EXPIRED;
 
 	/* Is the client already known to have an expired lease? */
@@ -1229,8 +1261,8 @@ out:
 static int nfs4_reclaim_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
-	struct nfs4_state_recovery_ops *ops =
-		nfs4_reboot_recovery_ops[clp->cl_minorversion];
+	const struct nfs4_state_recovery_ops *ops =
+		clp->cl_mvops->reboot_recovery_ops;
 	int status = -ENOENT;
 
 	cred = ops->get_clid_cred(clp);
@@ -1347,7 +1379,7 @@ static int nfs4_recall_slot(struct nfs_client *clp)
 
 	nfs4_begin_drain_session(clp);
 	new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
-			GFP_KERNEL);
+			GFP_NOFS);
 	if (!new)
 		return -ENOMEM;
 
@@ -1438,7 +1470,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 	/* First recover reboot state... */
 	if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 		status = nfs4_do_reclaim(clp,
-			nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+			clp->cl_mvops->reboot_recovery_ops);
1442 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 1474 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1443 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) 1475 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
1444 continue; 1476 continue;
@@ -1452,7 +1484,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1452 /* Now recover expired state... */ 1484 /* Now recover expired state... */
1453 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { 1485 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1454 status = nfs4_do_reclaim(clp, 1486 status = nfs4_do_reclaim(clp,
1455 nfs4_nograce_recovery_ops[clp->cl_minorversion]); 1487 clp->cl_mvops->nograce_recovery_ops);
1456 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 1488 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1457 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || 1489 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
1458 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1490 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
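
The nfs4state.c hunks above thread an (owner, pid, type) triple through lock-state lookup and allocation, so POSIX locks (keyed by fl_owner) and BSD flock() locks (keyed by fl_pid) each get their own nfs4_lock_state, and the server-side lockowner is released once the last reference drops. A minimal userspace sketch of the new dispatch in nfs4_set_lock_state(); the flag values and the lookup helper here are illustrative stand-ins, not the kernel's definitions:

#include <errno.h>
#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins; the real values live in the kernel headers. */
#define FL_POSIX 1
#define FL_FLOCK 2

enum lock_type { POSIX_LOCK_TYPE, FLOCK_LOCK_TYPE };

struct file_lock {
        unsigned int fl_flags;
        void *fl_owner;        /* owning files_struct, POSIX locks */
        int fl_pid;            /* owning pid, flock() locks        */
};

/* Stub for the lookup/allocation done by nfs4_get_lock_state(). */
static int get_lock_state(void *owner, int pid, enum lock_type type)
{
        printf("lock state keyed by %s\n",
               type == POSIX_LOCK_TYPE ? "fl_owner" : "fl_pid");
        return 0;
}

/* Mirrors the new dispatch: choose the key by lock flavour. */
static int set_lock_state(struct file_lock *fl)
{
        if (fl->fl_flags & FL_POSIX)
                return get_lock_state(fl->fl_owner, 0, POSIX_LOCK_TYPE);
        if (fl->fl_flags & FL_FLOCK)
                return get_lock_state(NULL, fl->fl_pid, FLOCK_LOCK_TYPE);
        return -EINVAL;
}

int main(void)
{
        struct file_lock posix = { .fl_flags = FL_POSIX };
        struct file_lock bsd   = { .fl_flags = FL_FLOCK, .fl_pid = 42 };

        posix.fl_owner = &posix;
        return set_lock_state(&posix) || set_lock_state(&bsd);
}
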
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 38f3b582e7c2..08ef91291132 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -202,14 +202,17 @@ static int nfs4_stat_to_errno(int);
202#define encode_link_maxsz (op_encode_hdr_maxsz + \ 202#define encode_link_maxsz (op_encode_hdr_maxsz + \
203 nfs4_name_maxsz) 203 nfs4_name_maxsz)
204#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) 204#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
205#define encode_lockowner_maxsz (7)
205#define encode_lock_maxsz (op_encode_hdr_maxsz + \ 206#define encode_lock_maxsz (op_encode_hdr_maxsz + \
206 7 + \ 207 7 + \
207 1 + encode_stateid_maxsz + 8) 208 1 + encode_stateid_maxsz + 1 + \
209 encode_lockowner_maxsz)
208#define decode_lock_denied_maxsz \ 210#define decode_lock_denied_maxsz \
209 (8 + decode_lockowner_maxsz) 211 (8 + decode_lockowner_maxsz)
210#define decode_lock_maxsz (op_decode_hdr_maxsz + \ 212#define decode_lock_maxsz (op_decode_hdr_maxsz + \
211 decode_lock_denied_maxsz) 213 decode_lock_denied_maxsz)
212#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) 214#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \
215 encode_lockowner_maxsz)
213#define decode_lockt_maxsz (op_decode_hdr_maxsz + \ 216#define decode_lockt_maxsz (op_decode_hdr_maxsz + \
214 decode_lock_denied_maxsz) 217 decode_lock_denied_maxsz)
215#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ 218#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \
@@ -217,6 +220,11 @@ static int nfs4_stat_to_errno(int);
217 4) 220 4)
218#define decode_locku_maxsz (op_decode_hdr_maxsz + \ 221#define decode_locku_maxsz (op_decode_hdr_maxsz + \
219 decode_stateid_maxsz) 222 decode_stateid_maxsz)
223#define encode_release_lockowner_maxsz \
224 (op_encode_hdr_maxsz + \
225 encode_lockowner_maxsz)
226#define decode_release_lockowner_maxsz \
227 (op_decode_hdr_maxsz)
220#define encode_access_maxsz (op_encode_hdr_maxsz + 1) 228#define encode_access_maxsz (op_encode_hdr_maxsz + 1)
221#define decode_access_maxsz (op_decode_hdr_maxsz + 2) 229#define decode_access_maxsz (op_decode_hdr_maxsz + 2)
222#define encode_symlink_maxsz (op_encode_hdr_maxsz + \ 230#define encode_symlink_maxsz (op_encode_hdr_maxsz + \
@@ -471,6 +479,12 @@ static int nfs4_stat_to_errno(int);
471 decode_sequence_maxsz + \ 479 decode_sequence_maxsz + \
472 decode_putfh_maxsz + \ 480 decode_putfh_maxsz + \
473 decode_locku_maxsz) 481 decode_locku_maxsz)
482#define NFS4_enc_release_lockowner_sz \
483 (compound_encode_hdr_maxsz + \
484 encode_lockowner_maxsz)
485#define NFS4_dec_release_lockowner_sz \
486 (compound_decode_hdr_maxsz + \
487 decode_lockowner_maxsz)
474#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ 488#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
475 encode_sequence_maxsz + \ 489 encode_sequence_maxsz + \
476 encode_putfh_maxsz + \ 490 encode_putfh_maxsz + \
@@ -744,7 +758,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
744 struct compound_hdr *hdr) 758 struct compound_hdr *hdr)
745{ 759{
746 __be32 *p; 760 __be32 *p;
747 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 761 struct rpc_auth *auth = req->rq_cred->cr_auth;
748 762
749 /* initialize running count of expected bytes in reply. 763 /* initialize running count of expected bytes in reply.
750 * NOTE: the replied tag SHOULD be the same as the one sent, 764
@@ -862,8 +876,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
862 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 876 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
863 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); 877 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
864 *p++ = cpu_to_be32(0); 878 *p++ = cpu_to_be32(0);
865 *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); 879 *p++ = cpu_to_be32(iap->ia_atime.tv_sec);
866 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); 880 *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
867 } 881 }
868 else if (iap->ia_valid & ATTR_ATIME) { 882 else if (iap->ia_valid & ATTR_ATIME) {
869 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 883 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -1042,6 +1056,17 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
1042 return fl->fl_end - fl->fl_start + 1; 1056 return fl->fl_end - fl->fl_start + 1;
1043} 1057}
1044 1058
1059static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
1060{
1061 __be32 *p;
1062
1063 p = reserve_space(xdr, 28);
1064 p = xdr_encode_hyper(p, lowner->clientid);
1065 *p++ = cpu_to_be32(16);
1066 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1067 xdr_encode_hyper(p, lowner->id);
1068}
1069
1045/* 1070/*
1046 * opcode,type,reclaim,offset,length,new_lock_owner = 32 1071 * opcode,type,reclaim,offset,length,new_lock_owner = 32
1047 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 1072 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
@@ -1058,14 +1083,11 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1058 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1083 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1059 *p = cpu_to_be32(args->new_lock_owner); 1084 *p = cpu_to_be32(args->new_lock_owner);
1060 if (args->new_lock_owner){ 1085 if (args->new_lock_owner){
1061 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); 1086 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1062 *p++ = cpu_to_be32(args->open_seqid->sequence->counter); 1087 *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
1063 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); 1088 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
1064 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); 1089 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
1065 p = xdr_encode_hyper(p, args->lock_owner.clientid); 1090 encode_lockowner(xdr, &args->lock_owner);
1066 *p++ = cpu_to_be32(16);
1067 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1068 xdr_encode_hyper(p, args->lock_owner.id);
1069 } 1091 }
1070 else { 1092 else {
1071 p = reserve_space(xdr, NFS4_STATEID_SIZE+4); 1093 p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
@@ -1080,15 +1102,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
1080{ 1102{
1081 __be32 *p; 1103 __be32 *p;
1082 1104
1083 p = reserve_space(xdr, 52); 1105 p = reserve_space(xdr, 24);
1084 *p++ = cpu_to_be32(OP_LOCKT); 1106 *p++ = cpu_to_be32(OP_LOCKT);
1085 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); 1107 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1086 p = xdr_encode_hyper(p, args->fl->fl_start); 1108 p = xdr_encode_hyper(p, args->fl->fl_start);
1087 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1109 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1088 p = xdr_encode_hyper(p, args->lock_owner.clientid); 1110 encode_lockowner(xdr, &args->lock_owner);
1089 *p++ = cpu_to_be32(16);
1090 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1091 xdr_encode_hyper(p, args->lock_owner.id);
1092 hdr->nops++; 1111 hdr->nops++;
1093 hdr->replen += decode_lockt_maxsz; 1112 hdr->replen += decode_lockt_maxsz;
1094} 1113}
@@ -1108,6 +1127,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1108 hdr->replen += decode_locku_maxsz; 1127 hdr->replen += decode_locku_maxsz;
1109} 1128}
1110 1129
1130static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
1131{
1132 __be32 *p;
1133
1134 p = reserve_space(xdr, 4);
1135 *p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
1136 encode_lockowner(xdr, lowner);
1137 hdr->nops++;
1138 hdr->replen += decode_release_lockowner_maxsz;
1139}
1140
1111static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1141static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1112{ 1142{
1113 int len = name->len; 1143 int len = name->len;
@@ -1172,7 +1202,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1172 break; 1202 break;
1173 default: 1203 default:
1174 clp = arg->server->nfs_client; 1204 clp = arg->server->nfs_client;
1175 if (clp->cl_minorversion > 0) { 1205 if (clp->cl_mvops->minor_version > 0) {
1176 if (nfs4_has_persistent_session(clp)) { 1206 if (nfs4_has_persistent_session(clp)) {
1177 *p = cpu_to_be32(NFS4_CREATE_GUARDED); 1207 *p = cpu_to_be32(NFS4_CREATE_GUARDED);
1178 encode_attrs(xdr, arg->u.attrs, arg->server); 1208 encode_attrs(xdr, arg->u.attrs, arg->server);
@@ -1324,14 +1354,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1324 hdr->replen += decode_putrootfh_maxsz; 1354 hdr->replen += decode_putrootfh_maxsz;
1325} 1355}
1326 1356
1327static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1357static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
1328{ 1358{
1329 nfs4_stateid stateid; 1359 nfs4_stateid stateid;
1330 __be32 *p; 1360 __be32 *p;
1331 1361
1332 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1362 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1333 if (ctx->state != NULL) { 1363 if (ctx->state != NULL) {
1334 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); 1364 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1335 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1365 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1336 } else 1366 } else
1337 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1367 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1344,7 +1374,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1344 p = reserve_space(xdr, 4); 1374 p = reserve_space(xdr, 4);
1345 *p = cpu_to_be32(OP_READ); 1375 *p = cpu_to_be32(OP_READ);
1346 1376
1347 encode_stateid(xdr, args->context); 1377 encode_stateid(xdr, args->context, args->lock_context);
1348 1378
1349 p = reserve_space(xdr, 12); 1379 p = reserve_space(xdr, 12);
1350 p = xdr_encode_hyper(p, args->offset); 1380 p = xdr_encode_hyper(p, args->offset);
@@ -1504,14 +1534,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1504 hdr->replen += decode_setclientid_maxsz; 1534 hdr->replen += decode_setclientid_maxsz;
1505} 1535}
1506 1536
1507static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) 1537static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
1508{ 1538{
1509 __be32 *p; 1539 __be32 *p;
1510 1540
1511 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); 1541 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
1512 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); 1542 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
1513 p = xdr_encode_hyper(p, client_state->cl_clientid); 1543 p = xdr_encode_hyper(p, arg->clientid);
1514 xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1544 xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
1515 hdr->nops++; 1545 hdr->nops++;
1516 hdr->replen += decode_setclientid_confirm_maxsz; 1546 hdr->replen += decode_setclientid_confirm_maxsz;
1517} 1547}
@@ -1523,7 +1553,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1523 p = reserve_space(xdr, 4); 1553 p = reserve_space(xdr, 4);
1524 *p = cpu_to_be32(OP_WRITE); 1554 *p = cpu_to_be32(OP_WRITE);
1525 1555
1526 encode_stateid(xdr, args->context); 1556 encode_stateid(xdr, args->context, args->lock_context);
1527 1557
1528 p = reserve_space(xdr, 16); 1558 p = reserve_space(xdr, 16);
1529 p = xdr_encode_hyper(p, args->offset); 1559 p = xdr_encode_hyper(p, args->offset);
@@ -1704,7 +1734,7 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
1704{ 1734{
1705#if defined(CONFIG_NFS_V4_1) 1735#if defined(CONFIG_NFS_V4_1)
1706 if (args->sa_session) 1736 if (args->sa_session)
1707 return args->sa_session->clp->cl_minorversion; 1737 return args->sa_session->clp->cl_mvops->minor_version;
1708#endif /* CONFIG_NFS_V4_1 */ 1738#endif /* CONFIG_NFS_V4_1 */
1709 return 0; 1739 return 0;
1710} 1740}
@@ -2048,6 +2078,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
2048 return 0; 2078 return 0;
2049} 2079}
2050 2080
2081static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
2082{
2083 struct xdr_stream xdr;
2084 struct compound_hdr hdr = {
2085 .minorversion = 0,
2086 };
2087
2088 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2089 encode_compound_hdr(&xdr, req, &hdr);
2090 encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
2091 encode_nops(&hdr);
2092 return 0;
2093}
2094
2051/* 2095/*
2052 * Encode a READLINK request 2096 * Encode a READLINK request
2053 */ 2097 */
@@ -2324,7 +2368,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
2324/* 2368/*
2325 * a SETCLIENTID_CONFIRM request 2369 * a SETCLIENTID_CONFIRM request
2326 */ 2370 */
2327static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) 2371static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
2328{ 2372{
2329 struct xdr_stream xdr; 2373 struct xdr_stream xdr;
2330 struct compound_hdr hdr = { 2374 struct compound_hdr hdr = {
@@ -2334,7 +2378,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
2334 2378
2335 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2379 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2336 encode_compound_hdr(&xdr, req, &hdr); 2380 encode_compound_hdr(&xdr, req, &hdr);
2337 encode_setclientid_confirm(&xdr, clp, &hdr); 2381 encode_setclientid_confirm(&xdr, arg, &hdr);
2338 encode_putrootfh(&xdr, &hdr); 2382 encode_putrootfh(&xdr, &hdr);
2339 encode_fsinfo(&xdr, lease_bitmap, &hdr); 2383 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2340 encode_nops(&hdr); 2384 encode_nops(&hdr);
@@ -2395,7 +2439,7 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
2395{ 2439{
2396 struct xdr_stream xdr; 2440 struct xdr_stream xdr;
2397 struct compound_hdr hdr = { 2441 struct compound_hdr hdr = {
2398 .minorversion = args->client->cl_minorversion, 2442 .minorversion = args->client->cl_mvops->minor_version,
2399 }; 2443 };
2400 2444
2401 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2445 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2413,7 +2457,7 @@ static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
2413{ 2457{
2414 struct xdr_stream xdr; 2458 struct xdr_stream xdr;
2415 struct compound_hdr hdr = { 2459 struct compound_hdr hdr = {
2416 .minorversion = args->client->cl_minorversion, 2460 .minorversion = args->client->cl_mvops->minor_version,
2417 }; 2461 };
2418 2462
2419 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2463 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2431,7 +2475,7 @@ static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
2431{ 2475{
2432 struct xdr_stream xdr; 2476 struct xdr_stream xdr;
2433 struct compound_hdr hdr = { 2477 struct compound_hdr hdr = {
2434 .minorversion = session->clp->cl_minorversion, 2478 .minorversion = session->clp->cl_mvops->minor_version,
2435 }; 2479 };
2436 2480
2437 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2481 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -3973,6 +4017,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
3973 return status; 4017 return status;
3974} 4018}
3975 4019
4020static int decode_release_lockowner(struct xdr_stream *xdr)
4021{
4022 return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
4023}
4024
3976static int decode_lookup(struct xdr_stream *xdr) 4025static int decode_lookup(struct xdr_stream *xdr)
3977{ 4026{
3978 return decode_op_hdr(xdr, OP_LOOKUP); 4027 return decode_op_hdr(xdr, OP_LOOKUP);
@@ -4397,7 +4446,7 @@ out_overflow:
4397 return -EIO; 4446 return -EIO;
4398} 4447}
4399 4448
4400static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) 4449static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
4401{ 4450{
4402 __be32 *p; 4451 __be32 *p;
4403 uint32_t opnum; 4452 uint32_t opnum;
@@ -4417,8 +4466,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
4417 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); 4466 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
4418 if (unlikely(!p)) 4467 if (unlikely(!p))
4419 goto out_overflow; 4468 goto out_overflow;
4420 p = xdr_decode_hyper(p, &clp->cl_clientid); 4469 p = xdr_decode_hyper(p, &res->clientid);
4421 memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); 4470 memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
4422 } else if (nfserr == NFSERR_CLID_INUSE) { 4471 } else if (nfserr == NFSERR_CLID_INUSE) {
4423 uint32_t len; 4472 uint32_t len;
4424 4473
@@ -4815,7 +4864,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
4815 goto out; 4864 goto out;
4816 if ((status = decode_remove(&xdr, &res->cinfo)) != 0) 4865 if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
4817 goto out; 4866 goto out;
4818 decode_getfattr(&xdr, &res->dir_attr, res->server, 4867 decode_getfattr(&xdr, res->dir_attr, res->server,
4819 !RPC_IS_ASYNC(rqstp->rq_task)); 4868 !RPC_IS_ASYNC(rqstp->rq_task));
4820out: 4869out:
4821 return status; 4870 return status;
@@ -5259,6 +5308,19 @@ out:
5259 return status; 5308 return status;
5260} 5309}
5261 5310
5311static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5312{
5313 struct xdr_stream xdr;
5314 struct compound_hdr hdr;
5315 int status;
5316
5317 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
5318 status = decode_compound_hdr(&xdr, &hdr);
5319 if (!status)
5320 status = decode_release_lockowner(&xdr);
5321 return status;
5322}
5323
5262/* 5324/*
5263 * Decode READLINK response 5325 * Decode READLINK response
5264 */ 5326 */
@@ -5498,7 +5560,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5498 * Decode SETCLIENTID response 5560 * Decode SETCLIENTID response
5499 */ 5561 */
5500static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5562static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5501 struct nfs_client *clp) 5563 struct nfs4_setclientid_res *res)
5502{ 5564{
5503 struct xdr_stream xdr; 5565 struct xdr_stream xdr;
5504 struct compound_hdr hdr; 5566 struct compound_hdr hdr;
@@ -5507,7 +5569,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5507 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5569 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
5508 status = decode_compound_hdr(&xdr, &hdr); 5570 status = decode_compound_hdr(&xdr, &hdr);
5509 if (!status) 5571 if (!status)
5510 status = decode_setclientid(&xdr, clp); 5572 status = decode_setclientid(&xdr, res);
5511 return status; 5573 return status;
5512} 5574}
5513 5575
@@ -5866,6 +5928,7 @@ struct rpc_procinfo nfs4_procedures[] = {
5866 PROC(GETACL, enc_getacl, dec_getacl), 5928 PROC(GETACL, enc_getacl, dec_getacl),
5867 PROC(SETACL, enc_setacl, dec_setacl), 5929 PROC(SETACL, enc_setacl, dec_setacl),
5868 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 5930 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
5931 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
5869#if defined(CONFIG_NFS_V4_1) 5932#if defined(CONFIG_NFS_V4_1)
5870 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 5933 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
5871 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 5934 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
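
Worth noting in the nfs4xdr.c changes: the new encode_lockowner() helper centralizes the owner encoding that LOCK and LOCKT previously open-coded, and the new RELEASE_LOCKOWNER operation reuses it. The reserve_space(xdr, 28) matches the wire layout exactly: an 8-byte clientid, a 4-byte opaque length of 16, the fixed "lock id:" prefix (8 bytes), and an 8-byte id; encode_lockowner_maxsz is the same 28 bytes expressed as 7 XDR words. A self-contained sketch of that layout, doing the big-endian packing by hand rather than through the kernel's xdr_stream helpers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Encode 32-/64-bit values big-endian, as XDR requires. */
static uint8_t *enc32(uint8_t *p, uint32_t v)
{
        p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = (uint8_t)v;
        return p + 4;
}

static uint8_t *enc64(uint8_t *p, uint64_t v)
{
        p = enc32(p, (uint32_t)(v >> 32));
        return enc32(p, (uint32_t)v);
}

/* lock_owner4: clientid + opaque<> owner, where the owner string is
 * "lock id:" followed by a 64-bit id -- 28 bytes in total. */
static size_t encode_lockowner(uint8_t *p, uint64_t clientid, uint64_t id)
{
        uint8_t *start = p;

        p = enc64(p, clientid);      /*  8 bytes: clientid      */
        p = enc32(p, 16);            /*  4 bytes: opaque length */
        memcpy(p, "lock id:", 8);    /*  8 bytes: fixed prefix  */
        p = enc64(p + 8, id);        /*  8 bytes: owner id      */
        return (size_t)(p - start);
}

int main(void)
{
        uint8_t buf[28];
        size_t n = encode_lockowner(buf, 0x1122334455667788ULL, 7);

        assert(n == sizeof(buf));    /* matches reserve_space(xdr, 28) */
        printf("encoded %zu bytes\n", n);
        return 0;
}
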
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8c55b27c0de4..df101d9f546a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -105,7 +105,7 @@ static char nfs_root_name[256] __initdata = "";
105static __be32 servaddr __initdata = 0; 105static __be32 servaddr __initdata = 0;
106 106
107/* Name of directory to mount */ 107/* Name of directory to mount */
108static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, }; 108static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
109 109
110/* NFS-related data */ 110/* NFS-related data */
111static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ 111static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void)
488 */ 488 */
489static int __init root_nfs_get_handle(void) 489static int __init root_nfs_get_handle(void)
490{ 490{
491 struct nfs_fh fh;
492 struct sockaddr_in sin; 491 struct sockaddr_in sin;
493 unsigned int auth_flav_len = 0; 492 unsigned int auth_flav_len = 0;
494 struct nfs_mount_request request = { 493 struct nfs_mount_request request = {
@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void)
499 NFS_MNT3_VERSION : NFS_MNT_VERSION, 498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
500 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
501 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, 500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
502 .fh = &fh,
503 .auth_flav_len = &auth_flav_len, 501 .auth_flav_len = &auth_flav_len,
504 }; 502 };
505 int status; 503 int status = -ENOMEM;
506 504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
507 set_sockaddr(&sin, servaddr, htons(mount_port)); 508 set_sockaddr(&sin, servaddr, htons(mount_port));
508 status = nfs_mount(&request); 509 status = nfs_mount(&request);
509 if (status < 0) 510 if (status < 0)
510 printk(KERN_ERR "Root-NFS: Server returned error %d " 511 printk(KERN_ERR "Root-NFS: Server returned error %d "
511 "while mounting %s\n", status, nfs_export_path); 512 "while mounting %s\n", status, nfs_export_path);
512 else { 513 else {
513 nfs_data.root.size = fh.size; 514 nfs_data.root.size = request.fh->size;
514 memcpy(nfs_data.root.data, fh.data, fh.size); 515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
515 } 516 }
516 517 nfs_free_fhandle(request.fh);
518out:
517 return status; 519 return status;
518} 520}
519 521
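
The nfsroot.c hunk is one instance of a stack-footprint pattern repeated throughout this series: struct nfs_fh carries up to a 128-byte file handle, so instances move off the small fixed-size kernel stack and into nfs_alloc_fhandle()/nfs_free_fhandle(), with every exit funnelled through one label. A generic sketch of the shape, with plain calloc/free standing in for the NFS allocators and a stubbed fetch in place of nfs_mount():

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#define FHSIZE_MAX 128

struct fhandle {
        unsigned short size;
        unsigned char data[FHSIZE_MAX];
};

/* Stand-in for the MNT call that fills in the root file handle. */
static int fetch_handle(struct fhandle *fh)
{
        fh->size = 4;
        memcpy(fh->data, "root", 4);
        return 0;
}

/* Heap-allocate the ~130-byte struct instead of putting it on the
 * stack, and free it on the single exit path. */
static int get_root_handle(struct fhandle *out)
{
        struct fhandle *fh;
        int status = -ENOMEM;

        fh = calloc(1, sizeof(*fh));
        if (fh == NULL)
                goto out;
        status = fetch_handle(fh);
        if (status == 0)
                *out = *fh;
        free(fh);
out:
        return status;
}

int main(void)
{
        struct fhandle root;

        return get_root_handle(&root) ? 1 : 0;
}
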
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29d9d36cd5f4..919490232e17 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
60{ 60{
61 struct nfs_page *req; 61 struct nfs_page *req;
62 62
63 for (;;) { 63 /* try to allocate the request struct */
64 /* try to allocate the request struct */ 64 req = nfs_page_alloc();
65 req = nfs_page_alloc(); 65 if (req == NULL)
66 if (req != NULL) 66 return ERR_PTR(-ENOMEM);
67 break;
68
69 if (fatal_signal_pending(current))
70 return ERR_PTR(-ERESTARTSYS);
71 yield();
72 }
73 67
74 /* Initialize the request struct. Initially, we assume a 68 /* Initialize the request struct. Initially, we assume a
75 * long write-back delay. This will be adjusted in 69 * long write-back delay. This will be adjusted in
@@ -85,6 +79,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
85 req->wb_pgbase = offset; 79 req->wb_pgbase = offset;
86 req->wb_bytes = count; 80 req->wb_bytes = count;
87 req->wb_context = get_nfs_open_context(ctx); 81 req->wb_context = get_nfs_open_context(ctx);
82 req->wb_lock_context = nfs_get_lock_context(ctx);
88 kref_init(&req->wb_kref); 83 kref_init(&req->wb_kref);
89 return req; 84 return req;
90} 85}
@@ -147,11 +142,16 @@ void nfs_clear_request(struct nfs_page *req)
147{ 142{
148 struct page *page = req->wb_page; 143 struct page *page = req->wb_page;
149 struct nfs_open_context *ctx = req->wb_context; 144 struct nfs_open_context *ctx = req->wb_context;
145 struct nfs_lock_context *l_ctx = req->wb_lock_context;
150 146
151 if (page != NULL) { 147 if (page != NULL) {
152 page_cache_release(page); 148 page_cache_release(page);
153 req->wb_page = NULL; 149 req->wb_page = NULL;
154 } 150 }
151 if (l_ctx != NULL) {
152 nfs_put_lock_context(l_ctx);
153 req->wb_lock_context = NULL;
154 }
155 if (ctx != NULL) { 155 if (ctx != NULL) {
156 put_nfs_open_context(ctx); 156 put_nfs_open_context(ctx);
157 req->wb_context = NULL; 157 req->wb_context = NULL;
@@ -241,7 +241,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
241{ 241{
242 if (req->wb_context->cred != prev->wb_context->cred) 242 if (req->wb_context->cred != prev->wb_context->cred)
243 return 0; 243 return 0;
244 if (req->wb_context->lockowner != prev->wb_context->lockowner) 244 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
245 return 0; 245 return 0;
246 if (req->wb_context->state != prev->wb_context->state) 246 if (req->wb_context->state != prev->wb_context->state)
247 return 0; 247 return 0;
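
Two separate improvements land in pagelist.c: nfs_create_request() no longer spins in a yield() loop when nfs_page_alloc() fails, it simply reports -ENOMEM and lets the caller decide, and every nfs_page now pins a reference-counted per-lock context that nfs_clear_request() drops. A sketch of the get/put pairing under the simplifying assumption of a bare atomic counter (the kernel hangs these contexts off the open context and manages them under the inode's lock):

#include <stdatomic.h>
#include <stdlib.h>

struct lock_context {
        atomic_int count;
        void *lockowner;
        int pid;
};

static struct lock_context *get_lock_context(struct lock_context *l)
{
        atomic_fetch_add(&l->count, 1);
        return l;
}

static void put_lock_context(struct lock_context *l)
{
        /* fetch_sub returns the old value: 1 means this was the last ref */
        if (atomic_fetch_sub(&l->count, 1) == 1)
                free(l);
}

struct page_req {
        struct lock_context *wb_lock_context;
};

/* Request creation takes a reference on the lock context... */
static void init_req(struct page_req *req, struct lock_context *l)
{
        req->wb_lock_context = get_lock_context(l);
}

/* ...and teardown drops it exactly once, clearing the pointer so a
 * second clear is a no-op, as in nfs_clear_request(). */
static void clear_req(struct page_req *req)
{
        if (req->wb_lock_context != NULL) {
                put_lock_context(req->wb_lock_context);
                req->wb_lock_context = NULL;
        }
}

int main(void)
{
        struct lock_context *l = calloc(1, sizeof(*l));
        struct page_req req;

        if (l == NULL)
                return 1;
        atomic_init(&l->count, 1);   /* creator's reference           */
        init_req(&req, l);
        clear_req(&req);             /* drops the request's reference */
        put_lock_context(l);         /* drops the creator's; frees    */
        return 0;
}
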
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 0288be80444f..611bec22f552 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -224,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
224 return status; 224 return status;
225} 225}
226 226
227struct nfs_createdata {
228 struct nfs_createargs arg;
229 struct nfs_diropok res;
230 struct nfs_fh fhandle;
231 struct nfs_fattr fattr;
232};
233
234static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
235 struct dentry *dentry, struct iattr *sattr)
236{
237 struct nfs_createdata *data;
238
239 data = kmalloc(sizeof(*data), GFP_KERNEL);
240
241 if (data != NULL) {
242 data->arg.fh = NFS_FH(dir);
243 data->arg.name = dentry->d_name.name;
244 data->arg.len = dentry->d_name.len;
245 data->arg.sattr = sattr;
246 nfs_fattr_init(&data->fattr);
247 data->fhandle.size = 0;
248 data->res.fh = &data->fhandle;
249 data->res.fattr = &data->fattr;
250 }
251 return data;
252};
253
254static void nfs_free_createdata(const struct nfs_createdata *data)
255{
256 kfree(data);
257}
258
227static int 259static int
228nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 260nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
229 int flags, struct nameidata *nd) 261 int flags, struct nameidata *nd)
230{ 262{
231 struct nfs_fh fhandle; 263 struct nfs_createdata *data;
232 struct nfs_fattr fattr;
233 struct nfs_createargs arg = {
234 .fh = NFS_FH(dir),
235 .name = dentry->d_name.name,
236 .len = dentry->d_name.len,
237 .sattr = sattr
238 };
239 struct nfs_diropok res = {
240 .fh = &fhandle,
241 .fattr = &fattr
242 };
243 struct rpc_message msg = { 264 struct rpc_message msg = {
244 .rpc_proc = &nfs_procedures[NFSPROC_CREATE], 265 .rpc_proc = &nfs_procedures[NFSPROC_CREATE],
245 .rpc_argp = &arg,
246 .rpc_resp = &res,
247 }; 266 };
248 int status; 267 int status = -ENOMEM;
249 268
250 nfs_fattr_init(&fattr);
251 dprintk("NFS call create %s\n", dentry->d_name.name); 269 dprintk("NFS call create %s\n", dentry->d_name.name);
270 data = nfs_alloc_createdata(dir, dentry, sattr);
271 if (data == NULL)
272 goto out;
273 msg.rpc_argp = &data->arg;
274 msg.rpc_resp = &data->res;
252 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 275 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
253 nfs_mark_for_revalidate(dir); 276 nfs_mark_for_revalidate(dir);
254 if (status == 0) 277 if (status == 0)
255 status = nfs_instantiate(dentry, &fhandle, &fattr); 278 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
279 nfs_free_createdata(data);
280out:
256 dprintk("NFS reply create: %d\n", status); 281 dprintk("NFS reply create: %d\n", status);
257 return status; 282 return status;
258} 283}
@@ -264,24 +289,12 @@ static int
264nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 289nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
265 dev_t rdev) 290 dev_t rdev)
266{ 291{
267 struct nfs_fh fhandle; 292 struct nfs_createdata *data;
268 struct nfs_fattr fattr;
269 struct nfs_createargs arg = {
270 .fh = NFS_FH(dir),
271 .name = dentry->d_name.name,
272 .len = dentry->d_name.len,
273 .sattr = sattr
274 };
275 struct nfs_diropok res = {
276 .fh = &fhandle,
277 .fattr = &fattr
278 };
279 struct rpc_message msg = { 293 struct rpc_message msg = {
280 .rpc_proc = &nfs_procedures[NFSPROC_CREATE], 294 .rpc_proc = &nfs_procedures[NFSPROC_CREATE],
281 .rpc_argp = &arg,
282 .rpc_resp = &res,
283 }; 295 };
284 int status, mode; 296 umode_t mode;
297 int status = -ENOMEM;
285 298
286 dprintk("NFS call mknod %s\n", dentry->d_name.name); 299 dprintk("NFS call mknod %s\n", dentry->d_name.name);
287 300
@@ -294,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
294 sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ 307 sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
295 } 308 }
296 309
297 nfs_fattr_init(&fattr); 310 data = nfs_alloc_createdata(dir, dentry, sattr);
311 if (data == NULL)
312 goto out;
313 msg.rpc_argp = &data->arg;
314 msg.rpc_resp = &data->res;
315
298 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 316 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
299 nfs_mark_for_revalidate(dir); 317 nfs_mark_for_revalidate(dir);
300 318
301 if (status == -EINVAL && S_ISFIFO(mode)) { 319 if (status == -EINVAL && S_ISFIFO(mode)) {
302 sattr->ia_mode = mode; 320 sattr->ia_mode = mode;
303 nfs_fattr_init(&fattr); 321 nfs_fattr_init(data->res.fattr);
304 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 322 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
305 } 323 }
306 if (status == 0) 324 if (status == 0)
307 status = nfs_instantiate(dentry, &fhandle, &fattr); 325 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
326 nfs_free_createdata(data);
327out:
308 dprintk("NFS reply mknod: %d\n", status); 328 dprintk("NFS reply mknod: %d\n", status);
309 return status; 329 return status;
310} 330}
@@ -398,8 +418,8 @@ static int
398nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, 418nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
399 unsigned int len, struct iattr *sattr) 419 unsigned int len, struct iattr *sattr)
400{ 420{
401 struct nfs_fh fhandle; 421 struct nfs_fh *fh;
402 struct nfs_fattr fattr; 422 struct nfs_fattr *fattr;
403 struct nfs_symlinkargs arg = { 423 struct nfs_symlinkargs arg = {
404 .fromfh = NFS_FH(dir), 424 .fromfh = NFS_FH(dir),
405 .fromname = dentry->d_name.name, 425 .fromname = dentry->d_name.name,
@@ -412,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
412 .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], 432 .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK],
413 .rpc_argp = &arg, 433 .rpc_argp = &arg,
414 }; 434 };
415 int status; 435 int status = -ENAMETOOLONG;
436
437 dprintk("NFS call symlink %s\n", dentry->d_name.name);
416 438
417 if (len > NFS2_MAXPATHLEN) 439 if (len > NFS2_MAXPATHLEN)
418 return -ENAMETOOLONG; 440 goto out;
419 441
420 dprintk("NFS call symlink %s\n", dentry->d_name.name); 442 fh = nfs_alloc_fhandle();
443 fattr = nfs_alloc_fattr();
444 status = -ENOMEM;
445 if (fh == NULL || fattr == NULL)
446 goto out;
421 447
422 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 448 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
423 nfs_mark_for_revalidate(dir); 449 nfs_mark_for_revalidate(dir);
@@ -427,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
427 * filehandle size to zero indicates to nfs_instantiate that it 453 * filehandle size to zero indicates to nfs_instantiate that it
428 * should fill in the data with a LOOKUP call on the wire. 454 * should fill in the data with a LOOKUP call on the wire.
429 */ 455 */
430 if (status == 0) { 456 if (status == 0)
431 nfs_fattr_init(&fattr); 457 status = nfs_instantiate(dentry, fh, fattr);
432 fhandle.size = 0;
433 status = nfs_instantiate(dentry, &fhandle, &fattr);
434 }
435 458
459 nfs_free_fattr(fattr);
460 nfs_free_fhandle(fh);
461out:
436 dprintk("NFS reply symlink: %d\n", status); 462 dprintk("NFS reply symlink: %d\n", status);
437 return status; 463 return status;
438} 464}
@@ -440,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
440static int 466static int
441nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) 467nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
442{ 468{
443 struct nfs_fh fhandle; 469 struct nfs_createdata *data;
444 struct nfs_fattr fattr;
445 struct nfs_createargs arg = {
446 .fh = NFS_FH(dir),
447 .name = dentry->d_name.name,
448 .len = dentry->d_name.len,
449 .sattr = sattr
450 };
451 struct nfs_diropok res = {
452 .fh = &fhandle,
453 .fattr = &fattr
454 };
455 struct rpc_message msg = { 470 struct rpc_message msg = {
456 .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], 471 .rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
457 .rpc_argp = &arg,
458 .rpc_resp = &res,
459 }; 472 };
460 int status; 473 int status = -ENOMEM;
461 474
462 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 475 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
463 nfs_fattr_init(&fattr); 476 data = nfs_alloc_createdata(dir, dentry, sattr);
477 if (data == NULL)
478 goto out;
479 msg.rpc_argp = &data->arg;
480 msg.rpc_resp = &data->res;
481
464 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 482 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
465 nfs_mark_for_revalidate(dir); 483 nfs_mark_for_revalidate(dir);
466 if (status == 0) 484 if (status == 0)
467 status = nfs_instantiate(dentry, &fhandle, &fattr); 485 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
486 nfs_free_createdata(data);
487out:
468 dprintk("NFS reply mkdir: %d\n", status); 488 dprintk("NFS reply mkdir: %d\n", status);
469 return status; 489 return status;
470} 490}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index db9b360ae19d..87adc2744246 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool;
40 40
41struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) 41struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
42{ 42{
43 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); 43 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
44 44
45 if (p) { 45 if (p) {
46 memset(p, 0, sizeof(*p)); 46 memset(p, 0, sizeof(*p));
@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
50 if (pagecount <= ARRAY_SIZE(p->page_array)) 50 if (pagecount <= ARRAY_SIZE(p->page_array))
51 p->pagevec = p->page_array; 51 p->pagevec = p->page_array;
52 else { 52 else {
53 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); 53 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
54 if (!p->pagevec) { 54 if (!p->pagevec) {
55 mempool_free(p, nfs_rdata_mempool); 55 mempool_free(p, nfs_rdata_mempool);
56 p = NULL; 56 p = NULL;
@@ -190,6 +190,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
190 data->args.pages = data->pagevec; 190 data->args.pages = data->pagevec;
191 data->args.count = count; 191 data->args.count = count;
192 data->args.context = get_nfs_open_context(req->wb_context); 192 data->args.context = get_nfs_open_context(req->wb_context);
193 data->args.lock_context = req->wb_lock_context;
193 194
194 data->res.fattr = &data->fattr; 195 data->res.fattr = &data->fattr;
195 data->res.count = count; 196 data->res.count = count;
@@ -410,7 +411,7 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
410{ 411{
411 struct nfs_read_data *data = calldata; 412 struct nfs_read_data *data = calldata;
412 413
413 if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, 414 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
414 &data->args.seq_args, &data->res.seq_res, 415 &data->args.seq_args, &data->res.seq_res,
415 0, task)) 416 0, task))
416 return; 417 return;
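
nfs_readdata_alloc() shows the embedded-array fallback that keeps small reads to a single allocation: when the page count fits in the structure's own page_array, pagevec just points at it, and only larger requests allocate a separate vector. The GFP_NOFS to GFP_KERNEL relaxation appears safe because read setup is not entered from a memory-reclaim path, so there is no recursion hazard to guard against. A sketch of the pattern, with an assumed inline capacity of 8:

#include <stdlib.h>

#define INLINE_PAGES 8

struct read_data {
        void **pagevec;                  /* points at page_array or heap */
        void *page_array[INLINE_PAGES];
};

/* Small requests reuse the embedded array; big ones allocate. */
static struct read_data *readdata_alloc(unsigned int pagecount)
{
        struct read_data *p = calloc(1, sizeof(*p));

        if (p == NULL)
                return NULL;
        if (pagecount <= INLINE_PAGES)
                p->pagevec = p->page_array;
        else {
                p->pagevec = calloc(pagecount, sizeof(void *));
                if (p->pagevec == NULL) {
                        free(p);
                        p = NULL;
                }
        }
        return p;
}

static void readdata_free(struct read_data *p)
{
        if (p->pagevec != p->page_array) /* only the heap vector */
                free(p->pagevec);
        free(p);
}

int main(void)
{
        struct read_data *p = readdata_alloc(64);

        if (p == NULL)
                return 1;
        readdata_free(p);
        return 0;
}
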
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b4148fc00f9f..f4cbf0c306c6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -141,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = {
141 { Opt_resvport, "resvport" }, 141 { Opt_resvport, "resvport" },
142 { Opt_noresvport, "noresvport" }, 142 { Opt_noresvport, "noresvport" },
143 { Opt_fscache, "fsc" }, 143 { Opt_fscache, "fsc" },
144 { Opt_fscache_uniq, "fsc=%s" },
145 { Opt_nofscache, "nofsc" }, 144 { Opt_nofscache, "nofsc" },
146 145
147 { Opt_port, "port=%s" }, 146 { Opt_port, "port=%s" },
@@ -171,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = {
171 { Opt_mountaddr, "mountaddr=%s" }, 170 { Opt_mountaddr, "mountaddr=%s" },
172 171
173 { Opt_lookupcache, "lookupcache=%s" }, 172 { Opt_lookupcache, "lookupcache=%s" },
173 { Opt_fscache_uniq, "fsc=%s" },
174 174
175 { Opt_err, NULL } 175 { Opt_err, NULL }
176}; 176};
@@ -270,7 +270,7 @@ static const struct super_operations nfs_sops = {
270 .write_inode = nfs_write_inode, 270 .write_inode = nfs_write_inode,
271 .put_super = nfs_put_super, 271 .put_super = nfs_put_super,
272 .statfs = nfs_statfs, 272 .statfs = nfs_statfs,
273 .clear_inode = nfs_clear_inode, 273 .evict_inode = nfs_evict_inode,
274 .umount_begin = nfs_umount_begin, 274 .umount_begin = nfs_umount_begin,
275 .show_options = nfs_show_options, 275 .show_options = nfs_show_options,
276 .show_stats = nfs_show_stats, 276 .show_stats = nfs_show_stats,
@@ -340,7 +340,7 @@ static const struct super_operations nfs4_sops = {
340 .write_inode = nfs_write_inode, 340 .write_inode = nfs_write_inode,
341 .put_super = nfs_put_super, 341 .put_super = nfs_put_super,
342 .statfs = nfs_statfs, 342 .statfs = nfs_statfs,
343 .clear_inode = nfs4_clear_inode, 343 .evict_inode = nfs4_evict_inode,
344 .umount_begin = nfs_umount_begin, 344 .umount_begin = nfs_umount_begin,
345 .show_options = nfs_show_options, 345 .show_options = nfs_show_options,
346 .show_stats = nfs_show_stats, 346 .show_stats = nfs_show_stats,
@@ -423,15 +423,27 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
423 unsigned char blockbits; 423 unsigned char blockbits;
424 unsigned long blockres; 424 unsigned long blockres;
425 struct nfs_fh *fh = NFS_FH(dentry->d_inode); 425 struct nfs_fh *fh = NFS_FH(dentry->d_inode);
426 struct nfs_fattr fattr; 426 struct nfs_fsstat res;
427 struct nfs_fsstat res = { 427 int error = -ENOMEM;
428 .fattr = &fattr, 428
429 }; 429 res.fattr = nfs_alloc_fattr();
430 int error; 430 if (res.fattr == NULL)
431 goto out_err;
431 432
432 error = server->nfs_client->rpc_ops->statfs(server, fh, &res); 433 error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
434 if (unlikely(error == -ESTALE)) {
435 struct dentry *pd_dentry;
436
437 pd_dentry = dget_parent(dentry);
438 if (pd_dentry != NULL) {
439 nfs_zap_caches(pd_dentry->d_inode);
440 dput(pd_dentry);
441 }
442 }
443 nfs_free_fattr(res.fattr);
433 if (error < 0) 444 if (error < 0)
434 goto out_err; 445 goto out_err;
446
435 buf->f_type = NFS_SUPER_MAGIC; 447 buf->f_type = NFS_SUPER_MAGIC;
436 448
437 /* 449 /*
@@ -542,6 +554,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
542{ 554{
543 struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; 555 struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
544 556
557 if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
558 return;
559
545 switch (sap->sa_family) { 560 switch (sap->sa_family) {
546 case AF_INET: { 561 case AF_INET: {
547 struct sockaddr_in *sin = (struct sockaddr_in *)sap; 562 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
@@ -566,6 +581,22 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
566 nfs_show_mountd_netid(m, nfss, showdefaults); 581 nfs_show_mountd_netid(m, nfss, showdefaults);
567} 582}
568 583
584#ifdef CONFIG_NFS_V4
585static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
586 int showdefaults)
587{
588 struct nfs_client *clp = nfss->nfs_client;
589
590 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
591 seq_printf(m, ",minorversion=%u", clp->cl_minorversion);
592}
593#else
594static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
595 int showdefaults)
596{
597}
598#endif
599
569/* 600/*
570 * Describe the mount options in force on this server representation 601 * Describe the mount options in force on this server representation
571 */ 602 */
@@ -627,13 +658,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
627 658
628 if (version != 4) 659 if (version != 4)
629 nfs_show_mountd_options(m, nfss, showdefaults); 660 nfs_show_mountd_options(m, nfss, showdefaults);
661 else
662 nfs_show_nfsv4_options(m, nfss, showdefaults);
630 663
631#ifdef CONFIG_NFS_V4
632 if (clp->rpc_ops->version == 4)
633 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
634#endif
635 if (nfss->options & NFS_OPTION_FSCACHE) 664 if (nfss->options & NFS_OPTION_FSCACHE)
636 seq_printf(m, ",fsc"); 665 seq_printf(m, ",fsc");
666
667 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
668 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
669 seq_printf(m, ",lookupcache=none");
670 else
671 seq_printf(m, ",lookupcache=pos");
672 }
637} 673}
638 674
639/* 675/*
@@ -1046,14 +1082,6 @@ static int nfs_parse_mount_options(char *raw,
1046 kfree(mnt->fscache_uniq); 1082 kfree(mnt->fscache_uniq);
1047 mnt->fscache_uniq = NULL; 1083 mnt->fscache_uniq = NULL;
1048 break; 1084 break;
1049 case Opt_fscache_uniq:
1050 string = match_strdup(args);
1051 if (!string)
1052 goto out_nomem;
1053 kfree(mnt->fscache_uniq);
1054 mnt->fscache_uniq = string;
1055 mnt->options |= NFS_OPTION_FSCACHE;
1056 break;
1057 1085
1058 /* 1086 /*
1059 * options that take numeric values 1087 * options that take numeric values
@@ -1064,7 +1092,7 @@ static int nfs_parse_mount_options(char *raw,
1064 goto out_nomem; 1092 goto out_nomem;
1065 rc = strict_strtoul(string, 10, &option); 1093 rc = strict_strtoul(string, 10, &option);
1066 kfree(string); 1094 kfree(string);
1067 if (rc != 0 || option > USHORT_MAX) 1095 if (rc != 0 || option > USHRT_MAX)
1068 goto out_invalid_value; 1096 goto out_invalid_value;
1069 mnt->nfs_server.port = option; 1097 mnt->nfs_server.port = option;
1070 break; 1098 break;
@@ -1185,7 +1213,7 @@ static int nfs_parse_mount_options(char *raw,
1185 goto out_nomem; 1213 goto out_nomem;
1186 rc = strict_strtoul(string, 10, &option); 1214 rc = strict_strtoul(string, 10, &option);
1187 kfree(string); 1215 kfree(string);
1188 if (rc != 0 || option > USHORT_MAX) 1216 if (rc != 0 || option > USHRT_MAX)
1189 goto out_invalid_value; 1217 goto out_invalid_value;
1190 mnt->mount_server.port = option; 1218 mnt->mount_server.port = option;
1191 break; 1219 break;
@@ -1384,6 +1412,14 @@ static int nfs_parse_mount_options(char *raw,
1384 return 0; 1412 return 0;
1385 }; 1413 };
1386 break; 1414 break;
1415 case Opt_fscache_uniq:
1416 string = match_strdup(args);
1417 if (string == NULL)
1418 goto out_nomem;
1419 kfree(mnt->fscache_uniq);
1420 mnt->fscache_uniq = string;
1421 mnt->options |= NFS_OPTION_FSCACHE;
1422 break;
1387 1423
1388 /* 1424 /*
1389 * Special options 1425 * Special options
@@ -1762,6 +1798,7 @@ static int nfs_validate_mount_data(void *options,
1762 * can deal with. 1798 * can deal with.
1763 */ 1799 */
1764 args->flags = data->flags & NFS_MOUNT_FLAGMASK; 1800 args->flags = data->flags & NFS_MOUNT_FLAGMASK;
1801 args->flags |= NFS_MOUNT_LEGACY_INTERFACE;
1765 args->rsize = data->rsize; 1802 args->rsize = data->rsize;
1766 args->wsize = data->wsize; 1803 args->wsize = data->wsize;
1767 args->timeo = data->timeo; 1804 args->timeo = data->timeo;
@@ -2172,7 +2209,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2172 int error = -ENOMEM; 2209 int error = -ENOMEM;
2173 2210
2174 data = nfs_alloc_parsed_mount_data(3); 2211 data = nfs_alloc_parsed_mount_data(3);
2175 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2212 mntfh = nfs_alloc_fhandle();
2176 if (data == NULL || mntfh == NULL) 2213 if (data == NULL || mntfh == NULL)
2177 goto out_free_fh; 2214 goto out_free_fh;
2178 2215
@@ -2247,7 +2284,7 @@ out:
2247 kfree(data->fscache_uniq); 2284 kfree(data->fscache_uniq);
2248 security_free_mnt_opts(&data->lsm_opts); 2285 security_free_mnt_opts(&data->lsm_opts);
2249out_free_fh: 2286out_free_fh:
2250 kfree(mntfh); 2287 nfs_free_fhandle(mntfh);
2251 kfree(data); 2288 kfree(data);
2252 return error; 2289 return error;
2253 2290
@@ -2556,7 +2593,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2556 }; 2593 };
2557 int error = -ENOMEM; 2594 int error = -ENOMEM;
2558 2595
2559 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2596 mntfh = nfs_alloc_fhandle();
2560 if (data == NULL || mntfh == NULL) 2597 if (data == NULL || mntfh == NULL)
2561 goto out_free_fh; 2598 goto out_free_fh;
2562 2599
@@ -2614,7 +2651,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2614out: 2651out:
2615 security_free_mnt_opts(&data->lsm_opts); 2652 security_free_mnt_opts(&data->lsm_opts);
2616out_free_fh: 2653out_free_fh:
2617 kfree(mntfh); 2654 nfs_free_fhandle(mntfh);
2618 return error; 2655 return error;
2619 2656
2620out_free: 2657out_free:
@@ -2669,41 +2706,120 @@ out_freepage:
2669 free_page((unsigned long)page); 2706 free_page((unsigned long)page);
2670} 2707}
2671 2708
2709struct nfs_referral_count {
2710 struct list_head list;
2711 const struct task_struct *task;
2712 unsigned int referral_count;
2713};
2714
2715static LIST_HEAD(nfs_referral_count_list);
2716static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
2717
2718static struct nfs_referral_count *nfs_find_referral_count(void)
2719{
2720 struct nfs_referral_count *p;
2721
2722 list_for_each_entry(p, &nfs_referral_count_list, list) {
2723 if (p->task == current)
2724 return p;
2725 }
2726 return NULL;
2727}
2728
2729#define NFS_MAX_NESTED_REFERRALS 2
2730
2731static int nfs_referral_loop_protect(void)
2732{
2733 struct nfs_referral_count *p, *new;
2734 int ret = -ENOMEM;
2735
2736 new = kmalloc(sizeof(*new), GFP_KERNEL);
2737 if (!new)
2738 goto out;
2739 new->task = current;
2740 new->referral_count = 1;
2741
2742 ret = 0;
2743 spin_lock(&nfs_referral_count_list_lock);
2744 p = nfs_find_referral_count();
2745 if (p != NULL) {
2746 if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
2747 ret = -ELOOP;
2748 else
2749 p->referral_count++;
2750 } else {
2751 list_add(&new->list, &nfs_referral_count_list);
2752 new = NULL;
2753 }
2754 spin_unlock(&nfs_referral_count_list_lock);
2755 kfree(new);
2756out:
2757 return ret;
2758}
2759
2760static void nfs_referral_loop_unprotect(void)
2761{
2762 struct nfs_referral_count *p;
2763
2764 spin_lock(&nfs_referral_count_list_lock);
2765 p = nfs_find_referral_count();
2766 p->referral_count--;
2767 if (p->referral_count == 0)
2768 list_del(&p->list);
2769 else
2770 p = NULL;
2771 spin_unlock(&nfs_referral_count_list_lock);
2772 kfree(p);
2773}
2774
2672static int nfs_follow_remote_path(struct vfsmount *root_mnt, 2775static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2673 const char *export_path, struct vfsmount *mnt_target) 2776 const char *export_path, struct vfsmount *mnt_target)
2674{ 2777{
2778 struct nameidata *nd = NULL;
2675 struct mnt_namespace *ns_private; 2779 struct mnt_namespace *ns_private;
2676 struct nameidata nd;
2677 struct super_block *s; 2780 struct super_block *s;
2678 int ret; 2781 int ret;
2679 2782
2783 nd = kmalloc(sizeof(*nd), GFP_KERNEL);
2784 if (nd == NULL)
2785 return -ENOMEM;
2786
2680 ns_private = create_mnt_ns(root_mnt); 2787 ns_private = create_mnt_ns(root_mnt);
2681 ret = PTR_ERR(ns_private); 2788 ret = PTR_ERR(ns_private);
2682 if (IS_ERR(ns_private)) 2789 if (IS_ERR(ns_private))
2683 goto out_mntput; 2790 goto out_mntput;
2684 2791
2792 ret = nfs_referral_loop_protect();
2793 if (ret != 0)
2794 goto out_put_mnt_ns;
2795
2685 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2796 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
2686 export_path, LOOKUP_FOLLOW, &nd); 2797 export_path, LOOKUP_FOLLOW, nd);
2687 2798
2799 nfs_referral_loop_unprotect();
2688 put_mnt_ns(ns_private); 2800 put_mnt_ns(ns_private);
2689 2801
2690 if (ret != 0) 2802 if (ret != 0)
2691 goto out_err; 2803 goto out_err;
2692 2804
2693 s = nd.path.mnt->mnt_sb; 2805 s = nd->path.mnt->mnt_sb;
2694 atomic_inc(&s->s_active); 2806 atomic_inc(&s->s_active);
2695 mnt_target->mnt_sb = s; 2807 mnt_target->mnt_sb = s;
2696 mnt_target->mnt_root = dget(nd.path.dentry); 2808 mnt_target->mnt_root = dget(nd->path.dentry);
2697 2809
2698 /* Correct the device pathname */ 2810 /* Correct the device pathname */
2699 nfs_fix_devname(&nd.path, mnt_target); 2811 nfs_fix_devname(&nd->path, mnt_target);
2700 2812
2701 path_put(&nd.path); 2813 path_put(&nd->path);
2814 kfree(nd);
2702 down_write(&s->s_umount); 2815 down_write(&s->s_umount);
2703 return 0; 2816 return 0;
2817out_put_mnt_ns:
2818 put_mnt_ns(ns_private);
2704out_mntput: 2819out_mntput:
2705 mntput(root_mnt); 2820 mntput(root_mnt);
2706out_err: 2821out_err:
2822 kfree(nd);
2707 return ret; 2823 return ret;
2708} 2824}
2709 2825
@@ -2874,17 +2990,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2874 struct super_block *s; 2990 struct super_block *s;
2875 struct nfs_server *server; 2991 struct nfs_server *server;
2876 struct dentry *mntroot; 2992 struct dentry *mntroot;
2877 struct nfs_fh mntfh; 2993 struct nfs_fh *mntfh;
2878 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 2994 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
2879 struct nfs_sb_mountdata sb_mntdata = { 2995 struct nfs_sb_mountdata sb_mntdata = {
2880 .mntflags = flags, 2996 .mntflags = flags,
2881 }; 2997 };
2882 int error; 2998 int error = -ENOMEM;
2883 2999
2884 dprintk("--> nfs4_referral_get_sb()\n"); 3000 dprintk("--> nfs4_referral_get_sb()\n");
2885 3001
3002 mntfh = nfs_alloc_fhandle();
3003 if (mntfh == NULL)
3004 goto out_err_nofh;
3005
2886 /* create a new volume representation */ 3006 /* create a new volume representation */
2887 server = nfs4_create_referral_server(data, &mntfh); 3007 server = nfs4_create_referral_server(data, mntfh);
2888 if (IS_ERR(server)) { 3008 if (IS_ERR(server)) {
2889 error = PTR_ERR(server); 3009 error = PTR_ERR(server);
2890 goto out_err_noserver; 3010 goto out_err_noserver;
@@ -2916,7 +3036,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2916 nfs_fscache_get_super_cookie(s, NULL, data); 3036 nfs_fscache_get_super_cookie(s, NULL, data);
2917 } 3037 }
2918 3038
2919 mntroot = nfs4_get_root(s, &mntfh); 3039 mntroot = nfs4_get_root(s, mntfh);
2920 if (IS_ERR(mntroot)) { 3040 if (IS_ERR(mntroot)) {
2921 error = PTR_ERR(mntroot); 3041 error = PTR_ERR(mntroot);
2922 goto error_splat_super; 3042 goto error_splat_super;
@@ -2933,12 +3053,15 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2933 3053
2934 security_sb_clone_mnt_opts(data->sb, s); 3054 security_sb_clone_mnt_opts(data->sb, s);
2935 3055
3056 nfs_free_fhandle(mntfh);
2936 dprintk("<-- nfs4_referral_get_sb() = 0\n"); 3057 dprintk("<-- nfs4_referral_get_sb() = 0\n");
2937 return 0; 3058 return 0;
2938 3059
2939out_err_nosb: 3060out_err_nosb:
2940 nfs_free_server(server); 3061 nfs_free_server(server);
2941out_err_noserver: 3062out_err_noserver:
3063 nfs_free_fhandle(mntfh);
3064out_err_nofh:
2942 dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); 3065 dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
2943 return error; 3066 return error;
2944 3067
@@ -2947,6 +3070,7 @@ error_splat_super:
2947 bdi_unregister(&server->backing_dev_info); 3070 bdi_unregister(&server->backing_dev_info);
2948error_splat_bdi: 3071error_splat_bdi:
2949 deactivate_locked_super(s); 3072 deactivate_locked_super(s);
3073 nfs_free_fhandle(mntfh);
2950 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 3074 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2951 return error; 3075 return error;
2952} 3076}
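
Two defensive changes in nfs_follow_remote_path() above: the large struct nameidata moves from the stack to kmalloc(), and a small per-task table caps nested referral traversals at NFS_MAX_NESTED_REFERRALS (2), so a server whose referrals point back at themselves gets -ELOOP instead of driving the client into unbounded recursion. A userspace sketch of the counting scheme, keyed by a caller-supplied task pointer and guarded by a mutex where the kernel uses a spinlock; note the allocate-before-lock dance so no allocation happens with the lock held:

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

#define MAX_NESTED_REFERRALS 2

struct referral_count {
        struct referral_count *next;
        const void *task;                /* identifies the traversing task */
        unsigned int count;
};

static struct referral_count *referrals;
static pthread_mutex_t referral_lock = PTHREAD_MUTEX_INITIALIZER;

static struct referral_count *find_count(const void *task)
{
        struct referral_count *p;

        for (p = referrals; p != NULL; p = p->next)
                if (p->task == task)
                        return p;
        return NULL;
}

/* 0 if this task may follow another referral, -ELOOP if it is already
 * MAX_NESTED_REFERRALS deep. The node is allocated before locking. */
static int referral_protect(const void *task)
{
        struct referral_count *p, *new = malloc(sizeof(*new));
        int ret = -ENOMEM;

        if (new == NULL)
                return ret;
        new->task = task;
        new->count = 1;

        ret = 0;
        pthread_mutex_lock(&referral_lock);
        p = find_count(task);
        if (p != NULL) {
                if (p->count >= MAX_NESTED_REFERRALS)
                        ret = -ELOOP;
                else
                        p->count++;
        } else {
                new->next = referrals;
                referrals = new;
                new = NULL;              /* ownership moved to the list */
        }
        pthread_mutex_unlock(&referral_lock);
        free(new);
        return ret;
}

static void referral_unprotect(const void *task)
{
        struct referral_count *p, **pp;

        pthread_mutex_lock(&referral_lock);
        for (pp = &referrals; (p = *pp) != NULL; pp = &p->next)
                if (p->task == task)
                        break;
        if (p != NULL && --p->count == 0)
                *pp = p->next;           /* unlink; freed below */
        else
                p = NULL;                /* still in use; keep it */
        pthread_mutex_unlock(&referral_lock);
        free(p);
}

int main(void)
{
        int me;

        return referral_protect(&me) == 0 ? (referral_unprotect(&me), 0) : 1;
}
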
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6da3d3ff6edd..2f84adaad427 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -23,6 +23,7 @@ struct nfs_unlinkdata {
23 struct nfs_removeres res; 23 struct nfs_removeres res;
24 struct inode *dir; 24 struct inode *dir;
25 struct rpc_cred *cred; 25 struct rpc_cred *cred;
26 struct nfs_fattr dir_attr;
26}; 27};
27 28
28/** 29/**
@@ -109,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
109 struct nfs_unlinkdata *data = calldata; 110 struct nfs_unlinkdata *data = calldata;
110 struct nfs_server *server = NFS_SERVER(data->dir); 111 struct nfs_server *server = NFS_SERVER(data->dir);
111 112
112 if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, 113 if (nfs4_setup_sequence(server, &data->args.seq_args,
113 &data->res.seq_res, 1, task)) 114 &data->res.seq_res, 1, task))
114 return; 115 return;
115 rpc_call_start(task); 116 rpc_call_start(task);
@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
169 } 170 }
170 nfs_sb_active(dir->i_sb); 171 nfs_sb_active(dir->i_sb);
171 data->args.fh = NFS_FH(dir); 172 data->args.fh = NFS_FH(dir);
172 nfs_fattr_init(&data->res.dir_attr); 173 nfs_fattr_init(data->res.dir_attr);
173 174
174 NFS_PROTO(dir)->unlink_setup(&msg, dir); 175 NFS_PROTO(dir)->unlink_setup(&msg, dir);
175 176
@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
259 goto out_free; 260 goto out_free;
260 } 261 }
261 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; 262 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
263 data->res.dir_attr = &data->dir_attr;
262 264
263 status = -EBUSY; 265 status = -EBUSY;
264 spin_lock(&dentry->d_lock); 266 spin_lock(&dentry->d_lock);
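
The unlink.c change gives struct nfs_unlinkdata its own nfs_fattr and points res.dir_attr at it once, instead of carrying the attributes in the result by value. A hedged sketch of that ownership pattern, with simplified, invented types:

    #include <stdlib.h>

    struct fattr { long change_attr; };           /* stand-in for nfs_fattr */
    struct removeres { struct fattr *dir_attr; }; /* result holds a pointer */

    struct unlinkdata {
        struct removeres res;
        struct fattr dir_attr;          /* storage travels with the call */
    };

    static struct unlinkdata *unlinkdata_alloc(void)
    {
        struct unlinkdata *d = calloc(1, sizeof(*d));

        if (d)
            d->res.dir_attr = &d->dir_attr;  /* wired up once, at allocation */
        return d;
    }

The RPC result structure stays small, and the nfs_fattr_init(data->res.dir_attr) call in the setup hunk then initializes that same storage through the pointer.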
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..874972d9427c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -222,7 +222,7 @@ static void nfs_end_page_writeback(struct page *page)
222 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 222 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
223} 223}
224 224
225static struct nfs_page *nfs_find_and_lock_request(struct page *page) 225static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
226{ 226{
227 struct inode *inode = page->mapping->host; 227 struct inode *inode = page->mapping->host;
228 struct nfs_page *req; 228 struct nfs_page *req;
@@ -241,7 +241,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
241 * request as dirty (in which case we don't care). 241 * request as dirty (in which case we don't care).
242 */ 242 */
243 spin_unlock(&inode->i_lock); 243 spin_unlock(&inode->i_lock);
244 ret = nfs_wait_on_request(req); 244 if (!nonblock)
245 ret = nfs_wait_on_request(req);
246 else
247 ret = -EAGAIN;
245 nfs_release_request(req); 248 nfs_release_request(req);
246 if (ret != 0) 249 if (ret != 0)
247 return ERR_PTR(ret); 250 return ERR_PTR(ret);
@@ -256,12 +259,12 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
256 * May return an error if the user signalled nfs_wait_on_request(). 259 * May return an error if the user signalled nfs_wait_on_request().
257 */ 260 */
258static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, 261static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
259 struct page *page) 262 struct page *page, bool nonblock)
260{ 263{
261 struct nfs_page *req; 264 struct nfs_page *req;
262 int ret = 0; 265 int ret = 0;
263 266
264 req = nfs_find_and_lock_request(page); 267 req = nfs_find_and_lock_request(page, nonblock);
265 if (!req) 268 if (!req)
266 goto out; 269 goto out;
267 ret = PTR_ERR(req); 270 ret = PTR_ERR(req);
@@ -283,12 +286,20 @@ out:
283static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 286static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
284{ 287{
285 struct inode *inode = page->mapping->host; 288 struct inode *inode = page->mapping->host;
289 int ret;
286 290
287 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 291 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
288 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 292 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
289 293
290 nfs_pageio_cond_complete(pgio, page->index); 294 nfs_pageio_cond_complete(pgio, page->index);
291 return nfs_page_async_flush(pgio, page); 295 ret = nfs_page_async_flush(pgio, page,
296 wbc->sync_mode == WB_SYNC_NONE ||
297 wbc->nonblocking != 0);
298 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page);
300 ret = 0;
301 }
302 return ret;
292} 303}
293 304
294/* 305/*
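
The write.c hunks above thread a nonblock flag from the writeback control into the request-locking path: WB_SYNC_NONE (or wbc->nonblocking) writeback gets -EAGAIN instead of sleeping on a busy request, and the caller turns that into "redirty the page and report success". A sketch of the two halves, with stubs in place of the NFS functions:

    #include <stdbool.h>

    #define EAGAIN 11

    static int wait_on_request(void) { return 0; }   /* stub: may sleep */

    /* Lower half: sleep on a locked request only if we may block. */
    static int lock_request_sketch(bool nonblock)
    {
        if (!nonblock)
            return wait_on_request();   /* sync writeback: wait it out */
        return -EAGAIN;                 /* async writeback: never stall */
    }

    /* Upper half: -EAGAIN is not an error, just "try again later". */
    static int do_writepage_sketch(bool sync_none, bool nonblocking)
    {
        int ret = lock_request_sketch(sync_none || nonblocking);

        if (ret == -EAGAIN) {
            /* redirty_page_for_writepage(wbc, page) in the real code */
            ret = 0;
        }
        return ret;
    }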
@@ -689,7 +700,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
689 req = nfs_page_find_request(page); 700 req = nfs_page_find_request(page);
690 if (req == NULL) 701 if (req == NULL)
691 return 0; 702 return 0;
692 do_flush = req->wb_page != page || req->wb_context != ctx; 703 do_flush = req->wb_page != page || req->wb_context != ctx ||
704 req->wb_lock_context->lockowner != current->files ||
705 req->wb_lock_context->pid != current->tgid;
693 nfs_release_request(req); 706 nfs_release_request(req);
694 if (!do_flush) 707 if (!do_flush)
695 return 0; 708 return 0;
@@ -813,6 +826,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
813 data->args.pages = data->pagevec; 826 data->args.pages = data->pagevec;
814 data->args.count = count; 827 data->args.count = count;
815 data->args.context = get_nfs_open_context(req->wb_context); 828 data->args.context = get_nfs_open_context(req->wb_context);
829 data->args.lock_context = req->wb_lock_context;
816 data->args.stable = NFS_UNSTABLE; 830 data->args.stable = NFS_UNSTABLE;
817 if (how & FLUSH_STABLE) { 831 if (how & FLUSH_STABLE) {
818 data->args.stable = NFS_DATA_SYNC; 832 data->args.stable = NFS_DATA_SYNC;
@@ -1036,9 +1050,9 @@ out:
1036void nfs_write_prepare(struct rpc_task *task, void *calldata) 1050void nfs_write_prepare(struct rpc_task *task, void *calldata)
1037{ 1051{
1038 struct nfs_write_data *data = calldata; 1052 struct nfs_write_data *data = calldata;
1039 struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
1040 1053
1041 if (nfs4_setup_sequence(clp, &data->args.seq_args, 1054 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
1055 &data->args.seq_args,
1042 &data->res.seq_res, 1, task)) 1056 &data->res.seq_res, 1, task))
1043 return; 1057 return;
1044 rpc_call_start(task); 1058 rpc_call_start(task);
@@ -1379,14 +1393,14 @@ static const struct rpc_call_ops nfs_commit_ops = {
1379 .rpc_release = nfs_commit_release, 1393 .rpc_release = nfs_commit_release,
1380}; 1394};
1381 1395
1382static int nfs_commit_inode(struct inode *inode, int how) 1396int nfs_commit_inode(struct inode *inode, int how)
1383{ 1397{
1384 LIST_HEAD(head); 1398 LIST_HEAD(head);
1385 int may_wait = how & FLUSH_SYNC; 1399 int may_wait = how & FLUSH_SYNC;
1386 int res = 0; 1400 int res = 0;
1387 1401
1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1402 if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
1389 goto out; 1403 goto out_mark_dirty;
1390 spin_lock(&inode->i_lock); 1404 spin_lock(&inode->i_lock);
1391 res = nfs_scan_commit(inode, &head, 0, 0); 1405 res = nfs_scan_commit(inode, &head, 0, 0);
1392 spin_unlock(&inode->i_lock); 1406 spin_unlock(&inode->i_lock);
@@ -1398,9 +1412,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, 1412 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1399 nfs_wait_bit_killable, 1413 nfs_wait_bit_killable,
1400 TASK_KILLABLE); 1414 TASK_KILLABLE);
1415 else
1416 goto out_mark_dirty;
1401 } else 1417 } else
1402 nfs_commit_clear_lock(NFS_I(inode)); 1418 nfs_commit_clear_lock(NFS_I(inode));
1403out: 1419 return res;
1420 /* Note: If we exit without ensuring that the commit is complete,
1421 * we must mark the inode as dirty. Otherwise, future calls to
1422 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1423 * that the data is on the disk.
1424 */
1425out_mark_dirty:
1426 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1404 return res; 1427 return res;
1405} 1428}
1406 1429
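
The nfs_commit_inode() rework routes every early exit — commit-lock contention, or a caller that does not wait — through __mark_inode_dirty(inode, I_DIRTY_DATASYNC), for the reason the added comment gives: a later WB_SYNC_ALL sync_inode() must still see pending work. A compact model of that control flow (stubs, not the NFS functions):

    #include <stdbool.h>

    enum { FLUSH_SYNC = 4, I_DIRTY_DATASYNC = 2 };  /* illustrative values */

    static bool commit_trylock(bool may_wait) { return may_wait; } /* stub */
    static void mark_inode_dirty(int flags)   { (void)flags; }     /* stub */

    static int commit_inode_sketch(int how)
    {
        bool may_wait = how & FLUSH_SYNC;
        int res = 0;

        if (!commit_trylock(may_wait))
            goto out_mark_dirty;    /* another committer holds the lock */
        /* ... scan the commit list and send the COMMIT ... */
        if (!may_wait)
            goto out_mark_dirty;    /* fired it off but did not wait */
        return res;                 /* commit known complete */

    out_mark_dirty:
        /* Completion not guaranteed: keep the inode dirty so a later
         * WB_SYNC_ALL pass retries instead of skipping it. */
        mark_inode_dirty(I_DIRTY_DATASYNC);
        return res;
    }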
@@ -1434,11 +1457,6 @@ out_mark_dirty:
1434 return ret; 1457 return ret;
1435} 1458}
1436#else 1459#else
1437static int nfs_commit_inode(struct inode *inode, int how)
1438{
1439 return 0;
1440}
1441
1442static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) 1460static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1443{ 1461{
1444 return 0; 1462 return 0;
@@ -1509,14 +1527,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1509 }; 1527 };
1510 int ret; 1528 int ret;
1511 1529
1512 while(PagePrivate(page)) { 1530 for (;;) {
1513 wait_on_page_writeback(page); 1531 wait_on_page_writeback(page);
1514 if (clear_page_dirty_for_io(page)) { 1532 if (clear_page_dirty_for_io(page)) {
1515 ret = nfs_writepage_locked(page, &wbc); 1533 ret = nfs_writepage_locked(page, &wbc);
1516 if (ret < 0) 1534 if (ret < 0)
1517 goto out_error; 1535 goto out_error;
1536 continue;
1518 } 1537 }
1519 ret = sync_inode(inode, &wbc); 1538 if (!PagePrivate(page))
1539 break;
1540 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1520 if (ret < 0) 1541 if (ret < 0)
1521 goto out_error; 1542 goto out_error;
1522 } 1543 }
@@ -1534,7 +1555,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1534 1555
1535 nfs_fscache_release_page(page, GFP_KERNEL); 1556 nfs_fscache_release_page(page, GFP_KERNEL);
1536 1557
1537 req = nfs_find_and_lock_request(page); 1558 req = nfs_find_and_lock_request(page, false);
1538 ret = PTR_ERR(req); 1559 ret = PTR_ERR(req);
1539 if (IS_ERR(req)) 1560 if (IS_ERR(req))
1540 goto out; 1561 goto out;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 503b9da159a3..4264377552e2 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -69,7 +69,7 @@ config NFSD_V4
69 depends on NFSD && PROC_FS && EXPERIMENTAL 69 depends on NFSD && PROC_FS && EXPERIMENTAL
70 select NFSD_V3 70 select NFSD_V3
71 select FS_POSIX_ACL 71 select FS_POSIX_ACL
72 select RPCSEC_GSS_KRB5 72 select SUNRPC_GSS
73 help 73 help
74 This option enables support in your system's NFS server for 74 This option enables support in your system's NFS server for
75 version 4 of the NFS protocol (RFC 3530). 75 version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 872a5ef550c7..c2a4f71d87dd 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_cache = {
259 .alloc = expkey_alloc, 259 .alloc = expkey_alloc,
260}; 260};
261 261
262static struct svc_expkey * 262static int
263svc_expkey_lookup(struct svc_expkey *item) 263svc_expkey_hash(struct svc_expkey *item)
264{ 264{
265 struct cache_head *ch;
266 int hash = item->ek_fsidtype; 265 int hash = item->ek_fsidtype;
267 char * cp = (char*)item->ek_fsid; 266 char * cp = (char*)item->ek_fsid;
268 int len = key_len(item->ek_fsidtype); 267 int len = key_len(item->ek_fsidtype);
@@ -270,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item)
270 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); 269 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
271 hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); 270 hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
272 hash &= EXPKEY_HASHMASK; 271 hash &= EXPKEY_HASHMASK;
272 return hash;
273}
274
275static struct svc_expkey *
276svc_expkey_lookup(struct svc_expkey *item)
277{
278 struct cache_head *ch;
279 int hash = svc_expkey_hash(item);
273 280
274 ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, 281 ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
275 hash); 282 hash);
@@ -283,13 +290,7 @@ static struct svc_expkey *
283svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) 290svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
284{ 291{
285 struct cache_head *ch; 292 struct cache_head *ch;
286 int hash = new->ek_fsidtype; 293 int hash = svc_expkey_hash(new);
287 char * cp = (char*)new->ek_fsid;
288 int len = key_len(new->ek_fsidtype);
289
290 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
291 hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS);
292 hash &= EXPKEY_HASHMASK;
293 294
294 ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, 295 ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
295 &old->h, hash); 296 &old->h, hash);
@@ -738,14 +739,22 @@ struct cache_detail svc_export_cache = {
738 .alloc = svc_export_alloc, 739 .alloc = svc_export_alloc,
739}; 740};
740 741
741static struct svc_export * 742static int
742svc_export_lookup(struct svc_export *exp) 743svc_export_hash(struct svc_export *exp)
743{ 744{
744 struct cache_head *ch;
745 int hash; 745 int hash;
746
746 hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); 747 hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
747 hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); 748 hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
748 hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); 749 hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
750 return hash;
751}
752
753static struct svc_export *
754svc_export_lookup(struct svc_export *exp)
755{
756 struct cache_head *ch;
757 int hash = svc_export_hash(exp);
749 758
750 ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, 759 ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
751 hash); 760 hash);
@@ -759,10 +768,7 @@ static struct svc_export *
759svc_export_update(struct svc_export *new, struct svc_export *old) 768svc_export_update(struct svc_export *new, struct svc_export *old)
760{ 769{
761 struct cache_head *ch; 770 struct cache_head *ch;
762 int hash; 771 int hash = svc_export_hash(old);
763 hash = hash_ptr(old->ex_client, EXPORT_HASHBITS);
764 hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS);
765 hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS);
766 772
767 ch = sunrpc_cache_update(&svc_export_cache, &new->h, 773 ch = sunrpc_cache_update(&svc_export_cache, &new->h,
768 &old->h, 774 &old->h,
@@ -1071,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp)
1071 err = 0; 1077 err = 0;
1072finish: 1078finish:
1073 kfree(new.ex_pathname); 1079 kfree(new.ex_pathname);
1074 if (exp) 1080 if (!IS_ERR_OR_NULL(exp))
1075 exp_put(exp); 1081 exp_put(exp);
1076 if (fsid_key && !IS_ERR(fsid_key)) 1082 if (!IS_ERR_OR_NULL(fsid_key))
1077 cache_put(&fsid_key->h, &svc_expkey_cache); 1083 cache_put(&fsid_key->h, &svc_expkey_cache);
1078 path_put(&path); 1084 path_put(&path);
1079out_put_clp: 1085out_put_clp:
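
Two cleanups in export.c: the fsid/export hash, previously computed by hand in both the lookup and update paths, moves into single helpers (svc_expkey_hash, svc_export_hash) so the call sites cannot drift apart, and the error-path puts switch to IS_ERR_OR_NULL, which covers "never obtained" and "lookup returned an error" in one test. A standalone sketch of the factored-hash idea, using a toy hash rather than the kernel's hash_mem/hash_ptr:

    #include <stdint.h>
    #include <stddef.h>

    #define HASHBITS 8
    #define HASHMASK ((1u << HASHBITS) - 1)

    struct expkey { const void *client; const char *fsid; size_t len; int type; };

    static unsigned toy_hash_mem(const char *p, size_t n)
    {
        unsigned h = 2166136261u;            /* FNV-1a, purely illustrative */
        while (n--)
            h = (h ^ (unsigned char)*p++) * 16777619u;
        return h;
    }

    /* One helper, two callers: lookup and update now share this. */
    static int expkey_hash(const struct expkey *k)
    {
        unsigned h = (unsigned)k->type;

        h ^= toy_hash_mem(k->fsid, k->len);
        h ^= (unsigned)((uintptr_t)k->client >> 4);
        return (int)(h & HASHMASK);
    }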
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 3d68f45a37b9..5b7e3021e06b 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -168,7 +168,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
168 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 168 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
169 169
170 fh_copy(&resp->fh, &argp->fh); 170 fh_copy(&resp->fh, &argp->fh);
171 nfserr = nfsd_read(rqstp, &resp->fh, NULL, 171 nfserr = nfsd_read(rqstp, &resp->fh,
172 argp->offset, 172 argp->offset,
173 rqstp->rq_vec, argp->vlen, 173 rqstp->rq_vec, argp->vlen,
174 &resp->count); 174 &resp->count);
@@ -271,7 +271,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
271 fh_init(&resp->fh, NFS3_FHSIZE); 271 fh_init(&resp->fh, NFS3_FHSIZE);
272 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, 272 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
273 &argp->attrs, S_IFDIR, 0, &resp->fh); 273 &argp->attrs, S_IFDIR, 0, &resp->fh);
274 274 fh_unlock(&resp->dirfh);
275 RETURN_STATUS(nfserr); 275 RETURN_STATUS(nfserr);
276} 276}
277 277
@@ -327,7 +327,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
327 type = nfs3_ftypes[argp->ftype]; 327 type = nfs3_ftypes[argp->ftype];
328 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, 328 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
329 &argp->attrs, type, rdev, &resp->fh); 329 &argp->attrs, type, rdev, &resp->fh);
330 330 fh_unlock(&resp->dirfh);
331 RETURN_STATUS(nfserr); 331 RETURN_STATUS(nfserr);
332} 332}
333 333
@@ -348,6 +348,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
348 /* Unlink. -S_IFDIR means file must not be a directory */ 348 /* Unlink. -S_IFDIR means file must not be a directory */
349 fh_copy(&resp->fh, &argp->fh); 349 fh_copy(&resp->fh, &argp->fh);
350 nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); 350 nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
351 fh_unlock(&resp->fh);
351 RETURN_STATUS(nfserr); 352 RETURN_STATUS(nfserr);
352} 353}
353 354
@@ -367,6 +368,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
367 368
368 fh_copy(&resp->fh, &argp->fh); 369 fh_copy(&resp->fh, &argp->fh);
369 nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); 370 nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
371 fh_unlock(&resp->fh);
370 RETURN_STATUS(nfserr); 372 RETURN_STATUS(nfserr);
371} 373}
372 374
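
The nfs3proc.c hunks add an explicit fh_unlock() on the directory handle once nfsd_create()/nfsd_unlink() returns, so the parent directory is released (and, in nfsd, its post-op attributes captured) before the reply is built rather than at filehandle release. A rough model of that idempotent pairing, with hypothetical names:

    #include <pthread.h>

    struct svcfh_sketch {
        pthread_mutex_t *dir_mutex;
        int locked;
    };

    /* Safe to call whether or not the handle is still locked. */
    static void fh_unlock_sketch(struct svcfh_sketch *fh)
    {
        if (fh->locked) {
            /* the real fh_unlock also snapshots post-op attrs here */
            fh->locked = 0;
            pthread_mutex_unlock(fh->dir_mutex);
        }
    }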
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7e32bd394e86..988cbb3a19b6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
32 */ 32 */
33 33
34#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
35#include <linux/sunrpc/svc_xprt.h>
35#include <linux/slab.h> 36#include <linux/slab.h>
36#include "nfsd.h" 37#include "nfsd.h"
37#include "state.h" 38#include "state.h"
@@ -79,11 +80,6 @@ enum nfs_cb_opnum4 {
79 cb_sequence_dec_sz + \ 80 cb_sequence_dec_sz + \
80 op_dec_sz) 81 op_dec_sz)
81 82
82struct nfs4_rpc_args {
83 void *args_op;
84 struct nfsd4_cb_sequence args_seq;
85};
86
87/* 83/*
88* Generic encode routines from fs/nfs/nfs4xdr.c 84* Generic encode routines from fs/nfs/nfs4xdr.c
89*/ 85*/
@@ -147,8 +143,6 @@ struct nfs4_cb_compound_hdr {
147 u32 minorversion; 143 u32 minorversion;
148 /* res */ 144 /* res */
149 int status; 145 int status;
150 u32 taglen;
151 char *tag;
152}; 146};
153 147
154static struct { 148static struct {
@@ -209,6 +203,16 @@ nfs_cb_stat_to_errno(int stat)
209 */ 203 */
210 204
211static void 205static void
206encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
207{
208 __be32 *p;
209
210 RESERVE_SPACE(sizeof(stateid_t));
211 WRITE32(sid->si_generation);
212 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
213}
214
215static void
212encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 216encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
213{ 217{
214 __be32 * p; 218 __be32 * p;
@@ -233,10 +237,10 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
233 __be32 *p; 237 __be32 *p;
234 int len = dp->dl_fh.fh_size; 238 int len = dp->dl_fh.fh_size;
235 239
236 RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); 240 RESERVE_SPACE(4);
237 WRITE32(OP_CB_RECALL); 241 WRITE32(OP_CB_RECALL);
238 WRITE32(dp->dl_stateid.si_generation); 242 encode_stateid(xdr, &dp->dl_stateid);
239 WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); 243 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
240 WRITE32(0); /* truncate optimization not implemented */ 244 WRITE32(0); /* truncate optimization not implemented */
241 WRITE32(len); 245 WRITE32(len);
242 WRITEMEM(&dp->dl_fh.fh_base, len); 246 WRITEMEM(&dp->dl_fh.fh_base, len);
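
encode_cb_recall() now reserves buffer space field by field and delegates the stateid to encode_stateid(). XDR lays everything out in 4-byte units, so an opaque body of len bytes consumes XDR_QUADLEN(len) quads — which is why the second reservation is 8 + (XDR_QUADLEN(len) << 2). A self-contained sketch of that alignment rule over a flat buffer (not the kernel's xdr_stream API):

    #include <stdint.h>
    #include <string.h>

    #define XDR_QUADLEN(n) (((n) + 3) >> 2)    /* bytes -> 4-byte XDR units */

    struct xdrbuf { unsigned char *p; };

    static void write32(struct xdrbuf *x, uint32_t v)  /* big-endian word */
    {
        x->p[0] = v >> 24; x->p[1] = v >> 16; x->p[2] = v >> 8; x->p[3] = v;
        x->p += 4;
    }

    static void writemem(struct xdrbuf *x, const void *buf, size_t len)
    {
        memcpy(x->p, buf, len);
        memset(x->p + len, 0, (XDR_QUADLEN(len) << 2) - len);  /* pad */
        x->p += XDR_QUADLEN(len) << 2;
    }

    /* Stateid on the wire: 4-byte generation, then 12 opaque bytes. */
    static void encode_stateid_sketch(struct xdrbuf *x, uint32_t generation,
                                      const unsigned char opaque[12])
    {
        write32(x, generation);
        writemem(x, opaque, 12);
    }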
@@ -297,13 +301,14 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
297static int 301static int
298decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ 302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
299 __be32 *p; 303 __be32 *p;
304 u32 taglen;
300 305
301 READ_BUF(8); 306 READ_BUF(8);
302 READ32(hdr->status); 307 READ32(hdr->status);
303 READ32(hdr->taglen); 308 /* We've got no use for the tag; ignore it: */
304 READ_BUF(hdr->taglen + 4); 309 READ32(taglen);
305 hdr->tag = (char *)p; 310 READ_BUF(taglen + 4);
306 p += XDR_QUADLEN(hdr->taglen); 311 p += XDR_QUADLEN(taglen);
307 READ32(hdr->nops); 312 READ32(hdr->nops);
308 return 0; 313 return 0;
309} 314}
@@ -428,13 +433,19 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
428}; 433};
429 434
430static struct rpc_version nfs_cb_version4 = { 435static struct rpc_version nfs_cb_version4 = {
436/*
437 * Note on the callback rpc program version number: despite language in rfc
438 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
439 * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
440 * in practice that appears to be what implementations use. The section
441 * 18.36.3 language is expected to be fixed in an erratum.
442 */
431 .number = 1, 443 .number = 1,
432 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 444 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
433 .procs = nfs4_cb_procedures 445 .procs = nfs4_cb_procedures
434}; 446};
435 447
436static struct rpc_version * nfs_cb_version[] = { 448static struct rpc_version * nfs_cb_version[] = {
437 NULL,
438 &nfs_cb_version4, 449 &nfs_cb_version4,
439}; 450};
440 451
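
With the NULL placeholder gone from nfs_cb_version[], the .version field in the client create args becomes 0: as I read the sunrpc client code, that field indexes the program's version array, while the number the comment above discusses (1) is what actually goes on the wire. A toy illustration of keeping the two separate, with invented names:

    #include <stdio.h>
    #include <stddef.h>

    struct rpc_ver_sketch { unsigned number; };   /* wire version number */

    static struct rpc_ver_sketch cb_v4 = { .number = 1 };

    /* Slot 0 is the only entry; no NULL placeholder needed. */
    static struct rpc_ver_sketch *cb_versions[] = { &cb_v4 };

    int main(void)
    {
        size_t idx = 0;        /* what .version selects: an array slot */
        printf("slot %zu encodes wire version %u\n",
               idx, cb_versions[idx]->number);
        return 0;
    }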
@@ -456,15 +467,14 @@ static struct rpc_program cb_program = {
456 467
457static int max_cb_time(void) 468static int max_cb_time(void)
458{ 469{
459 return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ; 470 return max(nfsd4_lease/10, (time_t)1) * HZ;
460} 471}
461 472
462/* Reference counting, callback cleanup, etc., all look racy as heck. 473/* Reference counting, callback cleanup, etc., all look racy as heck.
463 * And why is cb_set an atomic? */ 474 * And why is cl_cb_set an atomic? */
464 475
465int setup_callback_client(struct nfs4_client *clp) 476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
466{ 477{
467 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
468 struct rpc_timeout timeparms = { 478 struct rpc_timeout timeparms = {
469 .to_initval = max_cb_time(), 479 .to_initval = max_cb_time(),
470 .to_retries = 0, 480 .to_retries = 0,
@@ -476,7 +486,7 @@ int setup_callback_client(struct nfs4_client *clp)
476 .timeout = &timeparms, 486 .timeout = &timeparms,
477 .program = &cb_program, 487 .program = &cb_program,
478 .prognumber = cb->cb_prog, 488 .prognumber = cb->cb_prog,
479 .version = nfs_cb_version[1]->number, 489 .version = 0,
480 .authflavor = clp->cl_flavor, 490 .authflavor = clp->cl_flavor,
481 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 491 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
482 .client_name = clp->cl_principal, 492 .client_name = clp->cl_principal,
@@ -486,7 +496,7 @@ int setup_callback_client(struct nfs4_client *clp)
486 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 496 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
487 return -EINVAL; 497 return -EINVAL;
488 if (cb->cb_minorversion) { 498 if (cb->cb_minorversion) {
489 args.bc_xprt = clp->cl_cb_xprt; 499 args.bc_xprt = cb->cb_xprt;
490 args.protocol = XPRT_TRANSPORT_BC_TCP; 500 args.protocol = XPRT_TRANSPORT_BC_TCP;
491 } 501 }
492 /* Create RPC client */ 502 /* Create RPC client */
@@ -496,7 +506,7 @@ int setup_callback_client(struct nfs4_client *clp)
496 PTR_ERR(client)); 506 PTR_ERR(client));
497 return PTR_ERR(client); 507 return PTR_ERR(client);
498 } 508 }
499 cb->cb_client = client; 509 nfsd4_set_callback_client(clp, client);
500 return 0; 510 return 0;
501 511
502} 512}
@@ -514,8 +524,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
514 if (task->tk_status) 524 if (task->tk_status)
515 warn_no_callback_path(clp, task->tk_status); 525 warn_no_callback_path(clp, task->tk_status);
516 else 526 else
517 atomic_set(&clp->cl_cb_conn.cb_set, 1); 527 atomic_set(&clp->cl_cb_set, 1);
518 put_nfs4_client(clp);
519} 528}
520 529
521static const struct rpc_call_ops nfsd4_cb_probe_ops = { 530static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -537,7 +546,6 @@ int set_callback_cred(void)
537 546
538void do_probe_callback(struct nfs4_client *clp) 547void do_probe_callback(struct nfs4_client *clp)
539{ 548{
540 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
541 struct rpc_message msg = { 549 struct rpc_message msg = {
542 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 550 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
543 .rpc_argp = clp, 551 .rpc_argp = clp,
@@ -545,34 +553,27 @@ void do_probe_callback(struct nfs4_client *clp)
545 }; 553 };
546 int status; 554 int status;
547 555
548 status = rpc_call_async(cb->cb_client, &msg, 556 status = rpc_call_async(clp->cl_cb_client, &msg,
549 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 557 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
550 &nfsd4_cb_probe_ops, (void *)clp); 558 &nfsd4_cb_probe_ops, (void *)clp);
551 if (status) { 559 if (status)
552 warn_no_callback_path(clp, status); 560 warn_no_callback_path(clp, status);
553 put_nfs4_client(clp);
554 }
555} 561}
556 562
557/* 563/*
558 * Set up the callback client and put an NFSPROC4_CB_NULL on the wire... 564
559 */ 565 */
560void 566void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
561nfsd4_probe_callback(struct nfs4_client *clp)
562{ 567{
563 int status; 568 int status;
564 569
565 BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); 570 BUG_ON(atomic_read(&clp->cl_cb_set));
566 571
567 status = setup_callback_client(clp); 572 status = setup_callback_client(clp, cb);
568 if (status) { 573 if (status) {
569 warn_no_callback_path(clp, status); 574 warn_no_callback_path(clp, status);
570 return; 575 return;
571 } 576 }
572
573 /* the task holds a reference to the nfs4_client struct */
574 atomic_inc(&clp->cl_count);
575
576 do_probe_callback(clp); 577 do_probe_callback(clp);
577} 578}
578 579
@@ -658,47 +659,57 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
658 } 659 }
659} 660}
660 661
662
661static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 663static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
662{ 664{
663 struct nfs4_delegation *dp = calldata; 665 struct nfs4_delegation *dp = calldata;
664 struct nfs4_client *clp = dp->dl_client; 666 struct nfs4_client *clp = dp->dl_client;
667 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
665 668
666 nfsd4_cb_done(task, calldata); 669 nfsd4_cb_done(task, calldata);
667 670
671 if (current_rpc_client == NULL) {
672 /* We're shutting down; give up. */
673 /* XXX: err, or is it ok just to fall through
674 * and rpc_restart_call? */
675 return;
676 }
677
668 switch (task->tk_status) { 678 switch (task->tk_status) {
669 case -EIO: 679 case 0:
670 /* Network partition? */ 680 return;
671 atomic_set(&clp->cl_cb_conn.cb_set, 0);
672 warn_no_callback_path(clp, task->tk_status);
673 case -EBADHANDLE: 681 case -EBADHANDLE:
674 case -NFS4ERR_BAD_STATEID: 682 case -NFS4ERR_BAD_STATEID:
675 /* Race: client probably got cb_recall 683 /* Race: client probably got cb_recall
676 * before open reply granting delegation */ 684 * before open reply granting delegation */
677 break; 685 break;
678 default: 686 default:
679 /* success, or error we can't handle */ 687 /* Network partition? */
680 goto done; 688 atomic_set(&clp->cl_cb_set, 0);
689 warn_no_callback_path(clp, task->tk_status);
690 if (current_rpc_client != task->tk_client) {
691 /* queue a callback on the new connection: */
692 atomic_inc(&dp->dl_count);
693 nfsd4_cb_recall(dp);
694 return;
695 }
681 } 696 }
682 if (dp->dl_retries--) { 697 if (dp->dl_retries--) {
683 rpc_delay(task, 2*HZ); 698 rpc_delay(task, 2*HZ);
684 task->tk_status = 0; 699 task->tk_status = 0;
685 rpc_restart_call(task); 700 rpc_restart_call_prepare(task);
686 return; 701 return;
687 } else { 702 } else {
688 atomic_set(&clp->cl_cb_conn.cb_set, 0); 703 atomic_set(&clp->cl_cb_set, 0);
689 warn_no_callback_path(clp, task->tk_status); 704 warn_no_callback_path(clp, task->tk_status);
690 } 705 }
691done:
692 kfree(task->tk_msg.rpc_argp);
693} 706}
694 707
695static void nfsd4_cb_recall_release(void *calldata) 708static void nfsd4_cb_recall_release(void *calldata)
696{ 709{
697 struct nfs4_delegation *dp = calldata; 710 struct nfs4_delegation *dp = calldata;
698 struct nfs4_client *clp = dp->dl_client;
699 711
700 nfs4_put_delegation(dp); 712 nfs4_put_delegation(dp);
701 put_nfs4_client(clp);
702} 713}
703 714
704static const struct rpc_call_ops nfsd4_cb_recall_ops = { 715static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -707,33 +718,73 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
707 .rpc_release = nfsd4_cb_recall_release, 718 .rpc_release = nfsd4_cb_recall_release,
708}; 719};
709 720
721static struct workqueue_struct *callback_wq;
722
723int nfsd4_create_callback_queue(void)
724{
725 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
726 if (!callback_wq)
727 return -ENOMEM;
728 return 0;
729}
730
731void nfsd4_destroy_callback_queue(void)
732{
733 destroy_workqueue(callback_wq);
734}
735
736/* must be called under the state lock */
737void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
738{
739 struct rpc_clnt *old = clp->cl_cb_client;
740
741 clp->cl_cb_client = new;
742 /*
743 * After this, any work that saw the old value of cl_cb_client will
744 * be gone:
745 */
746 flush_workqueue(callback_wq);
747 /* So we can safely shut it down: */
748 if (old)
749 rpc_shutdown_client(old);
750}
751
710/* 752/*
711 * called with dp->dl_count inc'ed. 753 * called with dp->dl_count inc'ed.
712 */ 754 */
713void 755static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
714nfsd4_cb_recall(struct nfs4_delegation *dp)
715{ 756{
716 struct nfs4_client *clp = dp->dl_client; 757 struct nfs4_client *clp = dp->dl_client;
717 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; 758 struct rpc_clnt *clnt = clp->cl_cb_client;
718 struct nfs4_rpc_args *args; 759 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
719 struct rpc_message msg = { 760 struct rpc_message msg = {
720 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 761 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
721 .rpc_cred = callback_cred 762 .rpc_cred = callback_cred
722 }; 763 };
723 int status = -ENOMEM;
724 764
725 args = kzalloc(sizeof(*args), GFP_KERNEL); 765 if (clnt == NULL) {
726 if (!args) 766 nfs4_put_delegation(dp);
727 goto out; 767 return; /* Client is shutting down; give up. */
768 }
769
728 args->args_op = dp; 770 args->args_op = dp;
729 msg.rpc_argp = args; 771 msg.rpc_argp = args;
730 dp->dl_retries = 1; 772 dp->dl_retries = 1;
731 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 773 rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
732 &nfsd4_cb_recall_ops, dp); 774}
733out: 775
734 if (status) { 776void nfsd4_do_callback_rpc(struct work_struct *w)
735 kfree(args); 777{
736 put_nfs4_client(clp); 778 /* XXX: for now, just send off delegation recall. */
737 nfs4_put_delegation(dp); 779 /* In future, generalize to handle any sort of callback. */
738 } 780 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
781 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
782
783 _nfsd4_cb_recall(dp);
784}
785
786
787void nfsd4_cb_recall(struct nfs4_delegation *dp)
788{
789 queue_work(callback_wq, &dp->dl_recall.cb_work);
739} 790}
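
The structural change in nfs4callback.c: recalls are no longer issued directly but queued as nfsd4_callback work items on a dedicated single-threaded workqueue, and nfsd4_set_callback_client() flushes that queue before shutting the old RPC client down — after the flush, no queued work can still hold the stale pointer. A stripped-down kernel-module-style sketch of the pattern (names invented, not the nfsd ones):

    #include <linux/workqueue.h>
    #include <linux/errno.h>
    #include <linux/kernel.h>

    static struct workqueue_struct *cb_wq;

    struct my_callback {
        struct work_struct cb_work;
        /* ... per-callback state (delegation, client, ...) ... */
    };

    static void my_do_callback(struct work_struct *w)
    {
        struct my_callback *c = container_of(w, struct my_callback, cb_work);

        /* issue the RPC with whatever client pointer is current */
        (void)c;
    }

    static int my_cb_init(void)
    {
        cb_wq = create_singlethread_workqueue("my_callbacks");
        return cb_wq ? 0 : -ENOMEM;
    }

    static void my_cb_queue(struct my_callback *c)
    {
        INIT_WORK(&c->cb_work, my_do_callback);
        queue_work(cb_wq, &c->cb_work);
    }

    static void my_cb_swap_client(void)
    {
        /* Any work that saw the old client has finished after this: */
        flush_workqueue(cb_wq);
        /* ... now safe to rpc_shutdown_client(old) ... */
    }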
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2ab9e8501bfe..59ec449b0c7f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[];
969static const char *nfsd4_op_name(unsigned opnum); 969static const char *nfsd4_op_name(unsigned opnum);
970 970
971/* 971/*
972 * Enforce NFSv4.1 COMPOUND ordering rules. 972 * Enforce NFSv4.1 COMPOUND ordering rules:
973 * 973 *
974 * TODO: 974 * Also note, enforced elsewhere:
975 * - enforce NFS4ERR_NOT_ONLY_OP, 975 * - SEQUENCE other than as first op results in
976 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. 976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound
978 * (Will be enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().)
977 */ 982 */
978static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) 983static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
979{ 984{
980 if (args->minorversion && args->opcnt > 0) { 985 struct nfsd4_op *op = &args->ops[0];
981 struct nfsd4_op *op = &args->ops[0]; 986
982 return (op->status == nfserr_op_illegal) || 987 /* These ordering requirements don't apply to NFSv4.0: */
983 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); 988 if (args->minorversion == 0)
984 } 989 return nfs_ok;
985 return true; 990 /* This is weird, but OK, not our problem: */
991 if (args->opcnt == 0)
992 return nfs_ok;
993 if (op->status == nfserr_op_illegal)
994 return nfs_ok;
995 if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
996 return nfserr_op_not_in_session;
997 if (op->opnum == OP_SEQUENCE)
998 return nfs_ok;
999 if (args->opcnt != 1)
1000 return nfserr_not_only_op;
1001 return nfs_ok;
986} 1002}
987 1003
988/* 1004/*
@@ -1012,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1012 resp->rqstp = rqstp; 1028 resp->rqstp = rqstp;
1013 resp->cstate.minorversion = args->minorversion; 1029 resp->cstate.minorversion = args->minorversion;
1014 resp->cstate.replay_owner = NULL; 1030 resp->cstate.replay_owner = NULL;
1031 resp->cstate.session = NULL;
1015 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1016 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1017 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1034 /* Use the deferral mechanism only for NFSv4.0 compounds */
@@ -1024,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1024 if (args->minorversion > nfsd_supported_minorversion) 1041 if (args->minorversion > nfsd_supported_minorversion)
1025 goto out; 1042 goto out;
1026 1043
1027 if (!nfs41_op_ordering_ok(args)) { 1044 status = nfs41_check_op_ordering(args);
1045 if (status) {
1028 op = &args->ops[0]; 1046 op = &args->ops[0];
1029 op->status = nfserr_sequence_pos; 1047 op->status = status;
1030 goto encode_op; 1048 goto encode_op;
1031 } 1049 }
1032 1050
1033 status = nfs_ok;
1034 while (!status && resp->opcnt < args->opcnt) { 1051 while (!status && resp->opcnt < args->opcnt) {
1035 op = &args->ops[resp->opcnt++]; 1052 op = &args->ops[resp->opcnt++];
1036 1053
@@ -1295,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1295 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1312 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1296 .op_name = "OP_SEQUENCE", 1313 .op_name = "OP_SEQUENCE",
1297 }, 1314 },
1315 [OP_RECLAIM_COMPLETE] = {
1316 .op_func = (nfsd4op_func)nfsd4_reclaim_complete,
1317 .op_flags = ALLOWED_WITHOUT_FH,
1318 .op_name = "OP_RECLAIM_COMPLETE",
1319 },
1298}; 1320};
1299 1321
1300static const char *nfsd4_op_name(unsigned opnum) 1322static const char *nfsd4_op_name(unsigned opnum)
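
nfs41_check_op_ordering() turns the old yes/no check into distinct NFSv4.1 errors: a first op that may not start a compound draws NFS4ERR_OP_NOT_IN_SESSION, and an allowed-but-non-SEQUENCE first op must be the compound's only op or it draws NFS4ERR_NOT_ONLY_OP. The ladder, as a standalone sketch:

    #include <stdbool.h>

    enum status { OK, ERR_OP_NOT_IN_SESSION, ERR_NOT_ONLY_OP };

    struct op { int opnum; bool illegal; bool allowed_as_first; };

    #define OP_SEQUENCE 53            /* the NFSv4.1 SEQUENCE opcode */

    /* Mirrors the v4.1 branch of nfs41_check_op_ordering(). */
    static enum status check_ordering(const struct op *ops, int opcnt)
    {
        if (opcnt == 0)
            return OK;                   /* "weird, but not our problem" */
        if (ops[0].illegal)
            return OK;                   /* rejected per-op later anyway */
        if (!ops[0].allowed_as_first)
            return ERR_OP_NOT_IN_SESSION;
        if (ops[0].opnum == OP_SEQUENCE)
            return OK;                   /* normal sessions compound */
        if (opcnt != 1)
            return ERR_NOT_ONLY_OP;      /* e.g. a lone EXCHANGE_ID-class op */
        return OK;
    }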
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7a9ae3254a4b..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -44,8 +44,7 @@
44#define NFSDDBG_FACILITY NFSDDBG_PROC 44#define NFSDDBG_FACILITY NFSDDBG_PROC
45 45
46/* Globals */ 46/* Globals */
47static struct path rec_dir; 47static struct file *rec_file;
48static int rec_dir_init = 0;
49 48
50static int 49static int
51nfs4_save_creds(const struct cred **original_creds) 50nfs4_save_creds(const struct cred **original_creds)
@@ -117,33 +116,28 @@ out_no_tfm:
117 return status; 116 return status;
118} 117}
119 118
120static void
121nfsd4_sync_rec_dir(void)
122{
123 vfs_fsync(NULL, rec_dir.dentry, 0);
124}
125
126int 119int
127nfsd4_create_clid_dir(struct nfs4_client *clp) 120nfsd4_create_clid_dir(struct nfs4_client *clp)
128{ 121{
129 const struct cred *original_cred; 122 const struct cred *original_cred;
130 char *dname = clp->cl_recdir; 123 char *dname = clp->cl_recdir;
131 struct dentry *dentry; 124 struct dentry *dir, *dentry;
132 int status; 125 int status;
133 126
134 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 127 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
135 128
136 if (!rec_dir_init || clp->cl_firststate) 129 if (!rec_file || clp->cl_firststate)
137 return 0; 130 return 0;
138 131
139 status = nfs4_save_creds(&original_cred); 132 status = nfs4_save_creds(&original_cred);
140 if (status < 0) 133 if (status < 0)
141 return status; 134 return status;
142 135
136 dir = rec_file->f_path.dentry;
143 /* lock the parent */ 137 /* lock the parent */
144 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 138 mutex_lock(&dir->d_inode->i_mutex);
145 139
146 dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); 140 dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
147 if (IS_ERR(dentry)) { 141 if (IS_ERR(dentry)) {
148 status = PTR_ERR(dentry); 142 status = PTR_ERR(dentry);
149 goto out_unlock; 143 goto out_unlock;
@@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
153 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); 147 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
154 goto out_put; 148 goto out_put;
155 } 149 }
156 status = mnt_want_write(rec_dir.mnt); 150 status = mnt_want_write(rec_file->f_path.mnt);
157 if (status) 151 if (status)
158 goto out_put; 152 goto out_put;
159 status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU); 153 status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
160 mnt_drop_write(rec_dir.mnt); 154 mnt_drop_write(rec_file->f_path.mnt);
161out_put: 155out_put:
162 dput(dentry); 156 dput(dentry);
163out_unlock: 157out_unlock:
164 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); 158 mutex_unlock(&dir->d_inode->i_mutex);
165 if (status == 0) { 159 if (status == 0) {
166 clp->cl_firststate = 1; 160 clp->cl_firststate = 1;
167 nfsd4_sync_rec_dir(); 161 vfs_fsync(rec_file, 0);
168 } 162 }
169 nfs4_reset_creds(original_cred); 163 nfs4_reset_creds(original_cred);
170 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); 164 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
206 struct dentry *dentry; 200 struct dentry *dentry;
207 int status; 201 int status;
208 202
209 if (!rec_dir_init) 203 if (!rec_file)
210 return 0; 204 return 0;
211 205
212 status = nfs4_save_creds(&original_cred); 206 status = nfs4_save_creds(&original_cred);
213 if (status < 0) 207 if (status < 0)
214 return status; 208 return status;
215 209
216 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 210 filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
217 current_cred()); 211 current_cred());
218 status = PTR_ERR(filp); 212 status = PTR_ERR(filp);
219 if (IS_ERR(filp)) 213 if (IS_ERR(filp))
@@ -250,13 +244,14 @@ out:
250static int 244static int
251nfsd4_unlink_clid_dir(char *name, int namlen) 245nfsd4_unlink_clid_dir(char *name, int namlen)
252{ 246{
253 struct dentry *dentry; 247 struct dentry *dir, *dentry;
254 int status; 248 int status;
255 249
256 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 250 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
257 251
258 mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 252 dir = rec_file->f_path.dentry;
259 dentry = lookup_one_len(name, rec_dir.dentry, namlen); 253 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
254 dentry = lookup_one_len(name, dir, namlen);
260 if (IS_ERR(dentry)) { 255 if (IS_ERR(dentry)) {
261 status = PTR_ERR(dentry); 256 status = PTR_ERR(dentry);
262 goto out_unlock; 257 goto out_unlock;
@@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
264 status = -ENOENT; 259 status = -ENOENT;
265 if (!dentry->d_inode) 260 if (!dentry->d_inode)
266 goto out; 261 goto out;
267 status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); 262 status = vfs_rmdir(dir->d_inode, dentry);
268out: 263out:
269 dput(dentry); 264 dput(dentry);
270out_unlock: 265out_unlock:
271 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); 266 mutex_unlock(&dir->d_inode->i_mutex);
272 return status; 267 return status;
273} 268}
274 269
@@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
278 const struct cred *original_cred; 273 const struct cred *original_cred;
279 int status; 274 int status;
280 275
281 if (!rec_dir_init || !clp->cl_firststate) 276 if (!rec_file || !clp->cl_firststate)
282 return; 277 return;
283 278
284 status = mnt_want_write(rec_dir.mnt); 279 status = mnt_want_write(rec_file->f_path.mnt);
285 if (status) 280 if (status)
286 goto out; 281 goto out;
287 clp->cl_firststate = 0; 282 clp->cl_firststate = 0;
@@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
293 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); 288 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
294 nfs4_reset_creds(original_cred); 289 nfs4_reset_creds(original_cred);
295 if (status == 0) 290 if (status == 0)
296 nfsd4_sync_rec_dir(); 291 vfs_fsync(rec_file, 0);
297 mnt_drop_write(rec_dir.mnt); 292 mnt_drop_write(rec_file->f_path.mnt);
298out: 293out:
299 if (status) 294 if (status)
300 printk("NFSD: Failed to remove expired client state directory" 295 printk("NFSD: Failed to remove expired client state directory"
@@ -323,19 +318,19 @@ void
323nfsd4_recdir_purge_old(void) { 318nfsd4_recdir_purge_old(void) {
324 int status; 319 int status;
325 320
326 if (!rec_dir_init) 321 if (!rec_file)
327 return; 322 return;
328 status = mnt_want_write(rec_dir.mnt); 323 status = mnt_want_write(rec_file->f_path.mnt);
329 if (status) 324 if (status)
330 goto out; 325 goto out;
331 status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); 326 status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
332 if (status == 0) 327 if (status == 0)
333 nfsd4_sync_rec_dir(); 328 vfs_fsync(rec_file, 0);
334 mnt_drop_write(rec_dir.mnt); 329 mnt_drop_write(rec_file->f_path.mnt);
335out: 330out:
336 if (status) 331 if (status)
337 printk("nfsd4: failed to purge old clients from recovery" 332 printk("nfsd4: failed to purge old clients from recovery"
338 " directory %s\n", rec_dir.dentry->d_name.name); 333 " directory %s\n", rec_file->f_path.dentry->d_name.name);
339} 334}
340 335
341static int 336static int
@@ -355,10 +350,13 @@ int
355nfsd4_recdir_load(void) { 350nfsd4_recdir_load(void) {
356 int status; 351 int status;
357 352
358 status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); 353 if (!rec_file)
354 return 0;
355
356 status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
359 if (status) 357 if (status)
360 printk("nfsd4: failed loading clients from recovery" 358 printk("nfsd4: failed loading clients from recovery"
361 " directory %s\n", rec_dir.dentry->d_name.name); 359 " directory %s\n", rec_file->f_path.dentry->d_name.name);
362 return status; 360 return status;
363} 361}
364 362
@@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
375 printk("NFSD: Using %s as the NFSv4 state recovery directory\n", 373 printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
376 rec_dirname); 374 rec_dirname);
377 375
378 BUG_ON(rec_dir_init); 376 BUG_ON(rec_file);
379 377
380 status = nfs4_save_creds(&original_cred); 378 status = nfs4_save_creds(&original_cred);
381 if (status < 0) { 379 if (status < 0) {
@@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
385 return; 383 return;
386 } 384 }
387 385
388 status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, 386 rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
389 &rec_dir); 387 if (IS_ERR(rec_file)) {
390 if (status)
391 printk("NFSD: unable to find recovery directory %s\n", 388 printk("NFSD: unable to find recovery directory %s\n",
392 rec_dirname); 389 rec_dirname);
390 rec_file = NULL;
391 }
393 392
394 if (!status)
395 rec_dir_init = 1;
396 nfs4_reset_creds(original_cred); 393 nfs4_reset_creds(original_cred);
397} 394}
398 395
399void 396void
400nfsd4_shutdown_recdir(void) 397nfsd4_shutdown_recdir(void)
401{ 398{
402 if (!rec_dir_init) 399 if (!rec_file)
403 return; 400 return;
404 rec_dir_init = 0; 401 fput(rec_file);
405 path_put(&rec_dir); 402 rec_file = NULL;
406} 403}
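
nfs4recover.c collapses the struct path plus rec_dir_init flag into one struct file held open for the server's lifetime: its non-NULLness is the initialization test, and the sync call becomes vfs_fsync(rec_file, 0) — note the old call passed a dentry, while the new vfs_fsync() signature takes only the file and a datasync flag. The same shape in plain POSIX terms:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static int rec_fd = -1;         /* -1 doubles as "not initialized" */

    static void recdir_init(const char *dirname)
    {
        rec_fd = open(dirname, O_RDONLY | O_DIRECTORY);
        if (rec_fd < 0)
            perror("recovery directory");
    }

    static void recdir_commit(void)
    {
        if (rec_fd >= 0)
            fsync(rec_fd);          /* persist created/removed entries */
    }

    static void recdir_shutdown(void)
    {
        if (rec_fd >= 0) {
            close(rec_fd);
            rec_fd = -1;
        }
    }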
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6a8fedaa4f55..cf0d2ffb3c84 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -45,13 +45,12 @@
45#define NFSDDBG_FACILITY NFSDDBG_PROC 45#define NFSDDBG_FACILITY NFSDDBG_PROC
46 46
47/* Globals */ 47/* Globals */
48static time_t lease_time = 90; /* default lease time */ 48time_t nfsd4_lease = 90; /* default lease time */
49static time_t user_lease_time = 90; 49time_t nfsd4_grace = 90;
50static time_t boot_time; 50static time_t boot_time;
51static u32 current_ownerid = 1; 51static u32 current_ownerid = 1;
52static u32 current_fileid = 1; 52static u32 current_fileid = 1;
53static u32 current_delegid = 1; 53static u32 current_delegid = 1;
54static u32 nfs4_init;
55static stateid_t zerostateid; /* bits all 0 */ 54static stateid_t zerostateid; /* bits all 0 */
56static stateid_t onestateid; /* bits all 1 */ 55static stateid_t onestateid; /* bits all 1 */
57static u64 current_sessionid = 1; 56static u64 current_sessionid = 1;
@@ -163,6 +162,46 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
163static struct list_head file_hashtbl[FILE_HASH_SIZE]; 162static struct list_head file_hashtbl[FILE_HASH_SIZE];
164static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; 163static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
165 164
165static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
166{
167 BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
168 atomic_inc(&fp->fi_access[oflag]);
169}
170
171static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
172{
173 if (oflag == O_RDWR) {
174 __nfs4_file_get_access(fp, O_RDONLY);
175 __nfs4_file_get_access(fp, O_WRONLY);
176 } else
177 __nfs4_file_get_access(fp, oflag);
178}
179
180static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
181{
182 if (fp->fi_fds[oflag]) {
183 fput(fp->fi_fds[oflag]);
184 fp->fi_fds[oflag] = NULL;
185 }
186}
187
188static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
189{
190 if (atomic_dec_and_test(&fp->fi_access[oflag])) {
191 nfs4_file_put_fd(fp, O_RDWR);
192 nfs4_file_put_fd(fp, oflag);
193 }
194}
195
196static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
197{
198 if (oflag == O_RDWR) {
199 __nfs4_file_put_access(fp, O_RDONLY);
200 __nfs4_file_put_access(fp, O_WRONLY);
201 } else
202 __nfs4_file_put_access(fp, oflag);
203}
204
166static struct nfs4_delegation * 205static struct nfs4_delegation *
167alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) 206alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
168{ 207{
@@ -171,6 +210,13 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
171 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn; 210 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
172 211
173 dprintk("NFSD alloc_init_deleg\n"); 212 dprintk("NFSD alloc_init_deleg\n");
213 /*
214 * Major work on the lease subsystem (for example, to support
215 * callbacks on stat) will be required before we can support
216 * write delegations properly.
217 */
218 if (type != NFS4_OPEN_DELEGATE_READ)
219 return NULL;
174 if (fp->fi_had_conflict) 220 if (fp->fi_had_conflict)
175 return NULL; 221 return NULL;
176 if (num_delegations > max_delegations) 222 if (num_delegations > max_delegations)
@@ -185,12 +231,11 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
185 dp->dl_client = clp; 231 dp->dl_client = clp;
186 get_nfs4_file(fp); 232 get_nfs4_file(fp);
187 dp->dl_file = fp; 233 dp->dl_file = fp;
234 nfs4_file_get_access(fp, O_RDONLY);
188 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
189 get_file(stp->st_vfs_file);
190 dp->dl_vfs_file = stp->st_vfs_file;
191 dp->dl_type = type; 236 dp->dl_type = type;
192 dp->dl_ident = cb->cb_ident; 237 dp->dl_ident = cb->cb_ident;
193 dp->dl_stateid.si_boot = get_seconds(); 238 dp->dl_stateid.si_boot = boot_time;
194 dp->dl_stateid.si_stateownerid = current_delegid++; 239 dp->dl_stateid.si_stateownerid = current_delegid++;
195 dp->dl_stateid.si_fileid = 0; 240 dp->dl_stateid.si_fileid = 0;
196 dp->dl_stateid.si_generation = 0; 241 dp->dl_stateid.si_generation = 0;
@@ -199,6 +244,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
199 atomic_set(&dp->dl_count, 1); 244 atomic_set(&dp->dl_count, 1);
200 list_add(&dp->dl_perfile, &fp->fi_delegations); 245 list_add(&dp->dl_perfile, &fp->fi_delegations);
201 list_add(&dp->dl_perclnt, &clp->cl_delegations); 246 list_add(&dp->dl_perclnt, &clp->cl_delegations);
247 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
202 return dp; 248 return dp;
203} 249}
204 250
@@ -221,15 +267,12 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
221static void 267static void
222nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
223{ 269{
224 struct file *filp = dp->dl_vfs_file; 270 struct file *filp = find_readable_file(dp->dl_file);
225 271
226 dprintk("NFSD: close_delegation dp %p\n",dp); 272 dprintk("NFSD: close_delegation dp %p\n",dp);
227 dp->dl_vfs_file = NULL;
228 /* The following nfsd_close may not actually close the file,
229 * but we want to remove the lease in any case. */
230 if (dp->dl_flock) 273 if (dp->dl_flock)
231 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 274 vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
232 nfsd_close(filp); 275 nfs4_file_put_access(dp->dl_file, O_RDONLY);
233} 276}
234 277
235/* Called under the state lock. */ 278/* Called under the state lock. */
@@ -249,6 +292,9 @@ unhash_delegation(struct nfs4_delegation *dp)
249 * SETCLIENTID state 292 * SETCLIENTID state
250 */ 293 */
251 294
295/* client_lock protects the client lru list and session hash table */
296static DEFINE_SPINLOCK(client_lock);
297
252/* Hash tables for nfs4_clientid state */ 298/* Hash tables for nfs4_clientid state */
253#define CLIENT_HASH_BITS 4 299#define CLIENT_HASH_BITS 4
254#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) 300#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
@@ -298,8 +344,12 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
298 344
299static void release_lock_stateid(struct nfs4_stateid *stp) 345static void release_lock_stateid(struct nfs4_stateid *stp)
300{ 346{
347 struct file *file;
348
301 unhash_generic_stateid(stp); 349 unhash_generic_stateid(stp);
302 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); 350 file = find_any_file(stp->st_file);
351 if (file)
352 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
303 free_generic_stateid(stp); 353 free_generic_stateid(stp);
304} 354}
305 355
@@ -337,11 +387,85 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp)
337 } 387 }
338} 388}
339 389
390/*
391 * We store the NONE, READ, WRITE, and BOTH bits separately in the
392 * st_{access,deny}_bmap field of the stateid, in order to track not
393 * only what share bits are currently in force, but also what
394 * combinations of share bits previous opens have used. This allows us
395 * to enforce the recommendation of rfc 3530 14.2.19 that the server
396 * return an error if the client attempts to downgrade to a combination
397 * of share bits not explicable by closing some of its previous opens.
398 *
399 * XXX: This enforcement is actually incomplete, since we don't keep
400 * track of access/deny bit combinations; so, e.g., we allow:
401 *
402 * OPEN allow read, deny write
403 * OPEN allow both, deny none
404 * DOWNGRADE allow read, deny none
405 *
406 * which we should reject.
407 */
408static void
409set_access(unsigned int *access, unsigned long bmap) {
410 int i;
411
412 *access = 0;
413 for (i = 1; i < 4; i++) {
414 if (test_bit(i, &bmap))
415 *access |= i;
416 }
417}
418
419static void
420set_deny(unsigned int *deny, unsigned long bmap) {
421 int i;
422
423 *deny = 0;
424 for (i = 0; i < 4; i++) {
425 if (test_bit(i, &bmap))
426 *deny |= i;
427 }
428}
429
430static int
431test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
432 unsigned int access, deny;
433
434 set_access(&access, stp->st_access_bmap);
435 set_deny(&deny, stp->st_deny_bmap);
436 if ((access & open->op_share_deny) || (deny & open->op_share_access))
437 return 0;
438 return 1;
439}
440
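
The long comment above is the key to these helpers: NFSv4 share_access values (READ = 1, WRITE = 2, BOTH = 3) serve directly as bit indices in st_access_bmap, so the bitmap remembers every combination earlier opens used, and set_access()/set_deny() fold the set bit indices back into one mask for test_share()'s conflict check. A self-contained model:

    #include <stdio.h>

    #define SHARE_READ  1     /* the real NFS4_SHARE_ACCESS_* values */
    #define SHARE_WRITE 2
    #define SHARE_BOTH  3

    /* Record that some open used exactly this access combination. */
    static void remember(unsigned long *bmap, unsigned combo)
    {
        *bmap |= 1UL << combo;          /* bit index == combination value */
    }

    /* Fold the remembered combinations back into a single mask. */
    static unsigned effective(unsigned long bmap)
    {
        unsigned mask = 0;
        int i;

        for (i = 1; i < 4; i++)
            if (bmap & (1UL << i))
                mask |= i;
        return mask;
    }

    int main(void)
    {
        unsigned long bmap = 0;

        remember(&bmap, SHARE_READ);
        remember(&bmap, SHARE_BOTH);
        printf("effective access: %u\n", effective(bmap));  /* prints 3 */
        return 0;
    }

A new open then conflicts exactly when its deny bits intersect this effective access mask (or its access bits intersect the effective deny mask), which is what test_share() computes.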
441static int nfs4_access_to_omode(u32 access)
442{
443 switch (access & NFS4_SHARE_ACCESS_BOTH) {
444 case NFS4_SHARE_ACCESS_READ:
445 return O_RDONLY;
446 case NFS4_SHARE_ACCESS_WRITE:
447 return O_WRONLY;
448 case NFS4_SHARE_ACCESS_BOTH:
449 return O_RDWR;
450 }
451 BUG();
452}
453
454static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
455{
456 unsigned int access;
457
458 set_access(&access, stp->st_access_bmap);
459 return nfs4_access_to_omode(access);
460}
461
340static void release_open_stateid(struct nfs4_stateid *stp) 462static void release_open_stateid(struct nfs4_stateid *stp)
341{ 463{
464 int oflag = nfs4_access_bmap_to_omode(stp);
465
342 unhash_generic_stateid(stp); 466 unhash_generic_stateid(stp);
343 release_stateid_lockowners(stp); 467 release_stateid_lockowners(stp);
344 nfsd_close(stp->st_vfs_file); 468 nfs4_file_put_access(stp->st_file, oflag);
345 free_generic_stateid(stp); 469 free_generic_stateid(stp);
346} 470}
347 471
@@ -367,7 +491,6 @@ static void release_openowner(struct nfs4_stateowner *sop)
 	nfs4_put_stateowner(sop);
 }
 
-static DEFINE_SPINLOCK(sessionid_lock);
 #define SESSION_HASH_SIZE	512
 static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
 
@@ -454,7 +577,7 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
 	spin_unlock(&nfsd_drc_lock);
 
 	if (fchan->maxreqs == 0)
-		return nfserr_serverfault;
+		return nfserr_jukebox;
 
 	fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
 	return 0;
@@ -539,7 +662,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
 			+ sizeof(struct nfsd4_session) > PAGE_SIZE);
 
-	status = nfserr_serverfault;
+	status = nfserr_jukebox;
 	/* allocate struct nfsd4_session and slot table pointers in one piece */
 	slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
 	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
@@ -565,10 +688,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 
 	new->se_flags = cses->flags;
 	kref_init(&new->se_ref);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
 	status = nfs_ok;
 out:
@@ -579,7 +702,7 @@ out_free:
 	goto out;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static struct nfsd4_session *
 find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 {
@@ -588,10 +711,8 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 
 	dump_sessionid(__func__, sessionid);
 	idx = hash_sessionid(sessionid);
-	dprintk("%s: idx is %d\n", __func__, idx);
 	/* Search in the appropriate list */
 	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
-		dump_sessionid("list traversal", &elem->se_sessionid);
 		if (!memcmp(elem->se_sessionid.data, sessionid->data,
 			    NFS4_MAX_SESSIONID_LEN)) {
 			return elem;
@@ -602,7 +723,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 	return NULL;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static void
 unhash_session(struct nfsd4_session *ses)
 {
@@ -610,15 +731,6 @@ unhash_session(struct nfsd4_session *ses)
 	list_del(&ses->se_perclnt);
 }
 
-static void
-release_session(struct nfsd4_session *ses)
-{
-	spin_lock(&sessionid_lock);
-	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
-	nfsd4_put_session(ses);
-}
-
 void
 free_session(struct kref *kref)
 {
@@ -634,9 +746,18 @@ free_session(struct kref *kref)
 	kfree(ses);
 }
 
+/* must be called under the client_lock */
 static inline void
-renew_client(struct nfs4_client *clp)
+renew_client_locked(struct nfs4_client *clp)
 {
+	if (is_client_expired(clp)) {
+		dprintk("%s: client (clientid %08x/%08x) already expired\n",
+			__func__,
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+		return;
+	}
+
 	/*
 	 * Move client to the end to the LRU list.
 	 */
@@ -647,6 +768,14 @@ renew_client(struct nfs4_client *clp)
 	clp->cl_time = get_seconds();
 }
 
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+	spin_lock(&client_lock);
+	renew_client_locked(clp);
+	spin_unlock(&client_lock);
+}
+
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
 static int
 STALE_CLIENTID(clientid_t *clid)
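Splitting renew_client() into a locked worker plus a locking wrapper is the usual `_locked` pattern: callers that already hold client_lock call the worker directly, everyone else goes through the wrapper. A minimal pthread model of the same split (types and field names here are illustrative only, not the kernel's):

	#include <pthread.h>
	#include <time.h>

	static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

	struct client { int expired; time_t cl_time; };

	/* Caller must hold client_lock; expired clients are never renewed. */
	static void renew_client_locked(struct client *clp)
	{
		if (clp->expired)
			return;
		clp->cl_time = time(NULL);
	}

	/* Convenience wrapper for callers that do not hold the lock. */
	static void renew_client(struct client *clp)
	{
		pthread_mutex_lock(&client_lock);
		renew_client_locked(clp);
		pthread_mutex_unlock(&client_lock);
	}

	int main(void)
	{
		struct client c = { 0, 0 };
		renew_client(&c);
		return c.cl_time == 0;
	}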
@@ -680,27 +809,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	return clp;
 }
 
-static void
-shutdown_callback_client(struct nfs4_client *clp)
-{
-	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-
-	if (clnt) {
-		/*
-		 * Callback threads take a reference on the client, so there
-		 * should be no outstanding callbacks at this point.
-		 */
-		clp->cl_cb_conn.cb_client = NULL;
-		rpc_shutdown_client(clnt);
-	}
-}
-
 static inline void
 free_client(struct nfs4_client *clp)
 {
-	shutdown_callback_client(clp);
-	if (clp->cl_cb_xprt)
-		svc_xprt_put(clp->cl_cb_xprt);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -709,10 +820,33 @@ free_client(struct nfs4_client *clp)
 }
 
 void
-put_nfs4_client(struct nfs4_client *clp)
+release_session_client(struct nfsd4_session *session)
 {
-	if (atomic_dec_and_test(&clp->cl_count))
+	struct nfs4_client *clp = session->se_client;
+
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+		return;
+	if (is_client_expired(clp)) {
 		free_client(clp);
+		session->se_client = NULL;
+	} else
+		renew_client_locked(clp);
+	spin_unlock(&client_lock);
+}
+
+/* must be called under the client_lock */
+static inline void
+unhash_client_locked(struct nfs4_client *clp)
+{
+	mark_client_expired(clp);
+	list_del(&clp->cl_lru);
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				se_perclnt);
+		unhash_session(ses);
+		nfsd4_put_session(ses);
+	}
 }
 
 static void
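release_session_client() relies on atomic_dec_and_lock(): the refcount is dropped without the lock unless it would hit zero, in which case the function returns with client_lock held, so the final free-versus-renew decision is made atomically. A user-space model of that idiom, assuming C11 atomics (a sketch, not the kernel implementation):

	#include <pthread.h>
	#include <stdatomic.h>

	/* Returns 1 with *lk held iff the count dropped to zero. */
	static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lk)
	{
		int old = atomic_load(cnt);

		/* Fast path: we are clearly not the last reference. */
		while (old > 1)
			if (atomic_compare_exchange_weak(cnt, &old, old - 1))
				return 0;
		/* Slow path: decide under the lock. */
		pthread_mutex_lock(lk);
		if (atomic_fetch_sub(cnt, 1) == 1)
			return 1;
		pthread_mutex_unlock(lk);
		return 0;
	}

	int main(void)
	{
		static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
		atomic_int cnt = 1;

		if (dec_and_lock(&cnt, &lk))
			pthread_mutex_unlock(&lk);	/* last-reference path */
		return 0;
	}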
@@ -722,9 +856,6 @@ expire_client(struct nfs4_client *clp)
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
 
-	dprintk("NFSD: expire_client cl_count %d\n",
-			atomic_read(&clp->cl_count));
-
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	while (!list_empty(&clp->cl_delegations)) {
@@ -740,20 +871,20 @@ expire_client(struct nfs4_client *clp)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	list_del(&clp->cl_idhash);
-	list_del(&clp->cl_strhash);
-	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_openowner(sop);
 	}
-	while (!list_empty(&clp->cl_sessions)) {
-		struct nfsd4_session *ses;
-		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-				 se_perclnt);
-		release_session(ses);
-	}
-	put_nfs4_client(clp);
+	nfsd4_set_callback_client(clp, NULL);
+	if (clp->cl_cb_conn.cb_xprt)
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+	list_del(&clp->cl_idhash);
+	list_del(&clp->cl_strhash);
+	spin_lock(&client_lock);
+	unhash_client_locked(clp);
+	if (atomic_read(&clp->cl_refcount) == 0)
+		free_client(clp);
+	spin_unlock(&client_lock);
 }
 
 static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -839,14 +970,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	}
 
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
-	atomic_set(&clp->cl_count, 1);
-	atomic_set(&clp->cl_cb_conn.cb_set, 0);
+	atomic_set(&clp->cl_refcount, 0);
+	atomic_set(&clp->cl_cb_set, 0);
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	copy_verf(clp, verf);
@@ -877,8 +1009,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 	list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
 	idhashval = clientid_hashval(clp->cl_clientid.cl_id);
 	list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
-	list_add_tail(&clp->cl_lru, &client_lru);
-	clp->cl_time = get_seconds();
+	renew_client(clp);
 }
 
 static void
@@ -888,10 +1019,9 @@ move_to_confirmed(struct nfs4_client *clp)
 	unsigned int strhashval;
 
 	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
-	list_del_init(&clp->cl_strhash);
 	list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
 	strhashval = clientstr_hashval(clp->cl_recdir);
-	list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+	list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
 	renew_client(clp);
 }
 
@@ -1207,7 +1337,7 @@ out_new:
 	/* Normal case */
 	new = create_client(exid->clname, dname, rqstp, &verf);
 	if (new == NULL) {
-		status = nfserr_serverfault;
+		status = nfserr_jukebox;
 		goto out;
 	}
 
@@ -1327,15 +1457,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cs_slot->sl_seqid++; /* from 0 to 1 */
 		move_to_confirmed(unconf);
 
-		/*
-		 * We do not support RDMA or persistent sessions
-		 */
-		cr_ses->flags &= ~SESSION4_PERSIST;
-		cr_ses->flags &= ~SESSION4_RDMA;
-
 		if (cr_ses->flags & SESSION4_BACK_CHAN) {
-			unconf->cl_cb_xprt = rqstp->rq_xprt;
-			svc_xprt_get(unconf->cl_cb_xprt);
+			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+			svc_xprt_get(rqstp->rq_xprt);
 			rpc_copy_addr(
 				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
 				sa);
@@ -1344,7 +1468,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 				cstate->minorversion;
 			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
 			unconf->cl_cb_seq_nr = 1;
-			nfsd4_probe_callback(unconf);
+			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
 		}
 		conf = unconf;
 	} else {
@@ -1352,6 +1476,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/*
+	 * We do not support RDMA or persistent sessions
+	 */
+	cr_ses->flags &= ~SESSION4_PERSIST;
+	cr_ses->flags &= ~SESSION4_RDMA;
+
 	status = alloc_init_session(rqstp, conf, cr_ses);
 	if (status)
 		goto out;
@@ -1369,6 +1499,21 @@ out:
 	return status;
 }
 
+static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+	return argp->opcnt == resp->opcnt;
+}
+
+static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
+{
+	if (!session)
+		return 0;
+	return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
+}
+
 __be32
 nfsd4_destroy_session(struct svc_rqst *r,
 		      struct nfsd4_compound_state *cstate,
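The two predicates above encode the rule that a DESTROY_SESSION aimed at the very session the compound arrived on is only allowed as the compound's final operation. A toy model of the check (hypothetical names, just to show the shape of the test):

	/* DESTROY_SESSION of the current session must be the last op. */
	static int destroy_session_allowed(int decoded_ops, int processed_ops,
					   int targets_current_session)
	{
		int is_last_op = (decoded_ops == processed_ops);

		return !targets_current_session || is_last_op;
	}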
@@ -1384,19 +1529,25 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	 * - Do we need to clear any callback info from previous session?
 	 */
 
+	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
+		if (!nfsd4_last_compound_op(r))
+			return nfserr_not_only_op;
+	}
 	dump_sessionid(__func__, &sessionid->sessionid);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
 	if (!ses) {
-		spin_unlock(&sessionid_lock);
+		spin_unlock(&client_lock);
 		goto out;
 	}
 
 	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
+	nfs4_lock_state();
 	/* wait for callbacks */
-	shutdown_callback_client(ses->se_client);
+	nfsd4_set_callback_client(ses->se_client, NULL);
+	nfs4_unlock_state();
 	nfsd4_put_session(ses);
 	status = nfs_ok;
 out:
@@ -1417,7 +1568,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid);
 	if (!session)
@@ -1456,23 +1607,47 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	cstate->slot = slot;
 	cstate->session = session;
 
-	/* Hold a session reference until done processing the compound:
-	 * nfsd4_put_session called only if the cstate slot is set.
-	 */
-	nfsd4_get_session(session);
 out:
-	spin_unlock(&sessionid_lock);
-	/* Renew the clientid on success and on replay */
+	/* Hold a session reference until done processing the compound. */
 	if (cstate->session) {
-		nfs4_lock_state();
-		renew_client(session->se_client);
-		nfs4_unlock_state();
+		nfsd4_get_session(cstate->session);
+		atomic_inc(&session->se_client->cl_refcount);
 	}
+	spin_unlock(&client_lock);
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
 }
 
 __be32
+nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
+{
+	if (rc->rca_one_fs) {
+		if (!cstate->current_fh.fh_dentry)
+			return nfserr_nofilehandle;
+		/*
+		 * We don't take advantage of the rca_one_fs case.
+		 * That's OK, it's optional, we can safely ignore it.
+		 */
+		return nfs_ok;
+	}
+	nfs4_lock_state();
+	if (is_client_expired(cstate->session->se_client)) {
+		nfs4_unlock_state();
+		/*
+		 * The following error isn't really legal.
+		 * But we only get here if the client just explicitly
+		 * destroyed the client.  Surely it no longer cares what
+		 * error it gets back on an operation for the dead
+		 * client.
+		 */
+		return nfserr_stale_clientid;
+	}
+	nfsd4_create_clid_dir(cstate->session->se_client);
+	nfs4_unlock_state();
+	return nfs_ok;
+}
+
+__be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
@@ -1631,9 +1806,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
 			status = nfserr_clid_inuse;
 		else {
-			/* XXX: We just turn off callbacks until we can handle
-			  * change request correctly. */
-			atomic_set(&conf->cl_cb_conn.cb_set, 0);
+			atomic_set(&conf->cl_cb_set, 0);
+			nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -1667,7 +1841,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
-			nfsd4_probe_callback(conf);
+			nfsd4_probe_callback(conf, &conf->cl_cb_conn);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1700,12 +1874,14 @@ alloc_init_file(struct inode *ino)
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
-		spin_lock(&recall_lock);
-		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
-		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
+		memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+		memset(fp->fi_access, 0, sizeof(fp->fi_access));
+		spin_lock(&recall_lock);
+		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		return fp;
 	}
 	return NULL;
@@ -1827,7 +2003,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
@@ -1914,57 +2090,6 @@ static inline int deny_valid(u32 x)
 }
 
 /*
- * We store the NONE, READ, WRITE, and BOTH bits separately in the
- * st_{access,deny}_bmap field of the stateid, in order to track not
- * only what share bits are currently in force, but also what
- * combinations of share bits previous opens have used.  This allows us
- * to enforce the recommendation of rfc 3530 14.2.19 that the server
- * return an error if the client attempt to downgrade to a combination
- * of share bits not explicable by closing some of its previous opens.
- *
- * XXX: This enforcement is actually incomplete, since we don't keep
- * track of access/deny bit combinations; so, e.g., we allow:
- *
- *	OPEN allow read, deny write
- *	OPEN allow both, deny none
- *	DOWNGRADE allow read, deny none
- *
- * which we should reject.
- */
-static void
-set_access(unsigned int *access, unsigned long bmap) {
-	int i;
-
-	*access = 0;
-	for (i = 1; i < 4; i++) {
-		if (test_bit(i, &bmap))
-			*access |= i;
-	}
-}
-
-static void
-set_deny(unsigned int *deny, unsigned long bmap) {
-	int i;
-
-	*deny = 0;
-	for (i = 0; i < 4; i++) {
-		if (test_bit(i, &bmap))
-			*deny |= i ;
-	}
-}
-
-static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
-	unsigned int access, deny;
-
-	set_access(&access, stp->st_access_bmap);
-	set_deny(&deny, stp->st_deny_bmap);
-	if ((access & open->op_share_deny) || (deny & open->op_share_access))
-		return 0;
-	return 1;
-}
-
-/*
  * Called to check deny when READ with all zero stateid or
  * WRITE with all zero or all one stateid
  */
@@ -1995,14 +2120,12 @@ out:
 }
 
 static inline void
-nfs4_file_downgrade(struct file *filp, unsigned int share_access)
+nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
 {
-	if (share_access & NFS4_SHARE_ACCESS_WRITE) {
-		drop_file_write_access(filp);
-		spin_lock(&filp->f_lock);
-		filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
-		spin_unlock(&filp->f_lock);
-	}
+	if (share_access & NFS4_SHARE_ACCESS_WRITE)
+		nfs4_file_put_access(fp, O_WRONLY);
+	if (share_access & NFS4_SHARE_ACCESS_READ)
+		nfs4_file_put_access(fp, O_RDONLY);
 }
 
 /*
@@ -2028,7 +2151,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 	 * lock) we know the server hasn't removed the lease yet, we know
 	 * it's safe to take a reference: */
 	atomic_inc(&dp->dl_count);
-	atomic_inc(&dp->dl_client->cl_count);
 
 	spin_lock(&recall_lock);
 	list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
@@ -2199,6 +2321,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 	return NULL;
 }
 
+int share_access_to_flags(u32 share_access)
+{
+	share_access &= ~NFS4_SHARE_WANT_MASK;
+
+	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+}
+
 static __be32
 nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 		 struct nfs4_delegation **dp)
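share_access_to_flags() has to mask off the NFSv4.1 "want" bits before classifying the open as read or write; otherwise a READ open that also carries a delegation preference would be misclassified. A sketch with illustrative constants (NFS4_SHARE_WANT_MASK is 0xFF00 in include/linux/nfs4.h to the best of my recollection; the RD_STATE/WR_STATE values below are placeholders):

	#include <assert.h>

	#define SHARE_ACCESS_READ	0x0001
	#define SHARE_WANT_MASK		0xFF00	/* assumed value, see above */
	enum { RD_STATE = 1, WR_STATE = 2 };	/* placeholder values */

	static int share_access_to_flags(unsigned int share_access)
	{
		share_access &= ~SHARE_WANT_MASK;
		return share_access == SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
	}

	int main(void)
	{
		/* READ plus a "want read delegation" hint is still RD_STATE. */
		assert(share_access_to_flags(0x0101) == RD_STATE);
		return 0;
	}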
@@ -2209,8 +2338,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 	*dp = find_delegation_file(fp, &open->op_delegate_stateid);
 	if (*dp == NULL)
 		goto out;
-	flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ?
-						RD_STATE : WR_STATE;
+	flags = share_access_to_flags(open->op_share_access);
 	status = nfs4_check_delegmode(*dp, flags);
 	if (status)
 		*dp = NULL;
@@ -2252,30 +2380,53 @@ nfs4_alloc_stateid(void)
 	return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
 }
 
+static inline int nfs4_access_to_access(u32 nfs4_access)
+{
+	int flags = 0;
+
+	if (nfs4_access & NFS4_SHARE_ACCESS_READ)
+		flags |= NFSD_MAY_READ;
+	if (nfs4_access & NFS4_SHARE_ACCESS_WRITE)
+		flags |= NFSD_MAY_WRITE;
+	return flags;
+}
+
+static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
+*fp, struct svc_fh *cur_fh, u32 nfs4_access)
+{
+	__be32 status;
+	int oflag = nfs4_access_to_omode(nfs4_access);
+	int access = nfs4_access_to_access(nfs4_access);
+
+	if (!fp->fi_fds[oflag]) {
+		status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
+			&fp->fi_fds[oflag]);
+		if (status == nfserr_dropit)
+			status = nfserr_jukebox;
+		if (status)
+			return status;
+	}
+	nfs4_file_get_access(fp, oflag);
+
+	return nfs_ok;
+}
+
 static __be32
 nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
-		struct nfs4_delegation *dp,
-		struct svc_fh *cur_fh, int flags)
+		struct nfs4_file *fp, struct svc_fh *cur_fh,
+		struct nfsd4_open *open)
 {
 	struct nfs4_stateid *stp;
+	__be32 status;
 
 	stp = nfs4_alloc_stateid();
 	if (stp == NULL)
 		return nfserr_resource;
 
-	if (dp) {
-		get_file(dp->dl_vfs_file);
-		stp->st_vfs_file = dp->dl_vfs_file;
-	} else {
-		__be32 status;
-		status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
-				&stp->st_vfs_file);
-		if (status) {
-			if (status == nfserr_dropit)
-				status = nfserr_jukebox;
-			kmem_cache_free(stateid_slab, stp);
-			return status;
-		}
+	status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access);
+	if (status) {
+		kmem_cache_free(stateid_slab, stp);
+		return status;
 	}
 	*stpp = stp;
 	return 0;
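nfs4_get_vfs_file() is the heart of the new file-descriptor sharing: each nfs4_file keeps at most one struct file per open mode in fi_fds[], opened on first use and refcounted per mode thereafter, so multiple stateids for the same file no longer pin one struct file each. A user-space sketch of that cache-on-first-use shape (not the kernel API; the slot indexing works because O_RDONLY/O_WRONLY/O_RDWR are 0/1/2):

	#include <fcntl.h>
	#include <unistd.h>

	struct file_cache {
		int fd[3];	/* one slot per O_RDONLY/O_WRONLY/O_RDWR */
		int refs[3];	/* per-mode reference counts */
	};

	static int get_cached_fd(struct file_cache *fc, const char *path, int oflag)
	{
		int slot = oflag & O_ACCMODE;

		if (fc->refs[slot] == 0) {	/* first opener of this mode */
			fc->fd[slot] = open(path, oflag);
			if (fc->fd[slot] < 0)
				return -1;
		}
		fc->refs[slot]++;		/* later openers share the fd */
		return fc->fd[slot];
	}

	static void put_cached_fd(struct file_cache *fc, int oflag)
	{
		int slot = oflag & O_ACCMODE;

		if (--fc->refs[slot] == 0)	/* last user closes the fd */
			close(fc->fd[slot]);
	}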
@@ -2297,35 +2448,28 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 }
 
 static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
-	struct file *filp = stp->st_vfs_file;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	unsigned int share_access, new_writer;
+	u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+	bool new_access;
 	__be32 status;
 
-	set_access(&share_access, stp->st_access_bmap);
-	new_writer = (~share_access) & open->op_share_access
-			& NFS4_SHARE_ACCESS_WRITE;
-
-	if (new_writer) {
-		int err = get_write_access(inode);
-		if (err)
-			return nfserrno(err);
-		err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
-		if (err)
-			return nfserrno(err);
-		file_take_write(filp);
+	new_access = !test_bit(op_share_access, &stp->st_access_bmap);
+	if (new_access) {
+		status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
+		if (status)
+			return status;
 	}
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status) {
-		if (new_writer)
-			put_write_access(inode);
+		if (new_access) {
+			int oflag = nfs4_access_to_omode(new_access);
+			nfs4_file_put_access(fp, oflag);
+		}
 		return status;
 	}
 	/* remember the open */
-	filp->f_mode |= open->op_share_access;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(op_share_access, &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 
 	return nfs_ok;
@@ -2347,7 +2491,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
+	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
 	struct file_lock fl, *flp = &fl;
 	int status, flag = 0;
 
@@ -2355,7 +2499,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	open->op_recall = 0;
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_PREVIOUS:
-			if (!atomic_read(&cb->cb_set))
+			if (!cb_up)
 				open->op_recall = 1;
 			flag = open->op_delegate_type;
 			if (flag == NFS4_OPEN_DELEGATE_NONE)
@@ -2366,7 +2510,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			 * had the chance to reclaim theirs.... */
 			if (locks_in_grace())
 				goto out;
-			if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
+			if (!cb_up || !sop->so_confirmed)
 				goto out;
 			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
 				flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2388,13 +2532,14 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
 	fl.fl_end = OFFSET_MAX;
 	fl.fl_owner =  (fl_owner_t)dp;
-	fl.fl_file = stp->st_vfs_file;
+	fl.fl_file = find_readable_file(stp->st_file);
+	BUG_ON(!fl.fl_file);
 	fl.fl_pid = current->tgid;
 
 	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callbacks fl_mylease and fl_change are used
 	 */
-	if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) {
+	if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
 		unhash_delegation(dp);
 		flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2458,18 +2603,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 */
 	if (stp) {
 		/* Stateid was found, this is an OPEN upgrade */
-		status = nfs4_upgrade_open(rqstp, current_fh, stp, open);
+		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
 		if (status)
 			goto out;
 		update_stateid(&stp->st_stateid);
 	} else {
-		/* Stateid was not found, this is a new OPEN */
-		int flags = 0;
-		if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-			flags |= NFSD_MAY_READ;
-		if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-			flags |= NFSD_MAY_WRITE;
-		status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
+		status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
 		if (status)
 			goto out;
 		init_stateid(stp, fp, open);
@@ -2483,10 +2622,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
-	if (nfsd4_has_session(&resp->cstate)) {
+	if (nfsd4_has_session(&resp->cstate))
 		open->op_stateowner->so_confirmed = 1;
-		nfsd4_create_clid_dir(open->op_stateowner->so_client);
-	}
 
 	/*
 	 * Attempt to hand out a delegation.  No error return, because the
@@ -2537,7 +2674,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
-			&& !atomic_read(&clp->cl_cb_conn.cb_set))
+			&& !atomic_read(&clp->cl_cb_set))
 		goto out;
 	status = nfs_ok;
 out:
@@ -2554,6 +2691,12 @@ nfsd4_end_grace(void)
 	dprintk("NFSD: end of grace period\n");
 	nfsd4_recdir_purge_old();
 	locks_end_grace(&nfsd4_manager);
+	/*
+	 * Now that every NFSv4 client has had the chance to recover and
+	 * to see the (possibly new, possibly shorter) lease time, we
+	 * can safely set the next grace time to the current lease time:
+	 */
+	nfsd4_grace = nfsd4_lease;
 }
 
 static time_t
@@ -2563,15 +2706,17 @@ nfs4_laundromat(void)
 	struct nfs4_stateowner *sop;
 	struct nfs4_delegation *dp;
 	struct list_head *pos, *next, reaplist;
-	time_t cutoff = get_seconds() - NFSD_LEASE_TIME;
-	time_t t, clientid_val = NFSD_LEASE_TIME;
-	time_t u, test_val = NFSD_LEASE_TIME;
+	time_t cutoff = get_seconds() - nfsd4_lease;
+	time_t t, clientid_val = nfsd4_lease;
+	time_t u, test_val = nfsd4_lease;
 
 	nfs4_lock_state();
 
 	dprintk("NFSD: laundromat service - starting\n");
 	if (locks_in_grace())
 		nfsd4_end_grace();
+	INIT_LIST_HEAD(&reaplist);
+	spin_lock(&client_lock);
 	list_for_each_safe(pos, next, &client_lru) {
 		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -2580,12 +2725,22 @@ nfs4_laundromat(void)
 			clientid_val = t;
 			break;
 		}
+		if (atomic_read(&clp->cl_refcount)) {
+			dprintk("NFSD: client in use (clientid %08x)\n",
+				clp->cl_clientid.cl_id);
+			continue;
+		}
+		unhash_client_locked(clp);
+		list_add(&clp->cl_lru, &reaplist);
+	}
+	spin_unlock(&client_lock);
+	list_for_each_safe(pos, next, &reaplist) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		dprintk("NFSD: purging unused client (clientid %08x)\n",
 			clp->cl_clientid.cl_id);
 		nfsd4_remove_clid_dir(clp);
 		expire_client(clp);
 	}
-	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	list_for_each_safe(pos, next, &del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
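The laundromat now reaps clients in two phases: candidates are unhashed onto a private reaplist while client_lock is held, then expired after the lock is dropped, since expire_client() does work that must not run under a spinlock. A compact user-space model of the pattern (illustrative types only):

	#include <pthread.h>
	#include <stdlib.h>

	struct client { struct client *next; int refs; };

	static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

	static void reap_expired(struct client **lru)
	{
		struct client *reaplist = NULL, *clp, **pp;

		pthread_mutex_lock(&client_lock);
		for (pp = lru; (clp = *pp) != NULL; ) {
			if (clp->refs) {	/* still in use: keep it */
				pp = &clp->next;
				continue;
			}
			*pp = clp->next;	/* phase 1: unhash under lock */
			clp->next = reaplist;
			reaplist = clp;
		}
		pthread_mutex_unlock(&client_lock);

		while ((clp = reaplist) != NULL) {	/* phase 2: free unlocked */
			reaplist = clp->next;
			free(clp);
		}
	}

	int main(void)
	{
		struct client *lru = calloc(1, sizeof(*lru));	/* refs == 0 */
		reap_expired(&lru);
		return lru != NULL;
	}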
@@ -2605,7 +2760,7 @@ nfs4_laundromat(void)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	test_val = NFSD_LEASE_TIME;
+	test_val = nfsd4_lease;
 	list_for_each_safe(pos, next, &close_lru) {
 		sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
 		if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
@@ -2655,45 +2810,17 @@ search_close_lru(u32 st_id, int flags)
 static inline int
 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 {
-	return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode;
+	return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
 }
 
 static int
 STALE_STATEID(stateid_t *stateid)
 {
-	if (time_after((unsigned long)boot_time,
-			(unsigned long)stateid->si_boot)) {
-		dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static int
-EXPIRED_STATEID(stateid_t *stateid)
-{
-	if (time_before((unsigned long)boot_time,
-			((unsigned long)stateid->si_boot)) &&
-	    time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
-		dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static __be32
-stateid_error_map(stateid_t *stateid)
-{
-	if (STALE_STATEID(stateid))
-		return nfserr_stale_stateid;
-	if (EXPIRED_STATEID(stateid))
-		return nfserr_expired;
-
-	dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
+	if (stateid->si_boot == boot_time)
+		return 0;
+	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
 		STATEID_VAL(stateid));
-	return nfserr_bad_stateid;
+	return 1;
 }
 
 static inline int
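With si_boot now always stamped with boot_time at creation, staleness reduces to an equality test, and the old EXPIRED_STATEID()/stateid_error_map() machinery, which tried to infer expiry from a per-stateid timestamp, can go away. The whole check is effectively this one-liner (a sketch of the logic, not the kernel source):

	/* A stateid is stale iff issued by a different server instance. */
	static int stateid_is_stale(unsigned long si_boot, unsigned long boot_time)
	{
		return si_boot != boot_time;
	}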
@@ -2716,6 +2843,9 @@ __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
 {
 	__be32 status = nfserr_openmode;
 
+	/* For lock stateid's, we test the parent open, not the lock: */
+	if (stp->st_openstp)
+		stp = stp->st_openstp;
 	if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
 		goto out;
 	if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
@@ -2817,10 +2947,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	status = nfserr_bad_stateid;
 	if (is_delegation_stateid(stateid)) {
 		dp = find_delegation_stateid(ino, stateid);
-		if (!dp) {
-			status = stateid_error_map(stateid);
+		if (!dp)
 			goto out;
-		}
 		status = check_stateid_generation(stateid, &dp->dl_stateid,
 						  flags);
 		if (status)
@@ -2830,13 +2958,12 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 			goto out;
 		renew_client(dp->dl_client);
 		if (filpp)
-			*filpp = dp->dl_vfs_file;
+			*filpp = find_readable_file(dp->dl_file);
+		BUG_ON(!*filpp);
 	} else { /* open or lock stateid */
 		stp = find_stateid(stateid, flags);
-		if (!stp) {
-			status = stateid_error_map(stateid);
+		if (!stp)
 			goto out;
-		}
 		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
@@ -2849,8 +2976,12 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		renew_client(stp->st_stateowner->so_client);
-		if (filpp)
-			*filpp = stp->st_vfs_file;
+		if (filpp) {
+			if (flags & RD_STATE)
+				*filpp = find_readable_file(stp->st_file);
+			else
+				*filpp = find_writeable_file(stp->st_file);
+		}
 	}
 	status = nfs_ok;
 out:
@@ -2908,7 +3039,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 		 */
 		sop = search_close_lru(stateid->si_stateownerid, flags);
 		if (sop == NULL)
-			return stateid_error_map(stateid);
+			return nfserr_bad_stateid;
 		*sopp = sop;
 		goto check_replay;
 	}
@@ -3086,8 +3217,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		goto out;
 	}
 	set_access(&share_access, stp->st_access_bmap);
-	nfs4_file_downgrade(stp->st_vfs_file,
-			    share_access & ~od->od_share_access);
+	nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access);
 
 	reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap);
 	reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
@@ -3175,10 +3305,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (!is_delegation_stateid(stateid))
 		goto out;
 	dp = find_delegation_stateid(inode, stateid);
-	if (!dp) {
-		status = stateid_error_map(stateid);
+	if (!dp)
 		goto out;
-	}
 	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
 	if (status)
 		goto out;
@@ -3308,11 +3436,9 @@ static inline void
 nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 {
 	struct nfs4_stateowner *sop;
-	unsigned int hval;
 
 	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
 		sop = (struct nfs4_stateowner *) fl->fl_owner;
-		hval = lockownerid_hashval(sop->so_id);
 		kref_get(&sop->so_ref);
 		deny->ld_sop = sop;
 		deny->ld_clientid = sop->so_client->cl_clientid;
@@ -3404,12 +3530,10 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
-	stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */
-	stp->st_access_bmap = open_stp->st_access_bmap;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
 
@@ -3434,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_stateowner *open_sop = NULL;
 	struct nfs4_stateowner *lock_sop = NULL;
 	struct nfs4_stateid *lock_stp;
-	struct file *filp;
+	struct nfs4_file *fp;
+	struct file *filp = NULL;
 	struct file_lock file_lock;
 	struct file_lock conflock;
 	__be32 status = 0;
@@ -3464,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 * lock stateid.
 		 */
 		struct nfs4_stateid *open_stp = NULL;
-		struct nfs4_file *fp;
 
 		status = nfserr_stale_clientid;
 		if (!nfsd4_has_session(cstate) &&
@@ -3507,9 +3631,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		lock_sop = lock->lk_replay_owner;
+		fp = lock_stp->st_file;
 	}
 	/* lock->lk_replay_owner and lock_stp have been created or found */
-	filp = lock_stp->st_vfs_file;
 
 	status = nfserr_grace;
 	if (locks_in_grace() && !lock->lk_reclaim)
@@ -3522,11 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (lock->lk_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
+			if (find_readable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
+				filp = find_readable_file(lock_stp->st_file);
+			}
 			file_lock.fl_type = F_RDLCK;
 			cmd = F_SETLK;
 		break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
+			if (find_writeable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
+				filp = find_writeable_file(lock_stp->st_file);
+			}
			file_lock.fl_type = F_WRLCK;
 			cmd = F_SETLK;
 		break;
@@ -3534,6 +3666,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			status = nfserr_inval;
 		goto out;
 	}
+	if (!filp) {
+		status = nfserr_openmode;
+		goto out;
+	}
 	file_lock.fl_owner = (fl_owner_t)lock_sop;
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_file = filp;
@@ -3702,7 +3838,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&locku->lu_stateowner, &stp, NULL)))
 		goto out;
 
-	filp = stp->st_vfs_file;
+	filp = find_any_file(stp->st_file);
+	if (!filp) {
+		status = nfserr_lock_range;
+		goto out;
+	}
 	BUG_ON(!filp);
 	locks_init_lock(&file_lock);
 	file_lock.fl_type = F_UNLCK;
@@ -3749,10 +3889,10 @@ out_nfserr:
  *	0: no locks held by lockowner
  */
 static int
-check_for_locks(struct file *filp, struct nfs4_stateowner *lowner)
+check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
 {
 	struct file_lock **flpp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = filp->fi_inode;
 	int status = 0;
 
 	lock_kernel();
@@ -3803,7 +3943,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 			continue;
 		list_for_each_entry(stp, &sop->so_stateids,
 				st_perstateowner) {
-			if (check_for_locks(stp->st_vfs_file, sop))
+			if (check_for_locks(stp->st_file, sop))
 				goto out;
 			/* Note: so_perclient unused for lockowners,
 			 * so it's OK to fool with here. */
@@ -3976,12 +4116,6 @@ nfsd4_load_reboot_recovery_data(void)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
 
-unsigned long
-get_nfs4_grace_period(void)
-{
-	return max(user_lease_time, lease_time) * HZ;
-}
-
 /*
  * Since the lifetime of a delegation isn't limited to that of an open, a
  * client may quite reasonably hang on to a delegation as long as it has
@@ -4008,41 +4142,34 @@ set_max_delegations(void)
 static int
 __nfs4_state_start(void)
 {
-	unsigned long grace_time;
+	int ret;
 
 	boot_time = get_seconds();
-	grace_time = get_nfs4_grace_period();
-	lease_time = user_lease_time;
 	locks_start_grace(&nfsd4_manager);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
-	       grace_time/HZ);
+	       nfsd4_grace);
+	ret = set_callback_cred();
+	if (ret)
+		return -ENOMEM;
 	laundry_wq = create_singlethread_workqueue("nfsd4");
 	if (laundry_wq == NULL)
 		return -ENOMEM;
-	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+	ret = nfsd4_create_callback_queue();
+	if (ret)
+		goto out_free_laundry;
+	queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
 	set_max_delegations();
-	return set_callback_cred();
+	return 0;
+out_free_laundry:
+	destroy_workqueue(laundry_wq);
+	return ret;
 }
 
 int
 nfs4_state_start(void)
 {
-	int ret;
-
-	if (nfs4_init)
-		return 0;
 	nfsd4_load_reboot_recovery_data();
-	ret = __nfs4_state_start();
-	if (ret)
-		return ret;
-	nfs4_init = 1;
-	return 0;
-}
-
-time_t
-nfs4_lease_time(void)
-{
-	return lease_time;
+	return __nfs4_state_start();
 }
 
 static void
@@ -4077,7 +4204,6 @@ __nfs4_state_shutdown(void)
 	}
 
 	nfsd4_shutdown_recdir();
-	nfs4_init = 0;
 }
 
 void
@@ -4090,6 +4216,7 @@ nfs4_state_shutdown(void)
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
 	nfs4_unlock_state();
+	nfsd4_destroy_callback_queue();
 }
 
 /*
@@ -4128,21 +4255,3 @@ nfs4_recoverydir(void)
 {
 	return user_recovery_dirname;
 }
-
-/*
- * Called when leasetime is changed.
- *
- * The only way the protocol gives us to handle on-the-fly lease changes is to
- * simulate a reboot.  Instead of doing that, we just wait till the next time
- * we start to register any changes in lease time.  If the administrator
- * really wants to change the lease time *now*, they can go ahead and bring
- * nfsd down and then back up again after changing the lease time.
- *
- * user_lease_time is protected by nfsd_mutex since it's only really accessed
- * when nfsd is starting
- */
-void
-nfs4_reset_lease(time_t leasetime)
-{
-	user_lease_time = leasetime;
-}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 34ccf815ea8a..1a468bbd330f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1234,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
 	DECODE_TAIL;
 }
 
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	READ32(rc->rca_one_fs);
+
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 {
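On the wire, RECLAIM_COMPLETE carries a single XDR bool (rca_one_fs), which is exactly what the READ_BUF(4)/READ32() pair above consumes. A user-space model of the same decode step (a hypothetical helper, not nfsd's XDR macros):

	#include <arpa/inet.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Decode one 4-byte big-endian XDR bool; -1 on short buffer. */
	static int decode_reclaim_complete(const unsigned char *buf, size_t len,
					   uint32_t *rca_one_fs)
	{
		uint32_t be;

		if (len < 4)
			return -1;	/* the READ_BUF(4) failure case */
		memcpy(&be, buf, 4);
		*rca_one_fs = ntohl(be);
		return 0;
	}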
@@ -1346,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
 };
 
 struct nfsd4_minorversion_ops {
@@ -1746,6 +1756,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	struct nfs4_acl *acl = NULL;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
+	struct path path = {
+		.mnt	= exp->ex_path.mnt,
+		.dentry	= dentry,
+	};
 
 	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
 	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -1766,7 +1780,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			FATTR4_WORD0_MAXNAME)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		err = vfs_statfs(dentry, &statfs);
+		err = vfs_statfs(&path, &statfs);
 		if (err)
 			goto out_nfserr;
 	}
@@ -1900,7 +1914,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
-		WRITE32(NFSD_LEASE_TIME);
+		WRITE32(nfsd4_lease);
 	}
 	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
 		if ((buflen -= 4) < 0)
@@ -2620,7 +2634,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 	}
 	read->rd_vlen = v;
 
-	nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
+	nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
 			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
 			&maxcount);
 
@@ -3307,11 +3321,15 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
-	if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
-		nfsd4_store_cache_entry(resp);
-		dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
-		resp->cstate.slot->sl_inuse = false;
-		nfsd4_put_session(resp->cstate.session);
+	if (nfsd4_has_session(cs)) {
+		if (cs->status != nfserr_replay_cache) {
+			nfsd4_store_cache_entry(resp);
+			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+			cs->slot->sl_inuse = false;
+		}
+		/* Renew the clientid on success and on replay */
+		release_session_client(cs->session);
+		nfsd4_put_session(cs->session);
 	}
 	return 1;
 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index e3591073098f..b53b1d042f1f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -46,6 +46,7 @@ enum {
  */
 #ifdef CONFIG_NFSD_V4
 	NFSD_Leasetime,
+	NFSD_Gracetime,
 	NFSD_RecoveryDir,
 #endif
 };
@@ -70,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size);
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
 #ifdef CONFIG_NFSD_V4
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
@@ -91,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_MaxBlkSize] = write_maxblksize,
 #ifdef CONFIG_NFSD_V4
 	[NFSD_Leasetime] = write_leasetime,
+	[NFSD_Gracetime] = write_gracetime,
 	[NFSD_RecoveryDir] = write_recoverydir,
 #endif
 };
@@ -946,15 +949,12 @@ static ssize_t __write_ports_addfd(char *buf)
 	if (err != 0)
 		return err;
 
-	err = lockd_up();
-	if (err != 0)
-		goto out;
-
 	err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
-	if (err < 0)
-		lockd_down();
+	if (err < 0) {
+		svc_destroy(nfsd_serv);
+		return err;
+	}
 
-out:
 	/* Decrease the count, but don't shut down the service */
 	nfsd_serv->sv_nrthreads--;
 	return err;
@@ -975,9 +975,6 @@ static ssize_t __write_ports_delfd(char *buf)
 	if (nfsd_serv != NULL)
 		len = svc_sock_names(nfsd_serv, buf,
 					SIMPLE_TRANSACTION_LIMIT, toclose);
-	if (len >= 0)
-		lockd_down();
-
 	kfree(toclose);
 	return len;
 }
@@ -995,7 +992,7 @@ static ssize_t __write_ports_addxprt(char *buf)
 	if (sscanf(buf, "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX)
+	if (port < 1 || port > USHRT_MAX)
 		return -EINVAL;
 
 	err = nfsd_create_serv();
@@ -1011,6 +1008,9 @@ static ssize_t __write_ports_addxprt(char *buf)
1011 PF_INET6, port, SVC_SOCK_ANONYMOUS); 1008 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1012 if (err < 0 && err != -EAFNOSUPPORT) 1009 if (err < 0 && err != -EAFNOSUPPORT)
1013 goto out_close; 1010 goto out_close;
1011
1012 /* Decrease the count, but don't shut down the service */
1013 nfsd_serv->sv_nrthreads--;
1014 return 0; 1014 return 0;
1015out_close: 1015out_close:
1016 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); 1016 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
@@ -1019,8 +1019,7 @@ out_close:
1019 svc_xprt_put(xprt); 1019 svc_xprt_put(xprt);
1020 } 1020 }
1021out_err: 1021out_err:
1022 /* Decrease the count, but don't shut down the service */ 1022 svc_destroy(nfsd_serv);
1023 nfsd_serv->sv_nrthreads--;
1024 return err; 1023 return err;
1025} 1024}
1026 1025
@@ -1037,7 +1036,7 @@ static ssize_t __write_ports_delxprt(char *buf)
1037 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) 1036 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1038 return -EINVAL; 1037 return -EINVAL;
1039 1038
1040 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) 1039 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
1041 return -EINVAL; 1040 return -EINVAL;
1042 1041
1043 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 1042 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
@@ -1191,7 +1190,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1191 bsize = NFSSVC_MAXBLKSIZE; 1190 bsize = NFSSVC_MAXBLKSIZE;
1192 bsize &= ~(1024-1); 1191 bsize &= ~(1024-1);
1193 mutex_lock(&nfsd_mutex); 1192 mutex_lock(&nfsd_mutex);
1194 if (nfsd_serv && nfsd_serv->sv_nrthreads) { 1193 if (nfsd_serv) {
1195 mutex_unlock(&nfsd_mutex); 1194 mutex_unlock(&nfsd_mutex);
1196 return -EBUSY; 1195 return -EBUSY;
1197 } 1196 }
@@ -1204,29 +1203,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1204} 1203}
1205 1204
1206#ifdef CONFIG_NFSD_V4 1205#ifdef CONFIG_NFSD_V4
1207extern time_t nfs4_leasetime(void); 1206static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1208
1209static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1210{ 1207{
1211 /* if size > 10 seconds, call
1212 * nfs4_reset_lease() then write out the new lease (seconds) as reply
1213 */
1214 char *mesg = buf; 1208 char *mesg = buf;
1215 int rv, lease; 1209 int rv, i;
1216 1210
1217 if (size > 0) { 1211 if (size > 0) {
1218 if (nfsd_serv) 1212 if (nfsd_serv)
1219 return -EBUSY; 1213 return -EBUSY;
1220 rv = get_int(&mesg, &lease); 1214 rv = get_int(&mesg, &i);
1221 if (rv) 1215 if (rv)
1222 return rv; 1216 return rv;
1223 if (lease < 10 || lease > 3600) 1217 /*
1218 * Some sanity checking. We don't have a reason for
1219 * these particular numbers, but problems with the
1220 * extremes are:
1221 * - Too short: the briefest network outage may
1222 * cause clients to lose all their locks. Also,
1223 * the frequent polling may be wasteful.
1224 * - Too long: do you really want reboot recovery
1225 * to take more than an hour? Or to make other
1226 * clients wait an hour before being able to
1227 * revoke a dead client's locks?
1228 */
1229 if (i < 10 || i > 3600)
1224 return -EINVAL; 1230 return -EINVAL;
1225 nfs4_reset_lease(lease); 1231 *time = i;
1226 } 1232 }
1227 1233
1228 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", 1234 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
1229 nfs4_lease_time()); 1235}
1236
1237static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1238{
1239 ssize_t rv;
1240
1241 mutex_lock(&nfsd_mutex);
1242 rv = __nfsd4_write_time(file, buf, size, time);
1243 mutex_unlock(&nfsd_mutex);
1244 return rv;
1230} 1245}
1231 1246
1232/** 1247/**
@@ -1252,12 +1267,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1252 */ 1267 */
1253static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 1268static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
1254{ 1269{
1255 ssize_t rv; 1270 return nfsd4_write_time(file, buf, size, &nfsd4_lease);
1271}
1256 1272
1257 mutex_lock(&nfsd_mutex); 1273/**
1258 rv = __write_leasetime(file, buf, size); 1274 * write_gracetime - Set or report current NFSv4 grace period time
1259 mutex_unlock(&nfsd_mutex); 1275 *
1260 return rv; 1276 * As above, but sets the time of the NFSv4 grace period.
1277 *
1278 * Note this should never be set to less than the *previous*
1279 * lease-period time, but we don't try to enforce this. (In the common
1280 * case (a new boot), we don't know what the previous lease time was
1281 * anyway.)
1282 */
1283static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1284{
1285 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1261} 1286}
1262 1287
1263extern char *nfs4_recoverydir(void); 1288extern char *nfs4_recoverydir(void);
@@ -1281,6 +1306,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1281 return -EINVAL; 1306 return -EINVAL;
1282 1307
1283 status = nfs4_reset_recoverydir(recdir); 1308 status = nfs4_reset_recoverydir(recdir);
1309 if (status)
1310 return status;
1284 } 1311 }
1285 1312
1286 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", 1313 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
@@ -1351,6 +1378,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1351 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1378 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1352#ifdef CONFIG_NFSD_V4 1379#ifdef CONFIG_NFSD_V4
1353 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1380 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1381 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1354 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1355#endif 1383#endif
1356 /* last one */ {""} 1384 /* last one */ {""}
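
The nfsv4leasetime and new nfsv4gracetime entries are simple-transaction files: a write stores the request, the handler runs, and the handler's scnprintf() output becomes the reply returned by a subsequent read on the same descriptor. A minimal userspace sketch of driving them, assuming the nfsd filesystem is mounted at /proc/fs/nfsd and nfsd is not running (the handlers return -EBUSY otherwise); the helper name is invented for illustration:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* set_nfsd_time() is a hypothetical helper, not part of the patch */
	static void set_nfsd_time(const char *path, const char *secs)
	{
		char reply[64];
		ssize_t n;
		int fd = open(path, O_RDWR);

		if (fd < 0)
			return;
		/* write the new value, then read the reply from the same fd */
		if (write(fd, secs, strlen(secs)) > 0 &&
		    (n = read(fd, reply, sizeof(reply) - 1)) > 0) {
			reply[n] = '\0';	/* reply is "<seconds>\n" */
			printf("%s -> %s", path, reply);
		}
		close(fd);
	}

	int main(void)
	{
		/* 90 is within the 10..3600 range the handler accepts */
		set_nfsd_time("/proc/fs/nfsd/nfsv4leasetime", "90");
		set_nfsd_time("/proc/fs/nfsd/nfsv4gracetime", "90");
		return 0;
	}
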
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e942a1aaac92..b76ac3a82e39 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -82,7 +82,6 @@ int nfs4_state_init(void);
 void nfsd4_free_slabs(void);
 int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
-time_t nfs4_lease_time(void);
 void nfs4_reset_lease(time_t leasetime);
 int nfs4_reset_recoverydir(char *recdir);
 #else
@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
 static inline int nfs4_state_start(void) { return 0; }
 static inline void nfs4_state_shutdown(void) { }
-static inline time_t nfs4_lease_time(void) { return 0; }
 static inline void nfs4_reset_lease(time_t leasetime) { }
 static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
 #endif
@@ -155,6 +153,7 @@ void nfsd_lockd_shutdown(void);
 #define	nfserr_bad_seqid	cpu_to_be32(NFSERR_BAD_SEQID)
 #define	nfserr_symlink		cpu_to_be32(NFSERR_SYMLINK)
 #define	nfserr_not_same		cpu_to_be32(NFSERR_NOT_SAME)
+#define	nfserr_lock_range	cpu_to_be32(NFSERR_LOCK_RANGE)
 #define	nfserr_restorefh	cpu_to_be32(NFSERR_RESTOREFH)
 #define	nfserr_attrnotsupp	cpu_to_be32(NFSERR_ATTRNOTSUPP)
 #define	nfserr_bad_xdr		cpu_to_be32(NFSERR_BAD_XDR)
@@ -229,6 +228,9 @@ extern struct timeval nfssvc_boot;
 
 #ifdef CONFIG_NFSD_V4
 
+extern time_t nfsd4_lease;
+extern time_t nfsd4_grace;
+
 /* before processing a COMPOUND operation, we have to check that there
  * is enough space in the buffer for XDR encode to succeed.  otherwise,
  * we might process an operation with side effects, and be unable to
@@ -247,7 +249,6 @@ extern struct timeval nfssvc_boot;
 #define COMPOUND_SLACK_SPACE		140    /* OP_GETFH */
 #define COMPOUND_ERR_SLACK_SPACE	12     /* OP_SETATTR */
 
-#define NFSD_LEASE_TIME			(nfs4_lease_time())
 #define NFSD_LAUNDROMAT_MINTIMEOUT	10           /* seconds */
 
 /*
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a4206..c16f8d8331b5 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
 static inline void
 fh_unlock(struct svc_fh *fhp)
 {
-	BUG_ON(!fhp->fh_dentry);
-
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
 		mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a047ad6111ef..08e17264784b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -144,7 +144,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 	svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
-	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
+	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
 				  argp->offset,
 				  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
@@ -290,7 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 		 * gospel of sun micro
 		 */
 		if (type != S_IFREG) {
-			int	is_borc = 0;
 			if (type != S_IFBLK && type != S_IFCHR) {
 				rdev = 0;
 			} else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) {
@@ -298,7 +297,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 				type = S_IFIFO;
 			} else {
 				/* Okay, char or block special */
-				is_borc = 1;
 				if (!rdev)
 					rdev = wanted;
 			}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 171699eb07c8..e2c43464f237 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion;
 int nfsd_vers(int vers, enum vers_op change)
 {
 	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
-		return -1;
+		return 0;
 	switch(change) {
 	case NFSD_SET:
 		nfsd_versions[vers] = nfsd_version[vers];
@@ -180,15 +180,80 @@ int nfsd_nrthreads(void)
 	return rv;
 }
 
+static int nfsd_init_socks(int port)
+{
+	int error;
+	if (!list_empty(&nfsd_serv->sv_permsocks))
+		return 0;
+
+	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	return 0;
+}
+
+static bool nfsd_up = false;
+
+static int nfsd_startup(unsigned short port, int nrservs)
+{
+	int ret;
+
+	if (nfsd_up)
+		return 0;
+	/*
+	 * Readahead param cache - will no-op if it already exists.
+	 * (Note therefore results will be suboptimal if number of
+	 * threads is modified after nfsd start.)
+	 */
+	ret = nfsd_racache_init(2*nrservs);
+	if (ret)
+		return ret;
+	ret = nfsd_init_socks(port);
+	if (ret)
+		goto out_racache;
+	ret = lockd_up();
+	if (ret)
+		goto out_racache;
+	ret = nfs4_state_start();
+	if (ret)
+		goto out_lockd;
+	nfsd_up = true;
+	return 0;
+out_lockd:
+	lockd_down();
+out_racache:
+	nfsd_racache_shutdown();
+	return ret;
+}
+
+static void nfsd_shutdown(void)
+{
+	/*
+	 * write_ports can create the server without actually starting
+	 * any threads--if we get shut down before any threads are
+	 * started, then nfsd_last_thread will be run before any of this
+	 * other initialization has been done.
+	 */
+	if (!nfsd_up)
+		return;
+	nfs4_state_shutdown();
+	lockd_down();
+	nfsd_racache_shutdown();
+	nfsd_up = false;
+}
+
 static void nfsd_last_thread(struct svc_serv *serv)
 {
 	/* When last nfsd thread exits we need to do some clean-up */
-	struct svc_xprt *xprt;
-	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
-		lockd_down();
 	nfsd_serv = NULL;
-	nfsd_racache_shutdown();
-	nfs4_state_shutdown();
+	nfsd_shutdown();
 
 	printk(KERN_WARNING "nfsd: last server has exited, flushing export "
 	       "cache\n");
@@ -263,45 +328,18 @@ int nfsd_create_serv(void)
 		    nfsd_max_blksize >= 8*1024*2)
 			nfsd_max_blksize /= 2;
 	}
+	nfsd_reset_versions();
 
 	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
 				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
-		err = -ENOMEM;
-	else
-		set_max_drc();
+		return -ENOMEM;
 
+	set_max_drc();
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
 	return err;
 }
 
-static int nfsd_init_socks(int port)
-{
-	int error;
-	if (!list_empty(&nfsd_serv->sv_permsocks))
-		return 0;
-
-	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
-					SVC_SOCK_DEFAULTS);
-	if (error < 0)
-		return error;
-
-	error = lockd_up();
-	if (error < 0)
-		return error;
-
-	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
-					SVC_SOCK_DEFAULTS);
-	if (error < 0)
-		return error;
-
-	error = lockd_up();
-	if (error < 0)
-		return error;
-
-	return 0;
-}
-
 int nfsd_nrpools(void)
 {
 	if (nfsd_serv == NULL)
@@ -376,10 +414,16 @@ int nfsd_set_nrthreads(int n, int *nthreads)
 	return err;
 }
 
+/*
+ * Adjust the number of threads and return the new number of threads.
+ * This is also the function that starts the server if necessary, if
+ * this is the first time nrservs is nonzero.
+ */
 int
 nfsd_svc(unsigned short port, int nrservs)
 {
 	int	error;
+	bool	nfsd_up_before;
 
 	mutex_lock(&nfsd_mutex);
 	dprintk("nfsd: creating service\n");
@@ -391,34 +435,29 @@ nfsd_svc(unsigned short port, int nrservs)
 	if (nrservs == 0 && nfsd_serv == NULL)
 		goto out;
 
-	/* Readahead param cache - will no-op if it already exists */
-	error = nfsd_racache_init(2*nrservs);
-	if (error<0)
-		goto out;
-	error = nfs4_state_start();
+	error = nfsd_create_serv();
 	if (error)
 		goto out;
 
-	nfsd_reset_versions();
-
-	error = nfsd_create_serv();
+	nfsd_up_before = nfsd_up;
 
+	error = nfsd_startup(port, nrservs);
 	if (error)
-		goto out;
-	error = nfsd_init_socks(port);
-	if (error)
-		goto failure;
-
+		goto out_destroy;
 	error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
-	if (error == 0)
-		/* We are holding a reference to nfsd_serv which
-		 * we don't want to count in the return value,
-		 * so subtract 1
-		 */
-		error = nfsd_serv->sv_nrthreads - 1;
- failure:
+	if (error)
+		goto out_shutdown;
+	/* We are holding a reference to nfsd_serv which
+	 * we don't want to count in the return value,
+	 * so subtract 1
+	 */
+	error = nfsd_serv->sv_nrthreads - 1;
+out_shutdown:
+	if (error < 0 && !nfsd_up_before)
+		nfsd_shutdown();
+out_destroy:
 	svc_destroy(nfsd_serv);		/* Release server */
- out:
+out:
 	mutex_unlock(&nfsd_mutex);
 	return error;
 }
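
The nfsd_startup()/nfsd_shutdown() pair above consolidates initialization that used to be scattered across nfsd_svc(), nfsd_init_socks() and the write_ports handlers. The shape of the change is the classic guarded bring-up idiom: a single nfsd_up flag makes startup idempotent, and a failure unwinds in reverse order of initialization. A stand-alone sketch of that idiom (the subsystem names are stand-ins, not the kernel APIs):

	#include <stdbool.h>
	#include <stdio.h>

	static bool svc_up;	/* plays the role of nfsd_up */

	static int cache_init(void)      { return 0; }	/* cf. nfsd_racache_init */
	static int locks_up(void)        { return 0; }	/* cf. lockd_up */
	static int state_start(void)     { return -1; }	/* cf. nfs4_state_start; fails here */
	static void locks_down(void)     { puts("locks_down"); }
	static void cache_shutdown(void) { puts("cache_shutdown"); }

	static int svc_startup(void)
	{
		int ret;

		if (svc_up)		/* a second call is a no-op */
			return 0;
		ret = cache_init();
		if (ret)
			return ret;
		ret = locks_up();
		if (ret)
			goto out_cache;
		ret = state_start();
		if (ret)
			goto out_locks;
		svc_up = true;
		return 0;
	out_locks:			/* unwind in reverse order of setup */
		locks_down();
	out_cache:
		cache_shutdown();
		return ret;
	}

	int main(void)
	{
		printf("startup: %d\n", svc_startup());
		return 0;
	}
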
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index fefeae27f25e..322518c88e4b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence {
 	struct nfs4_client	*cbs_clp;
 };
 
+struct nfs4_rpc_args {
+	void				*args_op;
+	struct nfsd4_cb_sequence	args_seq;
+};
+
+struct nfsd4_callback {
+	struct nfs4_rpc_args cb_args;
+	struct work_struct cb_work;
+};
+
 struct nfs4_delegation {
 	struct list_head	dl_perfile;
 	struct list_head	dl_perclnt;
@@ -78,7 +88,6 @@ struct nfs4_delegation {
 	struct nfs4_client	*dl_client;
 	struct nfs4_file	*dl_file;
 	struct file_lock	*dl_flock;
-	struct file		*dl_vfs_file;
 	u32			dl_type;
 	time_t			dl_time;
 /* For recall: */
@@ -86,6 +95,7 @@ struct nfs4_delegation {
 	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
 	int			dl_retries;
+	struct nfsd4_callback	dl_recall;
 };
 
 /* client delegation callback info */
@@ -96,9 +106,7 @@ struct nfs4_cb_conn {
 	u32			cb_prog;
 	u32			cb_minorversion;
 	u32			cb_ident;	/* minorversion 0 only */
-	/* RPC client info */
-	atomic_t		cb_set;     /* successful CB_NULL call */
-	struct rpc_clnt *	cb_client;
+	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
 };
 
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -157,7 +165,7 @@ struct nfsd4_session {
 	struct list_head	se_hash;	/* hash by sessionid */
 	struct list_head	se_perclnt;
 	u32			se_flags;
-	struct nfs4_client	*se_client;	/* for expire_client */
+	struct nfs4_client	*se_client;
 	struct nfs4_sessionid	se_sessionid;
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
@@ -212,25 +220,41 @@ struct nfs4_client {
 	struct svc_cred		cl_cred; 	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
-	struct nfs4_cb_conn	cl_cb_conn; /* callback info */
-	atomic_t		cl_count;	/* ref count */
 	u32			cl_firststate;	/* recovery dir creation */
 
+	/* for v4.0 and v4.1 callbacks: */
+	struct nfs4_cb_conn	cl_cb_conn;
+	struct rpc_clnt		*cl_cb_client;
+	atomic_t		cl_cb_set;
+
 	/* for nfs41 */
 	struct list_head	cl_sessions;
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
 	struct nfs4_sessionid	cl_sessionid;
+	/* number of rpc's in progress over an associated session: */
+	atomic_t		cl_refcount;
 
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
 	u32			cl_cb_seq_nr;
-	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
 };
 
+static inline void
+mark_client_expired(struct nfs4_client *clp)
+{
+	clp->cl_time = 0;
+}
+
+static inline bool
+is_client_expired(struct nfs4_client *clp)
+{
+	return clp->cl_time == 0;
+}
+
 /* struct nfs4_client_reset
  * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
  * upon lease reset, or from upcall to state_daemon (to read in state
@@ -317,12 +341,50 @@ struct nfs4_file {
 	struct list_head        fi_hash;    /* hash by "struct inode *" */
 	struct list_head        fi_stateids;
 	struct list_head	fi_delegations;
+	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
+	struct file *		fi_fds[3];
+	/* One each for O_RDONLY, O_WRONLY: */
+	atomic_t		fi_access[2];
+	/*
+	 * Each open stateid contributes 1 to either fi_readers or
+	 * fi_writers, or both, depending on the open mode. A
+	 * delegation also takes an fi_readers reference. Lock
+	 * stateid's take none.
+	 */
+	atomic_t		fi_readers;
+	atomic_t		fi_writers;
 	struct inode		*fi_inode;
 	u32                     fi_id;      /* used with stateowner->so_id
 					     * for stateid_hashtbl hash */
 	bool			fi_had_conflict;
 };
 
+/* XXX: for first cut may fall back on returning file that doesn't work
+ * at all? */
+static inline struct file *find_writeable_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	return f->fi_fds[O_RDWR];
+}
+
+static inline struct file *find_readable_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_RDONLY])
+		return f->fi_fds[O_RDONLY];
+	return f->fi_fds[O_RDWR];
+}
+
+static inline struct file *find_any_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_RDWR])
+		return f->fi_fds[O_RDWR];
+	else if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	else
+		return f->fi_fds[O_RDONLY];
+}
+
 /*
 * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
 *
@@ -348,7 +410,6 @@ struct nfs4_stateid {
 	struct nfs4_stateowner      * st_stateowner;
 	struct nfs4_file            * st_file;
 	stateid_t                     st_stateid;
-	struct file                 * st_vfs_file;
 	unsigned long                 st_access_bmap;
 	unsigned long                 st_deny_bmap;
 	struct nfs4_stateid         * st_openstp;
@@ -377,11 +438,14 @@ extern void nfs4_lock_state(void);
 extern void nfs4_unlock_state(void);
 extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
-extern void put_nfs4_client(struct nfs4_client *clp);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern int nfsd4_create_callback_queue(void);
+extern void nfsd4_destroy_callback_queue(void);
+extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 extern void nfsd4_init_recdir(char *recdir_name);
@@ -392,6 +456,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
 extern void nfsd4_recdir_purge_old(void);
 extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
 extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
+extern void release_session_client(struct nfsd4_session *);
 
 static inline void
 nfs4_put_stateowner(struct nfs4_stateowner *so)
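
The new fi_fds[] array caches at most one struct file per open mode, and the find_*_file() helpers pick the best cached descriptor for an operation: a writer prefers the O_WRONLY file and falls back to O_RDWR, a reader prefers O_RDONLY. The direct indexing works because O_RDONLY, O_WRONLY and O_RDWR are 0, 1 and 2 on Linux. A user-space model of the lookup rules (struct file is just an opaque token here; an illustration, not the kernel code):

	#include <fcntl.h>
	#include <stdio.h>

	struct file;				/* opaque in this sketch */

	struct nfs4_file_model {
		struct file *fi_fds[3];		/* indexed by O_RDONLY/O_WRONLY/O_RDWR */
	};

	static struct file *writeable(struct nfs4_file_model *f)
	{
		return f->fi_fds[O_WRONLY] ? f->fi_fds[O_WRONLY] : f->fi_fds[O_RDWR];
	}

	static struct file *readable(struct nfs4_file_model *f)
	{
		return f->fi_fds[O_RDONLY] ? f->fi_fds[O_RDONLY] : f->fi_fds[O_RDWR];
	}

	int main(void)
	{
		struct nfs4_file_model f = { { 0 } };

		f.fi_fds[O_RDWR] = (struct file *)0x1;	/* only an O_RDWR open cached */
		printf("writer -> %p, reader -> %p\n",
		       (void *)writeable(&f), (void *)readable(&f));
		return 0;
	}
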
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6dd5f1970e01..661a6cf8e826 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	if (size_change)
 		put_write_access(inode);
 	if (!err)
-		if (EX_ISSYNC(fhp->fh_export))
-			write_inode_now(inode, 1);
+		commit_metadata(fhp);
 out:
 	return err;
 
@@ -605,7 +604,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	return error;
 }
 
-#endif /* defined(CONFIG_NFS_V4) */
+#endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
 /*
@@ -724,7 +723,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	struct inode	*inode;
 	int		flags = O_RDONLY|O_LARGEFILE;
 	__be32		err;
-	int		host_err;
+	int		host_err = 0;
 
 	validate_process_creds();
 
@@ -761,7 +760,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * Check to see if there are any leases on this file.
 	 * This may block while leases are broken.
 	 */
-	host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
+	if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
+		host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
 	if (host_err == -EWOULDBLOCK)
 		host_err = -ETIMEDOUT;
 	if (host_err) /* NOMEM or WOULDBLOCK */
@@ -903,7 +903,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	      loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
 	struct inode *inode;
-	struct raparms	*ra;
 	mm_segment_t	oldfs;
 	__be32		err;
 	int		host_err;
@@ -914,12 +913,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
 		goto out;
 
-	/* Get readahead parameters */
-	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
-	if (ra && ra->p_set)
-		file->f_ra = ra->p_ra;
-
 	if (file->f_op->splice_read && rqstp->rq_splice_ok) {
 		struct splice_desc sd = {
 			.len		= 0,
@@ -937,21 +930,11 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		set_fs(oldfs);
 	}
 
-	/* Write back readahead params */
-	if (ra) {
-		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-		spin_lock(&rab->pb_lock);
-		ra->p_ra = file->f_ra;
-		ra->p_set = 1;
-		ra->p_count--;
-		spin_unlock(&rab->pb_lock);
-	}
-
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
 		*count = host_err;
 		err = 0;
-		fsnotify_access(file->f_path.dentry);
+		fsnotify_access(file);
 	} else
 		err = nfserrno(host_err);
 out:
@@ -998,7 +981,7 @@ static int wait_for_concurrent_writes(struct file *file)
 
 	if (inode->i_state & I_DIRTY) {
 		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-		err = vfs_fsync(file, file->f_path.dentry, 0);
+		err = vfs_fsync(file, 0);
 	}
 	last_ino = inode->i_ino;
 	last_dev = inode->i_sb->s_dev;
@@ -1062,7 +1045,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		goto out_nfserr;
 	*cnt = host_err;
 	nfsdstats.io_write += host_err;
-	fsnotify_modify(file->f_path.dentry);
+	fsnotify_modify(file);
 
 	/* clear setuid/setgid flag after write */
 	if (inode->i_mode & (S_ISUID | S_ISGID))
@@ -1086,8 +1069,45 @@ out:
 * on entry. On return, *count contains the number of bytes actually read.
 * N.B. After this call fhp needs an fh_put
 */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+	loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+	struct file *file;
+	struct inode *inode;
+	struct raparms	*ra;
+	__be32 err;
+
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	if (err)
+		return err;
+
+	inode = file->f_path.dentry->d_inode;
+
+	/* Get readahead parameters */
+	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+
+	if (ra && ra->p_set)
+		file->f_ra = ra->p_ra;
+
+	err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
+
+	/* Write back readahead params */
+	if (ra) {
+		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+		spin_lock(&rab->pb_lock);
+		ra->p_ra = file->f_ra;
+		ra->p_set = 1;
+		ra->p_count--;
+		spin_unlock(&rab->pb_lock);
+	}
+
+	nfsd_close(file);
+	return err;
+}
+
+/* As above, but use the provided file descriptor. */
 __be32
-nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
+nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		loff_t offset, struct kvec *vec, int vlen,
 		unsigned long *count)
 {
@@ -1099,13 +1119,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		if (err)
 			goto out;
 		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-	} else {
-		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
-		if (err)
-			goto out;
-		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-		nfsd_close(file);
-	}
+	} else /* Note file may still be NULL in NFSv4 special stateid case: */
+		err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
 out:
 	return err;
 }
@@ -1169,12 +1184,12 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out;
 	}
 
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+	err = nfsd_open(rqstp, fhp, S_IFREG,
+			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
 	if (err)
 		goto out;
 	if (EX_ISSYNC(fhp->fh_export)) {
-		int err2 = vfs_fsync_range(file, file->f_path.dentry,
-					   offset, end, 0);
+		int err2 = vfs_fsync_range(file, offset, end, 0);
 
 		if (err2 != -EINVAL)
 			err = nfserrno(err2);
@@ -1631,7 +1646,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 				char *name, int len, struct svc_fh *tfhp)
 {
 	struct dentry	*ddir, *dnew, *dold;
-	struct inode	*dirp, *dest;
+	struct inode	*dirp;
 	__be32		err;
 	int		host_err;
 
@@ -1659,7 +1674,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		goto out_nfserr;
 
 	dold = tfhp->fh_dentry;
-	dest = dold->d_inode;
 
 	host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
 	if (host_err) {
@@ -2019,9 +2033,17 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-	__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-	if (!err && vfs_statfs(fhp->fh_dentry,stat))
-		err = nfserr_io;
+	__be32 err;
+
+	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
+	if (!err) {
+		struct path path = {
+			.mnt	= fhp->fh_export->ex_path.mnt,
+			.dentry	= fhp->fh_dentry,
+		};
+		if (vfs_statfs(&path, stat))
+			err = nfserr_io;
+	}
 	return err;
 }
 
@@ -2038,7 +2060,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 		    struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
-	struct path	path;
 	int		err;
 
 	if (acc == NFSD_MAY_NOP)
@@ -2111,15 +2132,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
 		err = inode_permission(inode, MAY_EXEC);
-	if (err)
-		goto nfsd_out;
 
-	/* Do integrity (permission) checking now, but defer incrementing
-	 * IMA counts to the actual file open.
-	 */
-	path.mnt = exp->ex_path.mnt;
-	path.dentry = dentry;
-nfsd_out:
 	return err? nfserrno(err) : 0;
 }
 
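
Every vfs_statfs() call site in this patch follows the v2.6.36 VFS convention of passing a struct path (vfsmount plus dentry) rather than a bare dentry; nfsd builds the path from the export's ex_path.mnt and the filehandle's dentry, as the nfsd_statfs() hunk above shows. The pattern, reduced to a hypothetical kernel-side helper for illustration:

	#include <linux/fs.h>
	#include <linux/path.h>
	#include <linux/statfs.h>

	/* example_statfs() is illustrative only, not part of the patch */
	static int example_statfs(struct vfsmount *mnt, struct dentry *dentry,
				  struct kstatfs *st)
	{
		struct path path = {
			.mnt	= mnt,
			.dentry	= dentry,
		};

		return vfs_statfs(&path, st);	/* was: vfs_statfs(dentry, st) */
	}
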
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4b1de0a9ea75..9a370a5e36b7 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -20,6 +20,7 @@
 #define NFSD_MAY_OWNER_OVERRIDE	64
 #define NFSD_MAY_LOCAL_ACCESS	128 /* IRIX doing local access check on device special file*/
 #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
+#define NFSD_MAY_NOT_BREAK_LEASE 512
 
 #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
 #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -63,7 +64,9 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, int,
 				int, struct file **);
 void		nfsd_close(struct file *);
-__be32 		nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
+__be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
+				loff_t, struct kvec *, int, unsigned long *);
+__be32 		nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *,
 				loff_t, struct kvec *, int, unsigned long *);
 __be32 		nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
 				loff_t, struct kvec *,int, unsigned long *, int *);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index efa337739534..4d476ff08ae6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -381,6 +381,10 @@ struct nfsd4_destroy_session {
 	struct nfs4_sessionid	sessionid;
 };
 
+struct nfsd4_reclaim_complete {
+	u32 rca_one_fs;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
@@ -421,6 +425,7 @@ struct nfsd4_op {
 		struct nfsd4_create_session	create_session;
 		struct nfsd4_destroy_session	destroy_session;
 		struct nfsd4_sequence		sequence;
+		struct nfsd4_reclaim_complete	reclaim_complete;
 	} u;
 	struct nfs4_replay *			replay;
 };
@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
 extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 		struct nfsd4_sequence *seq);
 extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
-		struct nfsd4_compound_state *,
-struct nfsd4_exchange_id *);
- extern __be32 nfsd4_create_session(struct svc_rqst *,
+		struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_create_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_create_session *);
 extern __be32 nfsd4_sequence(struct svc_rqst *,
@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
 extern __be32 nfsd4_destroy_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_destroy_session *);
+__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
 extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
 		struct nfsd4_open *open);
 extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7cfb87e692da..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -31,6 +31,11 @@
31#include "alloc.h" 31#include "alloc.h"
32 32
33 33
34/**
35 * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
36 * descriptor block can maintain
37 * @inode: inode of metadata file using this allocator
38 */
34static inline unsigned long 39static inline unsigned long
35nilfs_palloc_groups_per_desc_block(const struct inode *inode) 40nilfs_palloc_groups_per_desc_block(const struct inode *inode)
36{ 41{
@@ -38,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
38 sizeof(struct nilfs_palloc_group_desc); 43 sizeof(struct nilfs_palloc_group_desc);
39} 44}
40 45
46/**
47 * nilfs_palloc_groups_count - get maximum number of groups
48 * @inode: inode of metadata file using this allocator
49 */
41static inline unsigned long 50static inline unsigned long
42nilfs_palloc_groups_count(const struct inode *inode) 51nilfs_palloc_groups_count(const struct inode *inode)
43{ 52{
44 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); 53 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
45} 54}
46 55
56/**
57 * nilfs_palloc_init_blockgroup - initialize private variables for allocator
58 * @inode: inode of metadata file using this allocator
59 * @entry_size: size of the persistent object
60 */
47int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) 61int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
48{ 62{
49 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 63 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -69,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
69 return 0; 83 return 0;
70} 84}
71 85
86/**
87 * nilfs_palloc_group - get group number and offset from an entry number
88 * @inode: inode of metadata file using this allocator
89 * @nr: serial number of the entry (e.g. inode number)
90 * @offset: pointer to store offset number in the group
91 */
72static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, 92static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
73 unsigned long *offset) 93 unsigned long *offset)
74{ 94{
@@ -78,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
78 return group; 98 return group;
79} 99}
80 100
101/**
102 * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
103 * @inode: inode of metadata file using this allocator
104 * @group: group number
105 *
106 * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
107 * block which contains a descriptor of the specified group.
108 */
81static unsigned long 109static unsigned long
82nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) 110nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
83{ 111{
@@ -86,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
86 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; 114 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
87} 115}
88 116
117/**
118 * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
119 * @inode: inode of metadata file using this allocator
120 * @group: group number
121 *
122 * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
123 * block used to allocate/deallocate entries in the specified group.
124 */
89static unsigned long 125static unsigned long
90nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) 126nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
91{ 127{
@@ -95,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
95 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; 131 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
96} 132}
97 133
134/**
135 * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
136 * @inode: inode of metadata file using this allocator
137 * @group: group number
138 * @desc: pointer to descriptor structure for the group
139 */
98static unsigned long 140static unsigned long
99nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, 141nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
100 const struct nilfs_palloc_group_desc *desc) 142 const struct nilfs_palloc_group_desc *desc)
@@ -107,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
107 return nfree; 149 return nfree;
108} 150}
109 151
152/**
153 * nilfs_palloc_group_desc_add_entries - adjust count of free entries
154 * @inode: inode of metadata file using this allocator
155 * @group: group number
156 * @desc: pointer to descriptor structure for the group
157 * @n: delta to be added
158 */
110static void 159static void
111nilfs_palloc_group_desc_add_entries(struct inode *inode, 160nilfs_palloc_group_desc_add_entries(struct inode *inode,
112 unsigned long group, 161 unsigned long group,
@@ -118,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
118 spin_unlock(nilfs_mdt_bgl_lock(inode, group)); 167 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
119} 168}
120 169
170/**
171 * nilfs_palloc_entry_blkoff - get block offset of an entry block
172 * @inode: inode of metadata file using this allocator
173 * @nr: serial number of the entry (e.g. inode number)
174 */
121static unsigned long 175static unsigned long
122nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) 176nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
123{ 177{
@@ -129,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
129 group_offset / NILFS_MDT(inode)->mi_entries_per_block; 183 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
130} 184}
131 185
186/**
187 * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
188 * @inode: inode of metadata file
189 * @bh: buffer head of the buffer to be initialized
190 * @kaddr: kernel address mapped for the page including the buffer
191 */
132static void nilfs_palloc_desc_block_init(struct inode *inode, 192static void nilfs_palloc_desc_block_init(struct inode *inode,
133 struct buffer_head *bh, void *kaddr) 193 struct buffer_head *bh, void *kaddr)
134{ 194{
@@ -179,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
179 return ret; 239 return ret;
180} 240}
181 241
242/**
243 * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
244 * @inode: inode of metadata file using this allocator
245 * @group: group number
246 * @create: create flag
247 * @bhp: pointer to store the resultant buffer head
248 */
182static int nilfs_palloc_get_desc_block(struct inode *inode, 249static int nilfs_palloc_get_desc_block(struct inode *inode,
183 unsigned long group, 250 unsigned long group,
184 int create, struct buffer_head **bhp) 251 int create, struct buffer_head **bhp)
@@ -191,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
191 bhp, &cache->prev_desc, &cache->lock); 258 bhp, &cache->prev_desc, &cache->lock);
192} 259}
193 260
261/**
262 * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
263 * @inode: inode of metadata file using this allocator
264 * @group: group number
265 * @create: create flag
266 * @bhp: pointer to store the resultant buffer head
267 */
194static int nilfs_palloc_get_bitmap_block(struct inode *inode, 268static int nilfs_palloc_get_bitmap_block(struct inode *inode,
195 unsigned long group, 269 unsigned long group,
196 int create, struct buffer_head **bhp) 270 int create, struct buffer_head **bhp)
@@ -203,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
203 &cache->prev_bitmap, &cache->lock); 277 &cache->prev_bitmap, &cache->lock);
204} 278}
205 279
280/**
281 * nilfs_palloc_get_entry_block - get buffer head of an entry block
282 * @inode: inode of metadata file using this allocator
283 * @nr: serial number of the entry (e.g. inode number)
284 * @create: create flag
285 * @bhp: pointer to store the resultant buffer head
286 */
206int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, 287int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
207 int create, struct buffer_head **bhp) 288 int create, struct buffer_head **bhp)
208{ 289{
@@ -214,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
214 &cache->prev_entry, &cache->lock); 295 &cache->prev_entry, &cache->lock);
215} 296}
216 297
298/**
299 * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
300 * @inode: inode of metadata file using this allocator
301 * @group: group number
302 * @bh: buffer head of the buffer storing the group descriptor block
303 * @kaddr: kernel address mapped for the page including the buffer
304 */
217static struct nilfs_palloc_group_desc * 305static struct nilfs_palloc_group_desc *
218nilfs_palloc_block_get_group_desc(const struct inode *inode, 306nilfs_palloc_block_get_group_desc(const struct inode *inode,
219 unsigned long group, 307 unsigned long group,
@@ -223,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
223 group % nilfs_palloc_groups_per_desc_block(inode); 311 group % nilfs_palloc_groups_per_desc_block(inode);
224} 312}
225 313
314/**
315 * nilfs_palloc_block_get_entry - get kernel address of an entry
316 * @inode: inode of metadata file using this allocator
317 * @nr: serial number of the entry (e.g. inode number)
318 * @bh: buffer head of the buffer storing the entry block
319 * @kaddr: kernel address mapped for the page including the buffer
320 */
226void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, 321void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
227 const struct buffer_head *bh, void *kaddr) 322 const struct buffer_head *bh, void *kaddr)
228{ 323{
@@ -235,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
235 entry_offset * NILFS_MDT(inode)->mi_entry_size; 330 entry_offset * NILFS_MDT(inode)->mi_entry_size;
236} 331}
237 332
333/**
334 * nilfs_palloc_find_available_slot - find available slot in a group
335 * @inode: inode of metadata file using this allocator
336 * @group: group number
337 * @target: offset number of an entry in the group (start point)
338 * @bitmap: bitmap of the group
339 * @bsize: size in bits
340 */
238static int nilfs_palloc_find_available_slot(struct inode *inode, 341static int nilfs_palloc_find_available_slot(struct inode *inode,
239 unsigned long group, 342 unsigned long group,
240 unsigned long target, 343 unsigned long target,
241 unsigned char *bitmap, 344 unsigned char *bitmap,
242 int bsize) /* size in bits */ 345 int bsize)
243{ 346{
244 int curr, pos, end, i; 347 int curr, pos, end, i;
245 348
@@ -277,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
277 return -ENOSPC; 380 return -ENOSPC;
278} 381}
279 382
383/**
384 * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
385 * in a group descriptor block
386 * @inode: inode of metadata file using this allocator
387 * @curr: current group number
388 * @max: maximum number of groups
389 */
280static unsigned long 390static unsigned long
281nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, 391nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
282 unsigned long curr, unsigned long max) 392 unsigned long curr, unsigned long max)
@@ -287,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
287 max - curr + 1); 397 max - curr + 1);
288} 398}
289 399
400/**
401 * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
402 * @inode: inode of metadata file using this allocator
403 * @req: nilfs_palloc_req structure exchanged for the allocation
404 */
290int nilfs_palloc_prepare_alloc_entry(struct inode *inode, 405int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
291 struct nilfs_palloc_req *req) 406 struct nilfs_palloc_req *req)
292{ 407{
@@ -366,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
366 return ret; 481 return ret;
367} 482}
368 483
484/**
485 * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
486 * @inode: inode of metadata file using this allocator
487 * @req: nilfs_palloc_req structure exchanged for the allocation
488 */
369void nilfs_palloc_commit_alloc_entry(struct inode *inode, 489void nilfs_palloc_commit_alloc_entry(struct inode *inode,
370 struct nilfs_palloc_req *req) 490 struct nilfs_palloc_req *req)
371{ 491{
@@ -377,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
377 brelse(req->pr_desc_bh); 497 brelse(req->pr_desc_bh);
378} 498}
379 499
500/**
501 * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
502 * @inode: inode of metadata file using this allocator
503 * @req: nilfs_palloc_req structure exchanged for the removal
504 */
380void nilfs_palloc_commit_free_entry(struct inode *inode, 505void nilfs_palloc_commit_free_entry(struct inode *inode,
381 struct nilfs_palloc_req *req) 506 struct nilfs_palloc_req *req)
382{ 507{
@@ -410,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
410 brelse(req->pr_desc_bh); 535 brelse(req->pr_desc_bh);
411} 536}
412 537
538/**
539 * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
540 * @inode: inode of metadata file using this allocator
541 * @req: nilfs_palloc_req structure exchanged for the allocation
542 */
413void nilfs_palloc_abort_alloc_entry(struct inode *inode, 543void nilfs_palloc_abort_alloc_entry(struct inode *inode,
414 struct nilfs_palloc_req *req) 544 struct nilfs_palloc_req *req)
415{ 545{
@@ -442,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
442 req->pr_desc_bh = NULL; 572 req->pr_desc_bh = NULL;
443} 573}
444 574
575/**
576 * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
577 * @inode: inode of metadata file using this allocator
578 * @req: nilfs_palloc_req structure exchanged for the removal
579 */
445int nilfs_palloc_prepare_free_entry(struct inode *inode, 580int nilfs_palloc_prepare_free_entry(struct inode *inode,
446 struct nilfs_palloc_req *req) 581 struct nilfs_palloc_req *req)
447{ 582{
@@ -464,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
464 return 0; 599 return 0;
465} 600}
466 601
602/**
603 * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
604 * @inode: inode of metadata file using this allocator
605 * @req: nilfs_palloc_req structure exchanged for the removal
606 */
467void nilfs_palloc_abort_free_entry(struct inode *inode, 607void nilfs_palloc_abort_free_entry(struct inode *inode,
468 struct nilfs_palloc_req *req) 608 struct nilfs_palloc_req *req)
469{ 609{
@@ -475,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
475 req->pr_desc_bh = NULL; 615 req->pr_desc_bh = NULL;
476} 616}
477 617
618/**
 619 * nilfs_palloc_group_is_in - test whether an entry is in a group
620 * @inode: inode of metadata file using this allocator
621 * @group: group number
622 * @nr: serial number of the entry (e.g. inode number)
623 */
478static int 624static int
479nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) 625nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
480{ 626{
@@ -485,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
485 return (nr >= first) && (nr <= last); 631 return (nr >= first) && (nr <= last);
486} 632}
487 633
634/**
635 * nilfs_palloc_freev - deallocate a set of persistent objects
636 * @inode: inode of metadata file using this allocator
637 * @entry_nrs: array of entry numbers to be deallocated
638 * @nitems: number of entries stored in @entry_nrs
639 */
488int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) 640int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
489{ 641{
490 struct buffer_head *desc_bh, *bitmap_bh; 642 struct buffer_head *desc_bh, *bitmap_bh;
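
The kernel-doc added through this hunk spells out the allocator's two-phase protocol: a caller prepares an allocation (tentatively reserving a slot found in a group bitmap), then either commits it or aborts it to roll the reservation back, and deallocation mirrors the same shape. A minimal userspace sketch of that prepare/commit/abort pattern, with a byte array standing in for the group bitmap (all names here are illustrative, not nilfs2 API):

#include <stdio.h>

#define GROUP_BITS 64

struct req { int slot; };                 /* mirrors nilfs_palloc_req's role */
static unsigned char bitmap[GROUP_BITS / 8];  /* zero-initialized: all free */

static int prepare_alloc(struct req *r)   /* reserve the first clear bit */
{
	for (int i = 0; i < GROUP_BITS; i++)
		if (!(bitmap[i / 8] & (1 << (i % 8)))) {
			bitmap[i / 8] |= 1 << (i % 8); /* tentatively taken */
			r->slot = i;
			return 0;
		}
	return -1;                        /* -ENOSPC in the kernel */
}

static void commit_alloc(struct req *r) { (void)r; /* nothing left to undo */ }

static void abort_alloc(struct req *r)    /* roll the reservation back */
{
	bitmap[r->slot / 8] &= ~(1 << (r->slot % 8));
}

int main(void)
{
	struct req r;
	if (prepare_alloc(&r) == 0) {
		/* ...finish the transaction; on failure call abort_alloc(&r) */
		commit_alloc(&r);
		printf("allocated slot %d\n", r.slot);
	}
	return 0;
}
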
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 5cccf874d692..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
29#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31 31
32/**
33 * nilfs_palloc_entries_per_group - get the number of entries per group
34 * @inode: inode of metadata file using this allocator
35 *
36 * The number of entries per group is defined by the number of bits
37 * that a bitmap block can maintain.
38 */
32static inline unsigned long 39static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode) 40nilfs_palloc_entries_per_group(const struct inode *inode)
34{ 41{
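
As the new comment notes, group capacity falls straight out of the bitmap block: one entry per bit. A quick sanity check of that arithmetic, assuming 8-bit bytes and a 4 KiB block (typical, but not required by the format):

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 4096;              /* assumed block size */
	unsigned long entries = blocksize * 8;       /* one entry per bitmap bit */
	printf("%lu entries per group\n", entries);  /* prints 32768 */
	return 0;
}
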
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index effdbdbe6c11..3dbdc1d356bf 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -26,6 +26,8 @@
26#include "nilfs.h" 26#include "nilfs.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "sb.h" 28#include "sb.h"
29#include "btree.h"
30#include "direct.h"
29#include "btnode.h" 31#include "btnode.h"
30#include "mdt.h" 32#include "mdt.h"
31#include "dat.h" 33#include "dat.h"
@@ -533,7 +535,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
533 535
534void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 536void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
535{ 537{
536 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union)); 538 memcpy(gcbmap, bmap, sizeof(*bmap));
537 init_rwsem(&gcbmap->b_sem); 539 init_rwsem(&gcbmap->b_sem);
538 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 540 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
539 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; 541 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
@@ -541,7 +543,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
541 543
542void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 544void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
543{ 545{
544 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); 546 memcpy(bmap, gcbmap, sizeof(*bmap));
545 init_rwsem(&bmap->b_sem); 547 init_rwsem(&bmap->b_sem);
546 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 548 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
547 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; 549 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
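
Both hunks keep the same discipline: the bmap is copied wholesale with memcpy() and the embedded rw-semaphore is then re-initialized, because lock state must never survive a raw byte copy; the sizeof(*bmap) form also stays correct now that the containing union is gone. The same copy-then-reinit idea in portable C with pthreads (toy struct, illustrative only):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct bmap_like {                   /* toy stand-in, not the kernel struct */
	long data[4];
	pthread_rwlock_t lock;       /* must not be memcpy'd in a usable state */
};

int main(void)
{
	struct bmap_like src, dst;

	memset(&src, 0, sizeof(src));
	pthread_rwlock_init(&src.lock, NULL);

	memcpy(&dst, &src, sizeof(dst));      /* copies the lock bytes too... */
	pthread_rwlock_init(&dst.lock, NULL); /* ...so give dst a fresh lock */

	puts("copied and re-initialized");
	return 0;
}
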
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 9980d7dbab91..a20569b19929 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -32,11 +32,6 @@
32 32
33#define NILFS_BMAP_INVALID_PTR 0 33#define NILFS_BMAP_INVALID_PTR 0
34 34
35#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
36#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
37#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
38#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
39
40#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) 35#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
41 36
42 37
@@ -71,7 +66,7 @@ struct nilfs_bmap_operations {
71 int (*bop_delete)(struct nilfs_bmap *, __u64); 66 int (*bop_delete)(struct nilfs_bmap *, __u64);
72 void (*bop_clear)(struct nilfs_bmap *); 67 void (*bop_clear)(struct nilfs_bmap *);
73 68
74 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); 69 int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *);
75 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, 70 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
76 struct list_head *); 71 struct list_head *);
77 72
@@ -110,6 +105,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
110 * @b_last_allocated_ptr: last allocated ptr for data block 105 * @b_last_allocated_ptr: last allocated ptr for data block
111 * @b_ptr_type: pointer type 106 * @b_ptr_type: pointer type
112 * @b_state: state 107 * @b_state: state
108 * @b_nchildren_per_block: maximum number of child nodes for non-root nodes
113 */ 109 */
114struct nilfs_bmap { 110struct nilfs_bmap {
115 union { 111 union {
@@ -123,6 +119,7 @@ struct nilfs_bmap {
123 __u64 b_last_allocated_ptr; 119 __u64 b_last_allocated_ptr;
124 int b_ptr_type; 120 int b_ptr_type;
125 int b_state; 121 int b_state;
122 __u16 b_nchildren_per_block;
126}; 123};
127 124
128/* pointer type */ 125/* pointer type */
@@ -224,6 +221,13 @@ static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
224 nilfs_dat_abort_end(dat, &req->bpr_req); 221 nilfs_dat_abort_end(dat, &req->bpr_req);
225} 222}
226 223
224static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key,
225 __u64 ptr)
226{
227 bmap->b_last_allocated_key = key;
228 bmap->b_last_allocated_ptr = ptr;
229}
230
227__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, 231__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
228 const struct buffer_head *); 232 const struct buffer_head *);
229 233
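
The new b_nchildren_per_block field caches a capacity that earlier code recomputed from the node size on every access. With the usual nilfs2 node layout, each child costs one 64-bit key plus one 64-bit pointer after the node header, so the value is a pure function of the block size; a hedged sketch of that computation (the header layout below is assumed, not taken from this patch):

#include <stdint.h>
#include <stdio.h>

/* assumed on-disk header; the real struct nilfs_btree_node may differ */
struct node_header { uint8_t flags, level; uint16_t nchildren; uint32_t pad; };

static int nchildren_per_block(size_t blocksize)
{
	size_t payload = blocksize - sizeof(struct node_header);
	return (int)(payload / (sizeof(uint64_t) + sizeof(uint64_t)));
}

int main(void)
{
	printf("4 KiB node holds up to %d children\n", nchildren_per_block(4096));
	return 0;
}
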
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
deleted file mode 100644
index d41509bff47b..000000000000
--- a/fs/nilfs2/bmap_union.h
+++ /dev/null
@@ -1,42 +0,0 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * nilfs_bmap_union -
32 * @bi_bmap: bmap structure
33 * @bi_btree: direct map structure
34 * @bi_direct: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
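
Dropping bmap_union.h moves the code from a union of variants to the common C embedding idiom: each variant carries struct nilfs_bmap as its first member, so a variant pointer is usable wherever a bmap pointer is expected. In miniature (names invented for illustration):

#include <stdio.h>

struct bmap  { int b_state; };
struct btree { struct bmap base; int bt_extra; };  /* base must come first */

static void bmap_op(struct bmap *b) { b->b_state = 1; }

int main(void)
{
	struct btree t = { { 0 }, 42 };
	bmap_op(&t.base);            /* the variant is usable through its base */
	printf("state=%d extra=%d\n", t.base.b_state, t.bt_extra);
	return 0;
}
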
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 447ce47a3306..f78ab1044d1d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -96,10 +96,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
96} 96}
97 97
98int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, 98int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
99 sector_t pblocknr, struct buffer_head **pbh) 99 sector_t pblocknr, int mode,
100 struct buffer_head **pbh, sector_t *submit_ptr)
100{ 101{
101 struct buffer_head *bh; 102 struct buffer_head *bh;
102 struct inode *inode = NILFS_BTNC_I(btnc); 103 struct inode *inode = NILFS_BTNC_I(btnc);
104 struct page *page;
103 int err; 105 int err;
104 106
105 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); 107 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
@@ -107,6 +109,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
107 return -ENOMEM; 109 return -ENOMEM;
108 110
109 err = -EEXIST; /* internal code */ 111 err = -EEXIST; /* internal code */
112 page = bh->b_page;
110 113
111 if (buffer_uptodate(bh) || buffer_dirty(bh)) 114 if (buffer_uptodate(bh) || buffer_dirty(bh))
112 goto found; 115 goto found;
@@ -125,7 +128,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
125 } 128 }
126 } 129 }
127 } 130 }
128 lock_buffer(bh); 131
132 if (mode == READA) {
133 if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
134 err = -EBUSY; /* internal code */
135 brelse(bh);
136 goto out_locked;
137 }
138 } else { /* mode == READ */
139 lock_buffer(bh);
140 }
129 if (buffer_uptodate(bh)) { 141 if (buffer_uptodate(bh)) {
130 unlock_buffer(bh); 142 unlock_buffer(bh);
131 err = -EEXIST; /* internal code */ 143 err = -EEXIST; /* internal code */
@@ -136,15 +148,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
136 bh->b_blocknr = pblocknr; /* set block address for read */ 148 bh->b_blocknr = pblocknr; /* set block address for read */
137 bh->b_end_io = end_buffer_read_sync; 149 bh->b_end_io = end_buffer_read_sync;
138 get_bh(bh); 150 get_bh(bh);
139 submit_bh(READ, bh); 151 submit_bh(mode, bh);
140 bh->b_blocknr = blocknr; /* set back to the given block address */ 152 bh->b_blocknr = blocknr; /* set back to the given block address */
153 *submit_ptr = pblocknr;
141 err = 0; 154 err = 0;
142found: 155found:
143 *pbh = bh; 156 *pbh = bh;
144 157
145out_locked: 158out_locked:
146 unlock_page(bh->b_page); 159 unlock_page(page);
147 page_cache_release(bh->b_page); 160 page_cache_release(page);
148 return err; 161 return err;
149} 162}
150 163
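
The reworked submit path treats readahead as strictly best-effort: a READA request is abandoned unless the target block is physically contiguous with the previous submission (pblocknr == *submit_ptr + 1) and the buffer lock can be taken without blocking. A small model of that gate in plain C (no kernel types):

#include <stdbool.h>
#include <stdio.h>

/* stand-in for trylock_buffer(); always succeeds in this toy model */
static bool try_lock(void) { return true; }

/* returns true if a readahead for 'pblocknr' should be submitted */
static bool may_read_ahead(unsigned long pblocknr, unsigned long *submit_ptr)
{
	if (pblocknr != *submit_ptr + 1 || !try_lock())
		return false;          /* non-contiguous or contended: skip */
	*submit_ptr = pblocknr;        /* remember the last submitted block */
	return true;
}

int main(void)
{
	unsigned long last = 99;
	printf("%d\n", may_read_ahead(100, &last)); /* 1: contiguous, submit */
	printf("%d\n", may_read_ahead(205, &last)); /* 0: gap, skipped */
	return 0;
}
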
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 07da83f07712..79037494f1e0 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -42,8 +42,8 @@ void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
42void nilfs_btnode_cache_clear(struct address_space *); 42void nilfs_btnode_cache_clear(struct address_space *);
43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, 43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
44 __u64 blocknr); 44 __u64 blocknr);
45int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, 45int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
46 struct buffer_head **); 46 struct buffer_head **, sector_t *);
47void nilfs_btnode_delete(struct buffer_head *); 47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *, 48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *); 49 struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 76c38e3e19d2..300c2bc00c3f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
31#include "alloc.h" 31#include "alloc.h"
32#include "dat.h" 32#include "dat.h"
33 33
34/** 34static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
35 * struct nilfs_btree_path - A path on which B-tree operations are executed
36 * @bp_bh: buffer head of node block
37 * @bp_sib_bh: buffer head of sibling node block
38 * @bp_index: index of child node
39 * @bp_oldreq: ptr end request for old ptr
40 * @bp_newreq: ptr alloc request for new ptr
41 * @bp_op: rebalance operation
42 */
43struct nilfs_btree_path {
44 struct buffer_head *bp_bh;
45 struct buffer_head *bp_sib_bh;
46 int bp_index;
47 union nilfs_bmap_ptr_req bp_oldreq;
48 union nilfs_bmap_ptr_req bp_newreq;
49 struct nilfs_btnode_chkey_ctxt bp_ctxt;
50 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
51 int, __u64 *, __u64 *);
52};
53
54/*
55 * B-tree path operations
56 */
57
58static struct kmem_cache *nilfs_btree_path_cache;
59
60int __init nilfs_btree_path_cache_init(void)
61{
62 nilfs_btree_path_cache =
63 kmem_cache_create("nilfs2_btree_path_cache",
64 sizeof(struct nilfs_btree_path) *
65 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
66 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
67}
68
69void nilfs_btree_path_cache_destroy(void)
70{
71 kmem_cache_destroy(nilfs_btree_path_cache);
72}
73
74static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
75{
76 return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
77}
78
79static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
80{ 35{
81 kmem_cache_free(nilfs_btree_path_cache, path); 36 struct nilfs_btree_path *path;
82} 37 int level = NILFS_BTREE_LEVEL_DATA;
83 38
84static void nilfs_btree_init_path(struct nilfs_btree_path *path) 39 path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
85{ 40 if (path == NULL)
86 int level; 41 goto out;
87 42
88 for (level = NILFS_BTREE_LEVEL_DATA; 43 for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
89 level < NILFS_BTREE_LEVEL_MAX;
90 level++) {
91 path[level].bp_bh = NULL; 44 path[level].bp_bh = NULL;
92 path[level].bp_sib_bh = NULL; 45 path[level].bp_sib_bh = NULL;
93 path[level].bp_index = 0; 46 path[level].bp_index = 0;
@@ -95,44 +48,28 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
95 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; 48 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
96 path[level].bp_op = NULL; 49 path[level].bp_op = NULL;
97 } 50 }
51
52out:
53 return path;
98} 54}
99 55
100static void nilfs_btree_release_path(struct nilfs_btree_path *path) 56static void nilfs_btree_free_path(struct nilfs_btree_path *path)
101{ 57{
102 int level; 58 int level = NILFS_BTREE_LEVEL_DATA;
103 59
104 for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; 60 for (; level < NILFS_BTREE_LEVEL_MAX; level++)
105 level++)
106 brelse(path[level].bp_bh); 61 brelse(path[level].bp_bh);
62
63 kmem_cache_free(nilfs_btree_path_cache, path);
107} 64}
108 65
109/* 66/*
110 * B-tree node operations 67 * B-tree node operations
111 */ 68 */
112static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, 69static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
113 struct buffer_head **bhp)
114{
115 struct address_space *btnc =
116 &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
117 int err;
118
119 err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
120 if (err)
121 return err == -EEXIST ? 0 : err;
122
123 wait_on_buffer(*bhp);
124 if (!buffer_uptodate(*bhp)) {
125 brelse(*bhp);
126 return -EIO;
127 }
128 return 0;
129}
130
131static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
132 __u64 ptr, struct buffer_head **bhp) 70 __u64 ptr, struct buffer_head **bhp)
133{ 71{
134 struct address_space *btnc = 72 struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
135 &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
136 struct buffer_head *bh; 73 struct buffer_head *bh;
137 74
138 bh = nilfs_btnode_create_block(btnc, ptr); 75 bh = nilfs_btnode_create_block(btnc, ptr);
@@ -144,71 +81,55 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
144 return 0; 81 return 0;
145} 82}
146 83
147static inline int 84static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
148nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
149{ 85{
150 return node->bn_flags; 86 return node->bn_flags;
151} 87}
152 88
153static inline void 89static void
154nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) 90nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
155{ 91{
156 node->bn_flags = flags; 92 node->bn_flags = flags;
157} 93}
158 94
159static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) 95static int nilfs_btree_node_root(const struct nilfs_btree_node *node)
160{ 96{
161 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; 97 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
162} 98}
163 99
164static inline int 100static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
165nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
166{ 101{
167 return node->bn_level; 102 return node->bn_level;
168} 103}
169 104
170static inline void 105static void
171nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) 106nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
172{ 107{
173 node->bn_level = level; 108 node->bn_level = level;
174} 109}
175 110
176static inline int 111static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
177nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
178{ 112{
179 return le16_to_cpu(node->bn_nchildren); 113 return le16_to_cpu(node->bn_nchildren);
180} 114}
181 115
182static inline void 116static void
183nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) 117nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
184{ 118{
185 node->bn_nchildren = cpu_to_le16(nchildren); 119 node->bn_nchildren = cpu_to_le16(nchildren);
186} 120}
187 121
188static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) 122static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
189{
190 return 1 << btree->bt_bmap.b_inode->i_blkbits;
191}
192
193static inline int
194nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
195 const struct nilfs_btree *btree)
196{ 123{
197 return nilfs_btree_node_root(node) ? 124 return 1 << btree->b_inode->i_blkbits;
198 NILFS_BTREE_ROOT_NCHILDREN_MIN :
199 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
200} 125}
201 126
202static inline int 127static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
203nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
204 const struct nilfs_btree *btree)
205{ 128{
206 return nilfs_btree_node_root(node) ? 129 return btree->b_nchildren_per_block;
207 NILFS_BTREE_ROOT_NCHILDREN_MAX :
208 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
209} 130}
210 131
211static inline __le64 * 132static __le64 *
212nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) 133nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
213{ 134{
214 return (__le64 *)((char *)(node + 1) + 135 return (__le64 *)((char *)(node + 1) +
@@ -216,45 +137,40 @@ nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
216 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); 137 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
217} 138}
218 139
219static inline __le64 * 140static __le64 *
220nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, 141nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax)
221 const struct nilfs_btree *btree)
222{ 142{
223 return (__le64 *)(nilfs_btree_node_dkeys(node) + 143 return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax);
224 nilfs_btree_node_nchildren_max(node, btree));
225} 144}
226 145
227static inline __u64 146static __u64
228nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) 147nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
229{ 148{
230 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); 149 return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index));
231} 150}
232 151
233static inline void 152static void
234nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) 153nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
235{ 154{
236 *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); 155 *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key);
237} 156}
238 157
239static inline __u64 158static __u64
240nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, 159nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index,
241 const struct nilfs_btree_node *node, int index) 160 int ncmax)
242{ 161{
243 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + 162 return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index));
244 index));
245} 163}
246 164
247static inline void 165static void
248nilfs_btree_node_set_ptr(struct nilfs_btree *btree, 166nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr,
249 struct nilfs_btree_node *node, int index, __u64 ptr) 167 int ncmax)
250{ 168{
251 *(nilfs_btree_node_dptrs(node, btree) + index) = 169 *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr);
252 nilfs_bmap_ptr_to_dptr(ptr);
253} 170}
254 171
255static void nilfs_btree_node_init(struct nilfs_btree *btree, 172static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags,
256 struct nilfs_btree_node *node, 173 int level, int nchildren, int ncmax,
257 int flags, int level, int nchildren,
258 const __u64 *keys, const __u64 *ptrs) 174 const __u64 *keys, const __u64 *ptrs)
259{ 175{
260 __le64 *dkeys; 176 __le64 *dkeys;
@@ -266,29 +182,28 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
266 nilfs_btree_node_set_nchildren(node, nchildren); 182 nilfs_btree_node_set_nchildren(node, nchildren);
267 183
268 dkeys = nilfs_btree_node_dkeys(node); 184 dkeys = nilfs_btree_node_dkeys(node);
269 dptrs = nilfs_btree_node_dptrs(node, btree); 185 dptrs = nilfs_btree_node_dptrs(node, ncmax);
270 for (i = 0; i < nchildren; i++) { 186 for (i = 0; i < nchildren; i++) {
271 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); 187 dkeys[i] = cpu_to_le64(keys[i]);
272 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); 188 dptrs[i] = cpu_to_le64(ptrs[i]);
273 } 189 }
274} 190}
275 191
276/* Assume the buffer heads corresponding to left and right are locked. */ 192/* Assume the buffer heads corresponding to left and right are locked. */
277static void nilfs_btree_node_move_left(struct nilfs_btree *btree, 193static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
278 struct nilfs_btree_node *left,
279 struct nilfs_btree_node *right, 194 struct nilfs_btree_node *right,
280 int n) 195 int n, int lncmax, int rncmax)
281{ 196{
282 __le64 *ldkeys, *rdkeys; 197 __le64 *ldkeys, *rdkeys;
283 __le64 *ldptrs, *rdptrs; 198 __le64 *ldptrs, *rdptrs;
284 int lnchildren, rnchildren; 199 int lnchildren, rnchildren;
285 200
286 ldkeys = nilfs_btree_node_dkeys(left); 201 ldkeys = nilfs_btree_node_dkeys(left);
287 ldptrs = nilfs_btree_node_dptrs(left, btree); 202 ldptrs = nilfs_btree_node_dptrs(left, lncmax);
288 lnchildren = nilfs_btree_node_get_nchildren(left); 203 lnchildren = nilfs_btree_node_get_nchildren(left);
289 204
290 rdkeys = nilfs_btree_node_dkeys(right); 205 rdkeys = nilfs_btree_node_dkeys(right);
291 rdptrs = nilfs_btree_node_dptrs(right, btree); 206 rdptrs = nilfs_btree_node_dptrs(right, rncmax);
292 rnchildren = nilfs_btree_node_get_nchildren(right); 207 rnchildren = nilfs_btree_node_get_nchildren(right);
293 208
294 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); 209 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
@@ -303,21 +218,20 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
303} 218}
304 219
305/* Assume that the buffer heads corresponding to left and right are locked. */ 220/* Assume that the buffer heads corresponding to left and right are locked. */
306static void nilfs_btree_node_move_right(struct nilfs_btree *btree, 221static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
307 struct nilfs_btree_node *left,
308 struct nilfs_btree_node *right, 222 struct nilfs_btree_node *right,
309 int n) 223 int n, int lncmax, int rncmax)
310{ 224{
311 __le64 *ldkeys, *rdkeys; 225 __le64 *ldkeys, *rdkeys;
312 __le64 *ldptrs, *rdptrs; 226 __le64 *ldptrs, *rdptrs;
313 int lnchildren, rnchildren; 227 int lnchildren, rnchildren;
314 228
315 ldkeys = nilfs_btree_node_dkeys(left); 229 ldkeys = nilfs_btree_node_dkeys(left);
316 ldptrs = nilfs_btree_node_dptrs(left, btree); 230 ldptrs = nilfs_btree_node_dptrs(left, lncmax);
317 lnchildren = nilfs_btree_node_get_nchildren(left); 231 lnchildren = nilfs_btree_node_get_nchildren(left);
318 232
319 rdkeys = nilfs_btree_node_dkeys(right); 233 rdkeys = nilfs_btree_node_dkeys(right);
320 rdptrs = nilfs_btree_node_dptrs(right, btree); 234 rdptrs = nilfs_btree_node_dptrs(right, rncmax);
321 rnchildren = nilfs_btree_node_get_nchildren(right); 235 rnchildren = nilfs_btree_node_get_nchildren(right);
322 236
323 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); 237 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
@@ -332,16 +246,15 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
332} 246}
333 247
334/* Assume that the buffer head corresponding to node is locked. */ 248/* Assume that the buffer head corresponding to node is locked. */
335static void nilfs_btree_node_insert(struct nilfs_btree *btree, 249static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
336 struct nilfs_btree_node *node, 250 __u64 key, __u64 ptr, int ncmax)
337 __u64 key, __u64 ptr, int index)
338{ 251{
339 __le64 *dkeys; 252 __le64 *dkeys;
340 __le64 *dptrs; 253 __le64 *dptrs;
341 int nchildren; 254 int nchildren;
342 255
343 dkeys = nilfs_btree_node_dkeys(node); 256 dkeys = nilfs_btree_node_dkeys(node);
344 dptrs = nilfs_btree_node_dptrs(node, btree); 257 dptrs = nilfs_btree_node_dptrs(node, ncmax);
345 nchildren = nilfs_btree_node_get_nchildren(node); 258 nchildren = nilfs_btree_node_get_nchildren(node);
346 if (index < nchildren) { 259 if (index < nchildren) {
347 memmove(dkeys + index + 1, dkeys + index, 260 memmove(dkeys + index + 1, dkeys + index,
@@ -349,16 +262,15 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
349 memmove(dptrs + index + 1, dptrs + index, 262 memmove(dptrs + index + 1, dptrs + index,
350 (nchildren - index) * sizeof(*dptrs)); 263 (nchildren - index) * sizeof(*dptrs));
351 } 264 }
352 dkeys[index] = nilfs_bmap_key_to_dkey(key); 265 dkeys[index] = cpu_to_le64(key);
353 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); 266 dptrs[index] = cpu_to_le64(ptr);
354 nchildren++; 267 nchildren++;
355 nilfs_btree_node_set_nchildren(node, nchildren); 268 nilfs_btree_node_set_nchildren(node, nchildren);
356} 269}
357 270
358/* Assume that the buffer head corresponding to node is locked. */ 271/* Assume that the buffer head corresponding to node is locked. */
359static void nilfs_btree_node_delete(struct nilfs_btree *btree, 272static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
360 struct nilfs_btree_node *node, 273 __u64 *keyp, __u64 *ptrp, int ncmax)
361 __u64 *keyp, __u64 *ptrp, int index)
362{ 274{
363 __u64 key; 275 __u64 key;
364 __u64 ptr; 276 __u64 ptr;
@@ -367,9 +279,9 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
367 int nchildren; 279 int nchildren;
368 280
369 dkeys = nilfs_btree_node_dkeys(node); 281 dkeys = nilfs_btree_node_dkeys(node);
370 dptrs = nilfs_btree_node_dptrs(node, btree); 282 dptrs = nilfs_btree_node_dptrs(node, ncmax);
371 key = nilfs_bmap_dkey_to_key(dkeys[index]); 283 key = le64_to_cpu(dkeys[index]);
372 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); 284 ptr = le64_to_cpu(dptrs[index]);
373 nchildren = nilfs_btree_node_get_nchildren(node); 285 nchildren = nilfs_btree_node_get_nchildren(node);
374 if (keyp != NULL) 286 if (keyp != NULL)
375 *keyp = key; 287 *keyp = key;
@@ -425,40 +337,92 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
425 return s == 0; 337 return s == 0;
426} 338}
427 339
428static inline struct nilfs_btree_node * 340/**
429nilfs_btree_get_root(const struct nilfs_btree *btree) 341 * nilfs_btree_node_broken - verify consistency of btree node
342 * @node: btree node block to be examined
343 * @size: node size (in bytes)
344 * @blocknr: block number
345 *
346 * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
347 */
348static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
349 size_t size, sector_t blocknr)
350{
351 int level, flags, nchildren;
352 int ret = 0;
353
354 level = nilfs_btree_node_get_level(node);
355 flags = nilfs_btree_node_get_flags(node);
356 nchildren = nilfs_btree_node_get_nchildren(node);
357
358 if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
359 level >= NILFS_BTREE_LEVEL_MAX ||
360 (flags & NILFS_BTREE_NODE_ROOT) ||
361 nchildren < 0 ||
362 nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
363 printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
364 "level = %d, flags = 0x%x, nchildren = %d\n",
365 (unsigned long long)blocknr, level, flags, nchildren);
366 ret = 1;
367 }
368 return ret;
369}
370
371int nilfs_btree_broken_node_block(struct buffer_head *bh)
430{ 372{
431 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; 373 int ret;
374
375 if (buffer_nilfs_checked(bh))
376 return 0;
377
378 ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
379 bh->b_size, bh->b_blocknr);
380 if (likely(!ret))
381 set_buffer_nilfs_checked(bh);
382 return ret;
432} 383}
433 384
434static inline struct nilfs_btree_node * 385static struct nilfs_btree_node *
386nilfs_btree_get_root(const struct nilfs_bmap *btree)
387{
388 return (struct nilfs_btree_node *)btree->b_u.u_data;
389}
390
391static struct nilfs_btree_node *
435nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) 392nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
436{ 393{
437 return (struct nilfs_btree_node *)path[level].bp_bh->b_data; 394 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
438} 395}
439 396
440static inline struct nilfs_btree_node * 397static struct nilfs_btree_node *
441nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) 398nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
442{ 399{
443 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; 400 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
444} 401}
445 402
446static inline int nilfs_btree_height(const struct nilfs_btree *btree) 403static int nilfs_btree_height(const struct nilfs_bmap *btree)
447{ 404{
448 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; 405 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
449} 406}
450 407
451static inline struct nilfs_btree_node * 408static struct nilfs_btree_node *
452nilfs_btree_get_node(const struct nilfs_btree *btree, 409nilfs_btree_get_node(const struct nilfs_bmap *btree,
453 const struct nilfs_btree_path *path, 410 const struct nilfs_btree_path *path,
454 int level) 411 int level, int *ncmaxp)
455{ 412{
456 return (level == nilfs_btree_height(btree) - 1) ? 413 struct nilfs_btree_node *node;
457 nilfs_btree_get_root(btree) : 414
458 nilfs_btree_get_nonroot_node(path, level); 415 if (level == nilfs_btree_height(btree) - 1) {
416 node = nilfs_btree_get_root(btree);
417 *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
418 } else {
419 node = nilfs_btree_get_nonroot_node(path, level);
420 *ncmaxp = nilfs_btree_nchildren_per_block(btree);
421 }
422 return node;
459} 423}
460 424
461static inline int 425static int
462nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) 426nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
463{ 427{
464 if (unlikely(nilfs_btree_node_get_level(node) != level)) { 428 if (unlikely(nilfs_btree_node_get_level(node) != level)) {
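
nilfs_btree_broken_node_block() in the previous hunk validates a node block once and memoizes a successful verdict in a buffer flag, so repeated reads of a hot node skip the check while failures are re-examined. The check-once shape, modeled in userspace (the flag is a stand-in for buffer_nilfs_checked()):

#include <stdbool.h>
#include <stdio.h>

struct block {
	bool checked;          /* plays the role of the nilfs_checked bit */
	int level;             /* field the validator inspects */
};

static bool node_broken(const struct block *b) { return b->level < 1; }

static int verify_once(struct block *b)
{
	if (b->checked)
		return 0;              /* already validated earlier */
	if (node_broken(b))
		return -1;             /* do not cache a failure */
	b->checked = true;             /* cache success only */
	return 0;
}

int main(void)
{
	struct block b = { false, 2 };
	printf("%d %d\n", verify_once(&b), verify_once(&b)); /* 0 0 */
	return 0;
}
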
@@ -470,13 +434,83 @@ nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
470 return 0; 434 return 0;
471} 435}
472 436
473static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, 437struct nilfs_btree_readahead_info {
438 struct nilfs_btree_node *node; /* parent node */
 439 int max_ra_blocks; /* max number of blocks to read ahead */
 440 int index; /* current index on the parent node */
 441 int ncmax; /* number of children in the parent node */
442};
443
444static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
445 struct buffer_head **bhp,
446 const struct nilfs_btree_readahead_info *ra)
447{
448 struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
449 struct buffer_head *bh, *ra_bh;
450 sector_t submit_ptr = 0;
451 int ret;
452
453 ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
454 if (ret) {
455 if (ret != -EEXIST)
456 return ret;
457 goto out_check;
458 }
459
460 if (ra) {
461 int i, n;
462 __u64 ptr2;
463
464 /* read ahead sibling nodes */
465 for (n = ra->max_ra_blocks, i = ra->index + 1;
466 n > 0 && i < ra->ncmax; n--, i++) {
467 ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
468
469 ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
470 &ra_bh, &submit_ptr);
471 if (likely(!ret || ret == -EEXIST))
472 brelse(ra_bh);
473 else if (ret != -EBUSY)
474 break;
475 if (!buffer_locked(bh))
476 goto out_no_wait;
477 }
478 }
479
480 wait_on_buffer(bh);
481
482 out_no_wait:
483 if (!buffer_uptodate(bh)) {
484 brelse(bh);
485 return -EIO;
486 }
487
488 out_check:
489 if (nilfs_btree_broken_node_block(bh)) {
490 clear_buffer_uptodate(bh);
491 brelse(bh);
492 return -EINVAL;
493 }
494
495 *bhp = bh;
496 return 0;
497}
498
499static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
500 struct buffer_head **bhp)
501{
502 return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
503}
504
505static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
474 struct nilfs_btree_path *path, 506 struct nilfs_btree_path *path,
475 __u64 key, __u64 *ptrp, int minlevel) 507 __u64 key, __u64 *ptrp, int minlevel,
508 int readahead)
476{ 509{
477 struct nilfs_btree_node *node; 510 struct nilfs_btree_node *node;
511 struct nilfs_btree_readahead_info p, *ra;
478 __u64 ptr; 512 __u64 ptr;
479 int level, index, found, ret; 513 int level, index, found, ncmax, ret;
480 514
481 node = nilfs_btree_get_root(btree); 515 node = nilfs_btree_get_root(btree);
482 level = nilfs_btree_node_get_level(node); 516 level = nilfs_btree_node_get_level(node);
@@ -484,14 +518,27 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
484 return -ENOENT; 518 return -ENOENT;
485 519
486 found = nilfs_btree_node_lookup(node, key, &index); 520 found = nilfs_btree_node_lookup(node, key, &index);
487 ptr = nilfs_btree_node_get_ptr(btree, node, index); 521 ptr = nilfs_btree_node_get_ptr(node, index,
522 NILFS_BTREE_ROOT_NCHILDREN_MAX);
488 path[level].bp_bh = NULL; 523 path[level].bp_bh = NULL;
489 path[level].bp_index = index; 524 path[level].bp_index = index;
490 525
491 for (level--; level >= minlevel; level--) { 526 ncmax = nilfs_btree_nchildren_per_block(btree);
492 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 527
528 while (--level >= minlevel) {
529 ra = NULL;
530 if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
531 p.node = nilfs_btree_get_node(btree, path, level + 1,
532 &p.ncmax);
533 p.index = index;
534 p.max_ra_blocks = 7;
535 ra = &p;
536 }
537 ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
538 ra);
493 if (ret < 0) 539 if (ret < 0)
494 return ret; 540 return ret;
541
495 node = nilfs_btree_get_nonroot_node(path, level); 542 node = nilfs_btree_get_nonroot_node(path, level);
496 if (nilfs_btree_bad_node(node, level)) 543 if (nilfs_btree_bad_node(node, level))
497 return -EINVAL; 544 return -EINVAL;
@@ -499,9 +546,9 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
499 found = nilfs_btree_node_lookup(node, key, &index); 546 found = nilfs_btree_node_lookup(node, key, &index);
500 else 547 else
501 index = 0; 548 index = 0;
502 if (index < nilfs_btree_node_nchildren_max(node, btree)) 549 if (index < ncmax) {
503 ptr = nilfs_btree_node_get_ptr(btree, node, index); 550 ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
504 else { 551 } else {
505 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); 552 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
506 /* insert */ 553 /* insert */
507 ptr = NILFS_BMAP_INVALID_PTR; 554 ptr = NILFS_BMAP_INVALID_PTR;
@@ -517,22 +564,24 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
517 return 0; 564 return 0;
518} 565}
519 566
520static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, 567static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
521 struct nilfs_btree_path *path, 568 struct nilfs_btree_path *path,
522 __u64 *keyp, __u64 *ptrp) 569 __u64 *keyp, __u64 *ptrp)
523{ 570{
524 struct nilfs_btree_node *node; 571 struct nilfs_btree_node *node;
525 __u64 ptr; 572 __u64 ptr;
526 int index, level, ret; 573 int index, level, ncmax, ret;
527 574
528 node = nilfs_btree_get_root(btree); 575 node = nilfs_btree_get_root(btree);
529 index = nilfs_btree_node_get_nchildren(node) - 1; 576 index = nilfs_btree_node_get_nchildren(node) - 1;
530 if (index < 0) 577 if (index < 0)
531 return -ENOENT; 578 return -ENOENT;
532 level = nilfs_btree_node_get_level(node); 579 level = nilfs_btree_node_get_level(node);
533 ptr = nilfs_btree_node_get_ptr(btree, node, index); 580 ptr = nilfs_btree_node_get_ptr(node, index,
581 NILFS_BTREE_ROOT_NCHILDREN_MAX);
534 path[level].bp_bh = NULL; 582 path[level].bp_bh = NULL;
535 path[level].bp_index = index; 583 path[level].bp_index = index;
584 ncmax = nilfs_btree_nchildren_per_block(btree);
536 585
537 for (level--; level > 0; level--) { 586 for (level--; level > 0; level--) {
538 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 587 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
@@ -542,7 +591,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
542 if (nilfs_btree_bad_node(node, level)) 591 if (nilfs_btree_bad_node(node, level))
543 return -EINVAL; 592 return -EINVAL;
544 index = nilfs_btree_node_get_nchildren(node) - 1; 593 index = nilfs_btree_node_get_nchildren(node) - 1;
545 ptr = nilfs_btree_node_get_ptr(btree, node, index); 594 ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
546 path[level].bp_index = index; 595 path[level].bp_index = index;
547 } 596 }
548 597
@@ -554,53 +603,45 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
554 return 0; 603 return 0;
555} 604}
556 605
557static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, 606static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
558 __u64 key, int level, __u64 *ptrp) 607 __u64 key, int level, __u64 *ptrp)
559{ 608{
560 struct nilfs_btree *btree;
561 struct nilfs_btree_path *path; 609 struct nilfs_btree_path *path;
562 __u64 ptr;
563 int ret; 610 int ret;
564 611
565 btree = (struct nilfs_btree *)bmap;
566 path = nilfs_btree_alloc_path(); 612 path = nilfs_btree_alloc_path();
567 if (path == NULL) 613 if (path == NULL)
568 return -ENOMEM; 614 return -ENOMEM;
569 nilfs_btree_init_path(path);
570 615
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 616 ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0);
572
573 if (ptrp != NULL)
574 *ptrp = ptr;
575 617
576 nilfs_btree_release_path(path);
577 nilfs_btree_free_path(path); 618 nilfs_btree_free_path(path);
578 619
579 return ret; 620 return ret;
580} 621}
581 622
582static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, 623static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
583 __u64 key, __u64 *ptrp, unsigned maxblocks) 624 __u64 key, __u64 *ptrp, unsigned maxblocks)
584{ 625{
585 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
586 struct nilfs_btree_path *path; 626 struct nilfs_btree_path *path;
587 struct nilfs_btree_node *node; 627 struct nilfs_btree_node *node;
588 struct inode *dat = NULL; 628 struct inode *dat = NULL;
589 __u64 ptr, ptr2; 629 __u64 ptr, ptr2;
590 sector_t blocknr; 630 sector_t blocknr;
591 int level = NILFS_BTREE_LEVEL_NODE_MIN; 631 int level = NILFS_BTREE_LEVEL_NODE_MIN;
592 int ret, cnt, index, maxlevel; 632 int ret, cnt, index, maxlevel, ncmax;
633 struct nilfs_btree_readahead_info p;
593 634
594 path = nilfs_btree_alloc_path(); 635 path = nilfs_btree_alloc_path();
595 if (path == NULL) 636 if (path == NULL)
596 return -ENOMEM; 637 return -ENOMEM;
597 nilfs_btree_init_path(path); 638
598 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 639 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1);
599 if (ret < 0) 640 if (ret < 0)
600 goto out; 641 goto out;
601 642
602 if (NILFS_BMAP_USE_VBN(bmap)) { 643 if (NILFS_BMAP_USE_VBN(btree)) {
603 dat = nilfs_bmap_get_dat(bmap); 644 dat = nilfs_bmap_get_dat(btree);
604 ret = nilfs_dat_translate(dat, ptr, &blocknr); 645 ret = nilfs_dat_translate(dat, ptr, &blocknr);
605 if (ret < 0) 646 if (ret < 0)
606 goto out; 647 goto out;
@@ -611,14 +652,14 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
611 goto end; 652 goto end;
612 653
613 maxlevel = nilfs_btree_height(btree) - 1; 654 maxlevel = nilfs_btree_height(btree) - 1;
614 node = nilfs_btree_get_node(btree, path, level); 655 node = nilfs_btree_get_node(btree, path, level, &ncmax);
615 index = path[level].bp_index + 1; 656 index = path[level].bp_index + 1;
616 for (;;) { 657 for (;;) {
617 while (index < nilfs_btree_node_get_nchildren(node)) { 658 while (index < nilfs_btree_node_get_nchildren(node)) {
618 if (nilfs_btree_node_get_key(node, index) != 659 if (nilfs_btree_node_get_key(node, index) !=
619 key + cnt) 660 key + cnt)
620 goto end; 661 goto end;
621 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 662 ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax);
622 if (dat) { 663 if (dat) {
623 ret = nilfs_dat_translate(dat, ptr2, &blocknr); 664 ret = nilfs_dat_translate(dat, ptr2, &blocknr);
624 if (ret < 0) 665 if (ret < 0)
@@ -634,20 +675,24 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
634 break; 675 break;
635 676
636 /* look-up right sibling node */ 677 /* look-up right sibling node */
637 node = nilfs_btree_get_node(btree, path, level + 1); 678 p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax);
638 index = path[level + 1].bp_index + 1; 679 p.index = path[level + 1].bp_index + 1;
639 if (index >= nilfs_btree_node_get_nchildren(node) || 680 p.max_ra_blocks = 7;
640 nilfs_btree_node_get_key(node, index) != key + cnt) 681 if (p.index >= nilfs_btree_node_get_nchildren(p.node) ||
682 nilfs_btree_node_get_key(p.node, p.index) != key + cnt)
641 break; 683 break;
642 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 684 ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax);
643 path[level + 1].bp_index = index; 685 path[level + 1].bp_index = p.index;
644 686
645 brelse(path[level].bp_bh); 687 brelse(path[level].bp_bh);
646 path[level].bp_bh = NULL; 688 path[level].bp_bh = NULL;
647 ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); 689
690 ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh,
691 &p);
648 if (ret < 0) 692 if (ret < 0)
649 goto out; 693 goto out;
650 node = nilfs_btree_get_nonroot_node(path, level); 694 node = nilfs_btree_get_nonroot_node(path, level);
695 ncmax = nilfs_btree_nchildren_per_block(btree);
651 index = 0; 696 index = 0;
652 path[level].bp_index = index; 697 path[level].bp_index = index;
653 } 698 }
@@ -655,12 +700,11 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
655 *ptrp = ptr; 700 *ptrp = ptr;
656 ret = cnt; 701 ret = cnt;
657 out: 702 out:
658 nilfs_btree_release_path(path);
659 nilfs_btree_free_path(path); 703 nilfs_btree_free_path(path);
660 return ret; 704 return ret;
661} 705}
662 706
663static void nilfs_btree_promote_key(struct nilfs_btree *btree, 707static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
664 struct nilfs_btree_path *path, 708 struct nilfs_btree_path *path,
665 int level, __u64 key) 709 int level, __u64 key)
666{ 710{
@@ -682,16 +726,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
682 } 726 }
683} 727}
684 728
685static void nilfs_btree_do_insert(struct nilfs_btree *btree, 729static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
686 struct nilfs_btree_path *path, 730 struct nilfs_btree_path *path,
687 int level, __u64 *keyp, __u64 *ptrp) 731 int level, __u64 *keyp, __u64 *ptrp)
688{ 732{
689 struct nilfs_btree_node *node; 733 struct nilfs_btree_node *node;
734 int ncblk;
690 735
691 if (level < nilfs_btree_height(btree) - 1) { 736 if (level < nilfs_btree_height(btree) - 1) {
692 node = nilfs_btree_get_nonroot_node(path, level); 737 node = nilfs_btree_get_nonroot_node(path, level);
693 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 738 ncblk = nilfs_btree_nchildren_per_block(btree);
694 path[level].bp_index); 739 nilfs_btree_node_insert(node, path[level].bp_index,
740 *keyp, *ptrp, ncblk);
695 if (!buffer_dirty(path[level].bp_bh)) 741 if (!buffer_dirty(path[level].bp_bh))
696 nilfs_btnode_mark_dirty(path[level].bp_bh); 742 nilfs_btnode_mark_dirty(path[level].bp_bh);
697 743
@@ -701,22 +747,24 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
701 0)); 747 0));
702 } else { 748 } else {
703 node = nilfs_btree_get_root(btree); 749 node = nilfs_btree_get_root(btree);
704 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 750 nilfs_btree_node_insert(node, path[level].bp_index,
705 path[level].bp_index); 751 *keyp, *ptrp,
752 NILFS_BTREE_ROOT_NCHILDREN_MAX);
706 } 753 }
707} 754}
708 755
709static void nilfs_btree_carry_left(struct nilfs_btree *btree, 756static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
710 struct nilfs_btree_path *path, 757 struct nilfs_btree_path *path,
711 int level, __u64 *keyp, __u64 *ptrp) 758 int level, __u64 *keyp, __u64 *ptrp)
712{ 759{
713 struct nilfs_btree_node *node, *left; 760 struct nilfs_btree_node *node, *left;
714 int nchildren, lnchildren, n, move; 761 int nchildren, lnchildren, n, move, ncblk;
715 762
716 node = nilfs_btree_get_nonroot_node(path, level); 763 node = nilfs_btree_get_nonroot_node(path, level);
717 left = nilfs_btree_get_sib_node(path, level); 764 left = nilfs_btree_get_sib_node(path, level);
718 nchildren = nilfs_btree_node_get_nchildren(node); 765 nchildren = nilfs_btree_node_get_nchildren(node);
719 lnchildren = nilfs_btree_node_get_nchildren(left); 766 lnchildren = nilfs_btree_node_get_nchildren(left);
767 ncblk = nilfs_btree_nchildren_per_block(btree);
720 move = 0; 768 move = 0;
721 769
722 n = (nchildren + lnchildren + 1) / 2 - lnchildren; 770 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -726,7 +774,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
726 move = 1; 774 move = 1;
727 } 775 }
728 776
729 nilfs_btree_node_move_left(btree, left, node, n); 777 nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
730 778
731 if (!buffer_dirty(path[level].bp_bh)) 779 if (!buffer_dirty(path[level].bp_bh))
732 nilfs_btnode_mark_dirty(path[level].bp_bh); 780 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -751,17 +799,18 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
751 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 799 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
752} 800}
753 801
754static void nilfs_btree_carry_right(struct nilfs_btree *btree, 802static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
755 struct nilfs_btree_path *path, 803 struct nilfs_btree_path *path,
756 int level, __u64 *keyp, __u64 *ptrp) 804 int level, __u64 *keyp, __u64 *ptrp)
757{ 805{
758 struct nilfs_btree_node *node, *right; 806 struct nilfs_btree_node *node, *right;
759 int nchildren, rnchildren, n, move; 807 int nchildren, rnchildren, n, move, ncblk;
760 808
761 node = nilfs_btree_get_nonroot_node(path, level); 809 node = nilfs_btree_get_nonroot_node(path, level);
762 right = nilfs_btree_get_sib_node(path, level); 810 right = nilfs_btree_get_sib_node(path, level);
763 nchildren = nilfs_btree_node_get_nchildren(node); 811 nchildren = nilfs_btree_node_get_nchildren(node);
764 rnchildren = nilfs_btree_node_get_nchildren(right); 812 rnchildren = nilfs_btree_node_get_nchildren(right);
813 ncblk = nilfs_btree_nchildren_per_block(btree);
765 move = 0; 814 move = 0;
766 815
767 n = (nchildren + rnchildren + 1) / 2 - rnchildren; 816 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -771,7 +820,7 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
771 move = 1; 820 move = 1;
772 } 821 }
773 822
774 nilfs_btree_node_move_right(btree, node, right, n); 823 nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
775 824
776 if (!buffer_dirty(path[level].bp_bh)) 825 if (!buffer_dirty(path[level].bp_bh))
777 nilfs_btnode_mark_dirty(path[level].bp_bh); 826 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -797,18 +846,19 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
797 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 846 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
798} 847}
799 848
800static void nilfs_btree_split(struct nilfs_btree *btree, 849static void nilfs_btree_split(struct nilfs_bmap *btree,
801 struct nilfs_btree_path *path, 850 struct nilfs_btree_path *path,
802 int level, __u64 *keyp, __u64 *ptrp) 851 int level, __u64 *keyp, __u64 *ptrp)
803{ 852{
804 struct nilfs_btree_node *node, *right; 853 struct nilfs_btree_node *node, *right;
805 __u64 newkey; 854 __u64 newkey;
806 __u64 newptr; 855 __u64 newptr;
807 int nchildren, n, move; 856 int nchildren, n, move, ncblk;
808 857
809 node = nilfs_btree_get_nonroot_node(path, level); 858 node = nilfs_btree_get_nonroot_node(path, level);
810 right = nilfs_btree_get_sib_node(path, level); 859 right = nilfs_btree_get_sib_node(path, level);
811 nchildren = nilfs_btree_node_get_nchildren(node); 860 nchildren = nilfs_btree_node_get_nchildren(node);
861 ncblk = nilfs_btree_nchildren_per_block(btree);
812 move = 0; 862 move = 0;
813 863
814 n = (nchildren + 1) / 2; 864 n = (nchildren + 1) / 2;
@@ -817,7 +867,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
817 move = 1; 867 move = 1;
818 } 868 }
819 869
820 nilfs_btree_node_move_right(btree, node, right, n); 870 nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
821 871
822 if (!buffer_dirty(path[level].bp_bh)) 872 if (!buffer_dirty(path[level].bp_bh))
823 nilfs_btnode_mark_dirty(path[level].bp_bh); 873 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -829,8 +879,8 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
829 879
830 if (move) { 880 if (move) {
831 path[level].bp_index -= nilfs_btree_node_get_nchildren(node); 881 path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
832 nilfs_btree_node_insert(btree, right, *keyp, *ptrp, 882 nilfs_btree_node_insert(right, path[level].bp_index,
833 path[level].bp_index); 883 *keyp, *ptrp, ncblk);
834 884
835 *keyp = nilfs_btree_node_get_key(right, 0); 885 *keyp = nilfs_btree_node_get_key(right, 0);
836 *ptrp = path[level].bp_newreq.bpr_ptr; 886 *ptrp = path[level].bp_newreq.bpr_ptr;
@@ -851,19 +901,21 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
851 path[level + 1].bp_index++; 901 path[level + 1].bp_index++;
852} 902}
853 903
854static void nilfs_btree_grow(struct nilfs_btree *btree, 904static void nilfs_btree_grow(struct nilfs_bmap *btree,
855 struct nilfs_btree_path *path, 905 struct nilfs_btree_path *path,
856 int level, __u64 *keyp, __u64 *ptrp) 906 int level, __u64 *keyp, __u64 *ptrp)
857{ 907{
858 struct nilfs_btree_node *root, *child; 908 struct nilfs_btree_node *root, *child;
859 int n; 909 int n, ncblk;
860 910
861 root = nilfs_btree_get_root(btree); 911 root = nilfs_btree_get_root(btree);
862 child = nilfs_btree_get_sib_node(path, level); 912 child = nilfs_btree_get_sib_node(path, level);
913 ncblk = nilfs_btree_nchildren_per_block(btree);
863 914
864 n = nilfs_btree_node_get_nchildren(root); 915 n = nilfs_btree_node_get_nchildren(root);
865 916
866 nilfs_btree_node_move_right(btree, root, child, n); 917 nilfs_btree_node_move_right(root, child, n,
918 NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
867 nilfs_btree_node_set_level(root, level + 1); 919 nilfs_btree_node_set_level(root, level + 1);
868 920
869 if (!buffer_dirty(path[level].bp_sib_bh)) 921 if (!buffer_dirty(path[level].bp_sib_bh))
@@ -878,11 +930,11 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
878 *ptrp = path[level].bp_newreq.bpr_ptr; 930 *ptrp = path[level].bp_newreq.bpr_ptr;
879} 931}
880 932
881static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, 933static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
882 const struct nilfs_btree_path *path) 934 const struct nilfs_btree_path *path)
883{ 935{
884 struct nilfs_btree_node *node; 936 struct nilfs_btree_node *node;
885 int level; 937 int level, ncmax;
886 938
887 if (path == NULL) 939 if (path == NULL)
888 return NILFS_BMAP_INVALID_PTR; 940 return NILFS_BMAP_INVALID_PTR;
@@ -890,29 +942,30 @@ static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
890 /* left sibling */ 942 /* left sibling */
891 level = NILFS_BTREE_LEVEL_NODE_MIN; 943 level = NILFS_BTREE_LEVEL_NODE_MIN;
892 if (path[level].bp_index > 0) { 944 if (path[level].bp_index > 0) {
893 node = nilfs_btree_get_node(btree, path, level); 945 node = nilfs_btree_get_node(btree, path, level, &ncmax);
894 return nilfs_btree_node_get_ptr(btree, node, 946 return nilfs_btree_node_get_ptr(node,
895 path[level].bp_index - 1); 947 path[level].bp_index - 1,
948 ncmax);
896 } 949 }
897 950
898 /* parent */ 951 /* parent */
899 level = NILFS_BTREE_LEVEL_NODE_MIN + 1; 952 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
900 if (level <= nilfs_btree_height(btree) - 1) { 953 if (level <= nilfs_btree_height(btree) - 1) {
901 node = nilfs_btree_get_node(btree, path, level); 954 node = nilfs_btree_get_node(btree, path, level, &ncmax);
902 return nilfs_btree_node_get_ptr(btree, node, 955 return nilfs_btree_node_get_ptr(node, path[level].bp_index,
903 path[level].bp_index); 956 ncmax);
904 } 957 }
905 958
906 return NILFS_BMAP_INVALID_PTR; 959 return NILFS_BMAP_INVALID_PTR;
907} 960}
908 961
909static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, 962static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
910 const struct nilfs_btree_path *path, 963 const struct nilfs_btree_path *path,
911 __u64 key) 964 __u64 key)
912{ 965{
913 __u64 ptr; 966 __u64 ptr;
914 967
915 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); 968 ptr = nilfs_bmap_find_target_seq(btree, key);
916 if (ptr != NILFS_BMAP_INVALID_PTR) 969 if (ptr != NILFS_BMAP_INVALID_PTR)
917 /* sequential access */ 970 /* sequential access */
918 return ptr; 971 return ptr;
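nilfs_btree_find_near() is an allocation-locality heuristic: when a new block needs a home, prefer the pointer just left of the insertion point, else the pointer the parent holds for the current node, so related blocks tend to land near each other on disk. A minimal model of that two-step fallback (simplified types, illustrative names):

#include <stdint.h>

#define MODEL_INVALID_PTR ((uint64_t)0)

/* ptrs[]: child pointers of the lowest node on the lookup path;
 * index: the insertion position in that node; parent_ptr: the block
 * holding the node itself, or MODEL_INVALID_PTR when unavailable. */
static uint64_t model_find_near(const uint64_t *ptrs, int index,
				uint64_t parent_ptr)
{
	if (index > 0)
		return ptrs[index - 1];	/* left neighbor: best locality */
	return parent_ptr;		/* else allocate near the parent */
}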
@@ -923,17 +976,10 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
923 return ptr; 976 return ptr;
924 } 977 }
925 /* block group */ 978 /* block group */
926 return nilfs_bmap_find_target_in_group(&btree->bt_bmap); 979 return nilfs_bmap_find_target_in_group(btree);
927} 980}
928 981
929static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key, 982static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
930 __u64 ptr)
931{
932 btree->bt_bmap.b_last_allocated_key = key;
933 btree->bt_bmap.b_last_allocated_ptr = ptr;
934}
935
936static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
937 struct nilfs_btree_path *path, 983 struct nilfs_btree_path *path,
938 int *levelp, __u64 key, __u64 ptr, 984 int *levelp, __u64 key, __u64 ptr,
939 struct nilfs_bmap_stats *stats) 985 struct nilfs_bmap_stats *stats)
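The helper deleted in the left column above did nothing but cache the last allocated key/pointer pair on the bmap, and the nilfs_bmap_set_target_v() that replaces it presumably does the same one layer down. A sketch of that cache, with illustrative names:

#include <stdint.h>

struct model_bmap {
	uint64_t last_allocated_key;
	uint64_t last_allocated_ptr;
};

/* Remember the most recent allocation so the next one can detect
 * sequential access and place its block immediately afterwards. */
static inline void model_set_target_v(struct model_bmap *bmap,
				      uint64_t key, uint64_t ptr)
{
	bmap->last_allocated_key = key;
	bmap->last_allocated_ptr = ptr;
}

nilfs_btree_find_target_v() above is the consumer: a hit from nilfs_bmap_find_target_seq() means sequential access, otherwise it tries the sibling/parent heuristic and finally falls back to the block group.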
@@ -941,79 +987,78 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
941 struct buffer_head *bh; 987 struct buffer_head *bh;
942 struct nilfs_btree_node *node, *parent, *sib; 988 struct nilfs_btree_node *node, *parent, *sib;
943 __u64 sibptr; 989 __u64 sibptr;
944 int pindex, level, ret; 990 int pindex, level, ncmax, ncblk, ret;
945 struct inode *dat = NULL; 991 struct inode *dat = NULL;
946 992
947 stats->bs_nblocks = 0; 993 stats->bs_nblocks = 0;
948 level = NILFS_BTREE_LEVEL_DATA; 994 level = NILFS_BTREE_LEVEL_DATA;
949 995
950 /* allocate a new ptr for data block */ 996 /* allocate a new ptr for data block */
951 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { 997 if (NILFS_BMAP_USE_VBN(btree)) {
952 path[level].bp_newreq.bpr_ptr = 998 path[level].bp_newreq.bpr_ptr =
953 nilfs_btree_find_target_v(btree, path, key); 999 nilfs_btree_find_target_v(btree, path, key);
954 dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1000 dat = nilfs_bmap_get_dat(btree);
955 } 1001 }
956 1002
957 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1003 ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
958 &path[level].bp_newreq, dat);
959 if (ret < 0) 1004 if (ret < 0)
960 goto err_out_data; 1005 goto err_out_data;
961 1006
1007 ncblk = nilfs_btree_nchildren_per_block(btree);
1008
962 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1009 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
963 level < nilfs_btree_height(btree) - 1; 1010 level < nilfs_btree_height(btree) - 1;
964 level++) { 1011 level++) {
965 node = nilfs_btree_get_nonroot_node(path, level); 1012 node = nilfs_btree_get_nonroot_node(path, level);
966 if (nilfs_btree_node_get_nchildren(node) < 1013 if (nilfs_btree_node_get_nchildren(node) < ncblk) {
967 nilfs_btree_node_nchildren_max(node, btree)) {
968 path[level].bp_op = nilfs_btree_do_insert; 1014 path[level].bp_op = nilfs_btree_do_insert;
969 stats->bs_nblocks++; 1015 stats->bs_nblocks++;
970 goto out; 1016 goto out;
971 } 1017 }
972 1018
973 parent = nilfs_btree_get_node(btree, path, level + 1); 1019 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
974 pindex = path[level + 1].bp_index; 1020 pindex = path[level + 1].bp_index;
975 1021
976 /* left sibling */ 1022 /* left sibling */
977 if (pindex > 0) { 1023 if (pindex > 0) {
978 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1024 sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
979 pindex - 1); 1025 ncmax);
980 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1026 ret = nilfs_btree_get_block(btree, sibptr, &bh);
981 if (ret < 0) 1027 if (ret < 0)
982 goto err_out_child_node; 1028 goto err_out_child_node;
983 sib = (struct nilfs_btree_node *)bh->b_data; 1029 sib = (struct nilfs_btree_node *)bh->b_data;
984 if (nilfs_btree_node_get_nchildren(sib) < 1030 if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
985 nilfs_btree_node_nchildren_max(sib, btree)) {
986 path[level].bp_sib_bh = bh; 1031 path[level].bp_sib_bh = bh;
987 path[level].bp_op = nilfs_btree_carry_left; 1032 path[level].bp_op = nilfs_btree_carry_left;
988 stats->bs_nblocks++; 1033 stats->bs_nblocks++;
989 goto out; 1034 goto out;
990 } else 1035 } else {
991 brelse(bh); 1036 brelse(bh);
1037 }
992 } 1038 }
993 1039
994 /* right sibling */ 1040 /* right sibling */
995 if (pindex < 1041 if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
996 nilfs_btree_node_get_nchildren(parent) - 1) { 1042 sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
997 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1043 ncmax);
998 pindex + 1);
999 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1044 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1000 if (ret < 0) 1045 if (ret < 0)
1001 goto err_out_child_node; 1046 goto err_out_child_node;
1002 sib = (struct nilfs_btree_node *)bh->b_data; 1047 sib = (struct nilfs_btree_node *)bh->b_data;
1003 if (nilfs_btree_node_get_nchildren(sib) < 1048 if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
1004 nilfs_btree_node_nchildren_max(sib, btree)) {
1005 path[level].bp_sib_bh = bh; 1049 path[level].bp_sib_bh = bh;
1006 path[level].bp_op = nilfs_btree_carry_right; 1050 path[level].bp_op = nilfs_btree_carry_right;
1007 stats->bs_nblocks++; 1051 stats->bs_nblocks++;
1008 goto out; 1052 goto out;
1009 } else 1053 } else {
1010 brelse(bh); 1054 brelse(bh);
1055 }
1011 } 1056 }
1012 1057
1013 /* split */ 1058 /* split */
1014 path[level].bp_newreq.bpr_ptr = 1059 path[level].bp_newreq.bpr_ptr =
1015 path[level - 1].bp_newreq.bpr_ptr + 1; 1060 path[level - 1].bp_newreq.bpr_ptr + 1;
1016 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1061 ret = nilfs_bmap_prepare_alloc_ptr(btree,
1017 &path[level].bp_newreq, dat); 1062 &path[level].bp_newreq, dat);
1018 if (ret < 0) 1063 if (ret < 0)
1019 goto err_out_child_node; 1064 goto err_out_child_node;
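The loop above ascends from the leaf deciding, level by level, the cheapest way to make room: insert in place when the node has a free slot, otherwise shift entries into a left or right sibling with spare capacity, otherwise split. A compact model of the decision ladder (illustrative names; the real code additionally reserves a new block pointer before committing to a split):

enum insert_op { DO_INSERT, CARRY_LEFT, CARRY_RIGHT, SPLIT };

/* ncblk: child capacity of a full node block; a sibling count of -1
 * means that sibling does not exist. */
static enum insert_op pick_insert_op(int nchildren, int ncblk,
				     int left_nchildren,
				     int right_nchildren)
{
	if (nchildren < ncblk)
		return DO_INSERT;	/* room in this node */
	if (left_nchildren >= 0 && left_nchildren < ncblk)
		return CARRY_LEFT;	/* rebalance into the left sibling */
	if (right_nchildren >= 0 && right_nchildren < ncblk)
		return CARRY_RIGHT;	/* rebalance into the right sibling */
	return SPLIT;			/* both full: allocate a sibling */
}

Escalation stops at the first level that can absorb the change; everything above it stays untouched.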
@@ -1025,9 +1070,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1025 1070
1026 stats->bs_nblocks++; 1071 stats->bs_nblocks++;
1027 1072
1028 nilfs_btree_node_init(btree, 1073 sib = (struct nilfs_btree_node *)bh->b_data;
1029 (struct nilfs_btree_node *)bh->b_data, 1074 nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
1030 0, level, 0, NULL, NULL);
1031 path[level].bp_sib_bh = bh; 1075 path[level].bp_sib_bh = bh;
1032 path[level].bp_op = nilfs_btree_split; 1076 path[level].bp_op = nilfs_btree_split;
1033 } 1077 }
@@ -1035,7 +1079,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1035 /* root */ 1079 /* root */
1036 node = nilfs_btree_get_root(btree); 1080 node = nilfs_btree_get_root(btree);
1037 if (nilfs_btree_node_get_nchildren(node) < 1081 if (nilfs_btree_node_get_nchildren(node) <
1038 nilfs_btree_node_nchildren_max(node, btree)) { 1082 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1039 path[level].bp_op = nilfs_btree_do_insert; 1083 path[level].bp_op = nilfs_btree_do_insert;
1040 stats->bs_nblocks++; 1084 stats->bs_nblocks++;
1041 goto out; 1085 goto out;
@@ -1043,8 +1087,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1043 1087
1044 /* grow */ 1088 /* grow */
1045 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; 1089 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
1046 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1090 ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
1047 &path[level].bp_newreq, dat);
1048 if (ret < 0) 1091 if (ret < 0)
1049 goto err_out_child_node; 1092 goto err_out_child_node;
1050 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, 1093 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1052,8 +1095,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1052 if (ret < 0) 1095 if (ret < 0)
1053 goto err_out_curr_node; 1096 goto err_out_curr_node;
1054 1097
1055 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 1098 nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
1056 0, level, 0, NULL, NULL); 1099 0, level, 0, ncblk, NULL, NULL);
1057 path[level].bp_sib_bh = bh; 1100 path[level].bp_sib_bh = bh;
1058 path[level].bp_op = nilfs_btree_grow; 1101 path[level].bp_op = nilfs_btree_grow;
1059 1102
@@ -1070,25 +1113,22 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1070 1113
1071 /* error */ 1114 /* error */
1072 err_out_curr_node: 1115 err_out_curr_node:
1073 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, 1116 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1074 dat);
1075 err_out_child_node: 1117 err_out_child_node:
1076 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { 1118 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1077 nilfs_btnode_delete(path[level].bp_sib_bh); 1119 nilfs_btnode_delete(path[level].bp_sib_bh);
1078 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, 1120 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1079 &path[level].bp_newreq, dat);
1080 1121
1081 } 1122 }
1082 1123
1083 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, 1124 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1084 dat);
1085 err_out_data: 1125 err_out_data:
1086 *levelp = level; 1126 *levelp = level;
1087 stats->bs_nblocks = 0; 1127 stats->bs_nblocks = 0;
1088 return ret; 1128 return ret;
1089} 1129}
1090 1130
1091static void nilfs_btree_commit_insert(struct nilfs_btree *btree, 1131static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
1092 struct nilfs_btree_path *path, 1132 struct nilfs_btree_path *path,
1093 int maxlevel, __u64 key, __u64 ptr) 1133 int maxlevel, __u64 key, __u64 ptr)
1094{ 1134{
@@ -1097,36 +1137,33 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1097 1137
1098 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1138 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1099 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; 1139 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1100 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { 1140 if (NILFS_BMAP_USE_VBN(btree)) {
1101 nilfs_btree_set_target_v(btree, key, ptr); 1141 nilfs_bmap_set_target_v(btree, key, ptr);
1102 dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1142 dat = nilfs_bmap_get_dat(btree);
1103 } 1143 }
1104 1144
1105 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1145 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1106 nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, 1146 nilfs_bmap_commit_alloc_ptr(btree,
1107 &path[level - 1].bp_newreq, dat); 1147 &path[level - 1].bp_newreq, dat);
1108 path[level].bp_op(btree, path, level, &key, &ptr); 1148 path[level].bp_op(btree, path, level, &key, &ptr);
1109 } 1149 }
1110 1150
1111 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 1151 if (!nilfs_bmap_dirty(btree))
1112 nilfs_bmap_set_dirty(&btree->bt_bmap); 1152 nilfs_bmap_set_dirty(btree);
1113} 1153}
1114 1154
1115static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 1155static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
1116{ 1156{
1117 struct nilfs_btree *btree;
1118 struct nilfs_btree_path *path; 1157 struct nilfs_btree_path *path;
1119 struct nilfs_bmap_stats stats; 1158 struct nilfs_bmap_stats stats;
1120 int level, ret; 1159 int level, ret;
1121 1160
1122 btree = (struct nilfs_btree *)bmap;
1123 path = nilfs_btree_alloc_path(); 1161 path = nilfs_btree_alloc_path();
1124 if (path == NULL) 1162 if (path == NULL)
1125 return -ENOMEM; 1163 return -ENOMEM;
1126 nilfs_btree_init_path(path);
1127 1164
1128 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1165 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1129 NILFS_BTREE_LEVEL_NODE_MIN); 1166 NILFS_BTREE_LEVEL_NODE_MIN, 0);
1130 if (ret != -ENOENT) { 1167 if (ret != -ENOENT) {
1131 if (ret == 0) 1168 if (ret == 0)
1132 ret = -EEXIST; 1169 ret = -EEXIST;
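nilfs_btree_commit_insert() replays the plan recorded during preparation: per level it commits the reserved pointer, then invokes the operation stashed in path[level].bp_op. A stripped-down model of that dispatch:

#include <stdint.h>

struct model_path;
typedef void (*model_op)(struct model_path *path, int level,
			 uint64_t *keyp, uint64_t *ptrp);

struct model_path {
	model_op op;	/* chosen during the prepare phase */
	/* per-level buffers and pointer reservations elided */
};

/* Commit phase: no decisions remain, just run the plan bottom-up.
 * Each op may rewrite *keyp/*ptrp as input for the level above. */
static void model_commit(struct model_path *path, int maxlevel,
			 uint64_t key, uint64_t ptr)
{
	int level;

	for (level = 1; level <= maxlevel; level++)
		path[level].op(path, level, &key, &ptr);
}

Splitting prepare from commit keeps failure handling in one place: by the time any bp_op runs, every allocation has already succeeded, so the commit functions are all void and cannot fail halfway.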
@@ -1137,24 +1174,25 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1137 if (ret < 0) 1174 if (ret < 0)
1138 goto out; 1175 goto out;
1139 nilfs_btree_commit_insert(btree, path, level, key, ptr); 1176 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1140 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1177 nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
1141 1178
1142 out: 1179 out:
1143 nilfs_btree_release_path(path);
1144 nilfs_btree_free_path(path); 1180 nilfs_btree_free_path(path);
1145 return ret; 1181 return ret;
1146} 1182}
1147 1183
1148static void nilfs_btree_do_delete(struct nilfs_btree *btree, 1184static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
1149 struct nilfs_btree_path *path, 1185 struct nilfs_btree_path *path,
1150 int level, __u64 *keyp, __u64 *ptrp) 1186 int level, __u64 *keyp, __u64 *ptrp)
1151{ 1187{
1152 struct nilfs_btree_node *node; 1188 struct nilfs_btree_node *node;
1189 int ncblk;
1153 1190
1154 if (level < nilfs_btree_height(btree) - 1) { 1191 if (level < nilfs_btree_height(btree) - 1) {
1155 node = nilfs_btree_get_nonroot_node(path, level); 1192 node = nilfs_btree_get_nonroot_node(path, level);
1156 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1193 ncblk = nilfs_btree_nchildren_per_block(btree);
1157 path[level].bp_index); 1194 nilfs_btree_node_delete(node, path[level].bp_index,
1195 keyp, ptrp, ncblk);
1158 if (!buffer_dirty(path[level].bp_bh)) 1196 if (!buffer_dirty(path[level].bp_bh))
1159 nilfs_btnode_mark_dirty(path[level].bp_bh); 1197 nilfs_btnode_mark_dirty(path[level].bp_bh);
1160 if (path[level].bp_index == 0) 1198 if (path[level].bp_index == 0)
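nilfs_btree_insert() shows the three-act structure that repeats throughout this file: look up the key expecting -ENOENT, prepare, then commit and account the new blocks. The skeleton, as model code (the prototypes stand in for the real helpers):

#include <errno.h>
#include <stdint.h>

struct model_tree;
struct model_path;
struct model_path *model_alloc_path(void);
void model_free_path(struct model_path *path);
int model_do_lookup(struct model_tree *tree, struct model_path *path,
		    uint64_t key);
int model_prepare_insert(struct model_tree *tree, struct model_path *path,
			 int *levelp, uint64_t key, uint64_t ptr);
void model_commit_insert(struct model_tree *tree, struct model_path *path,
			 int maxlevel, uint64_t key, uint64_t ptr);

static int model_insert(struct model_tree *tree, uint64_t key, uint64_t ptr)
{
	struct model_path *path = model_alloc_path();
	int level, ret;

	if (!path)
		return -ENOMEM;
	ret = model_do_lookup(tree, path, key);
	if (ret != -ENOENT) {
		if (ret == 0)
			ret = -EEXIST;	/* lookup success means duplicate */
		goto out;
	}
	ret = model_prepare_insert(tree, path, &level, key, ptr);
	if (ret == 0)
		model_commit_insert(tree, path, level, key, ptr);
out:
	model_free_path(path);
	return ret;
}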
@@ -1162,17 +1200,18 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1162 nilfs_btree_node_get_key(node, 0)); 1200 nilfs_btree_node_get_key(node, 0));
1163 } else { 1201 } else {
1164 node = nilfs_btree_get_root(btree); 1202 node = nilfs_btree_get_root(btree);
1165 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1203 nilfs_btree_node_delete(node, path[level].bp_index,
1166 path[level].bp_index); 1204 keyp, ptrp,
1205 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1167 } 1206 }
1168} 1207}
1169 1208
1170static void nilfs_btree_borrow_left(struct nilfs_btree *btree, 1209static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
1171 struct nilfs_btree_path *path, 1210 struct nilfs_btree_path *path,
1172 int level, __u64 *keyp, __u64 *ptrp) 1211 int level, __u64 *keyp, __u64 *ptrp)
1173{ 1212{
1174 struct nilfs_btree_node *node, *left; 1213 struct nilfs_btree_node *node, *left;
1175 int nchildren, lnchildren, n; 1214 int nchildren, lnchildren, n, ncblk;
1176 1215
1177 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1216 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1178 1217
@@ -1180,10 +1219,11 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1180 left = nilfs_btree_get_sib_node(path, level); 1219 left = nilfs_btree_get_sib_node(path, level);
1181 nchildren = nilfs_btree_node_get_nchildren(node); 1220 nchildren = nilfs_btree_node_get_nchildren(node);
1182 lnchildren = nilfs_btree_node_get_nchildren(left); 1221 lnchildren = nilfs_btree_node_get_nchildren(left);
1222 ncblk = nilfs_btree_nchildren_per_block(btree);
1183 1223
1184 n = (nchildren + lnchildren) / 2 - nchildren; 1224 n = (nchildren + lnchildren) / 2 - nchildren;
1185 1225
1186 nilfs_btree_node_move_right(btree, left, node, n); 1226 nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
1187 1227
1188 if (!buffer_dirty(path[level].bp_bh)) 1228 if (!buffer_dirty(path[level].bp_bh))
1189 nilfs_btnode_mark_dirty(path[level].bp_bh); 1229 nilfs_btnode_mark_dirty(path[level].bp_bh);
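The borrow operations equalize occupancy rather than moving a single entry: n = (nchildren + lnchildren) / 2 - nchildren shifts just enough entries that both nodes end up holding about half of the combined total. A worked check of the formula:

#include <assert.h>

/* Number of entries to pull from a sibling to balance two nodes. */
static int borrow_count(int nchildren, int sib_nchildren)
{
	return (nchildren + sib_nchildren) / 2 - nchildren;
}

int main(void)
{
	/* Node fell to 5 entries, sibling holds 13: move 4, ending 9/9. */
	assert(borrow_count(5, 13) == 4);
	/* Odd totals round in the sibling's favor: 5 and 12 end 8/9. */
	assert(borrow_count(5, 12) == 3);
	return 0;
}

Taking several entries at once means the node will not underflow again on the very next removal, so runs of deletions in one spot do not trigger a rebalance every time.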
@@ -1198,12 +1238,12 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1198 path[level].bp_index += n; 1238 path[level].bp_index += n;
1199} 1239}
1200 1240
1201static void nilfs_btree_borrow_right(struct nilfs_btree *btree, 1241static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
1202 struct nilfs_btree_path *path, 1242 struct nilfs_btree_path *path,
1203 int level, __u64 *keyp, __u64 *ptrp) 1243 int level, __u64 *keyp, __u64 *ptrp)
1204{ 1244{
1205 struct nilfs_btree_node *node, *right; 1245 struct nilfs_btree_node *node, *right;
1206 int nchildren, rnchildren, n; 1246 int nchildren, rnchildren, n, ncblk;
1207 1247
1208 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1248 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1209 1249
@@ -1211,10 +1251,11 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1211 right = nilfs_btree_get_sib_node(path, level); 1251 right = nilfs_btree_get_sib_node(path, level);
1212 nchildren = nilfs_btree_node_get_nchildren(node); 1252 nchildren = nilfs_btree_node_get_nchildren(node);
1213 rnchildren = nilfs_btree_node_get_nchildren(right); 1253 rnchildren = nilfs_btree_node_get_nchildren(right);
1254 ncblk = nilfs_btree_nchildren_per_block(btree);
1214 1255
1215 n = (nchildren + rnchildren) / 2 - nchildren; 1256 n = (nchildren + rnchildren) / 2 - nchildren;
1216 1257
1217 nilfs_btree_node_move_left(btree, node, right, n); 1258 nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
1218 1259
1219 if (!buffer_dirty(path[level].bp_bh)) 1260 if (!buffer_dirty(path[level].bp_bh))
1220 nilfs_btnode_mark_dirty(path[level].bp_bh); 1261 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1230,21 +1271,22 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1230 path[level].bp_sib_bh = NULL; 1271 path[level].bp_sib_bh = NULL;
1231} 1272}
1232 1273
1233static void nilfs_btree_concat_left(struct nilfs_btree *btree, 1274static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
1234 struct nilfs_btree_path *path, 1275 struct nilfs_btree_path *path,
1235 int level, __u64 *keyp, __u64 *ptrp) 1276 int level, __u64 *keyp, __u64 *ptrp)
1236{ 1277{
1237 struct nilfs_btree_node *node, *left; 1278 struct nilfs_btree_node *node, *left;
1238 int n; 1279 int n, ncblk;
1239 1280
1240 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1281 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1241 1282
1242 node = nilfs_btree_get_nonroot_node(path, level); 1283 node = nilfs_btree_get_nonroot_node(path, level);
1243 left = nilfs_btree_get_sib_node(path, level); 1284 left = nilfs_btree_get_sib_node(path, level);
1285 ncblk = nilfs_btree_nchildren_per_block(btree);
1244 1286
1245 n = nilfs_btree_node_get_nchildren(node); 1287 n = nilfs_btree_node_get_nchildren(node);
1246 1288
1247 nilfs_btree_node_move_left(btree, left, node, n); 1289 nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
1248 1290
1249 if (!buffer_dirty(path[level].bp_sib_bh)) 1291 if (!buffer_dirty(path[level].bp_sib_bh))
1250 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1292 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -1255,21 +1297,22 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1255 path[level].bp_index += nilfs_btree_node_get_nchildren(left); 1297 path[level].bp_index += nilfs_btree_node_get_nchildren(left);
1256} 1298}
1257 1299
1258static void nilfs_btree_concat_right(struct nilfs_btree *btree, 1300static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
1259 struct nilfs_btree_path *path, 1301 struct nilfs_btree_path *path,
1260 int level, __u64 *keyp, __u64 *ptrp) 1302 int level, __u64 *keyp, __u64 *ptrp)
1261{ 1303{
1262 struct nilfs_btree_node *node, *right; 1304 struct nilfs_btree_node *node, *right;
1263 int n; 1305 int n, ncblk;
1264 1306
1265 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1307 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1266 1308
1267 node = nilfs_btree_get_nonroot_node(path, level); 1309 node = nilfs_btree_get_nonroot_node(path, level);
1268 right = nilfs_btree_get_sib_node(path, level); 1310 right = nilfs_btree_get_sib_node(path, level);
1311 ncblk = nilfs_btree_nchildren_per_block(btree);
1269 1312
1270 n = nilfs_btree_node_get_nchildren(right); 1313 n = nilfs_btree_node_get_nchildren(right);
1271 1314
1272 nilfs_btree_node_move_left(btree, node, right, n); 1315 nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
1273 1316
1274 if (!buffer_dirty(path[level].bp_bh)) 1317 if (!buffer_dirty(path[level].bp_bh))
1275 nilfs_btnode_mark_dirty(path[level].bp_bh); 1318 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1279,29 +1322,32 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1279 path[level + 1].bp_index++; 1322 path[level + 1].bp_index++;
1280} 1323}
1281 1324
1282static void nilfs_btree_shrink(struct nilfs_btree *btree, 1325static void nilfs_btree_shrink(struct nilfs_bmap *btree,
1283 struct nilfs_btree_path *path, 1326 struct nilfs_btree_path *path,
1284 int level, __u64 *keyp, __u64 *ptrp) 1327 int level, __u64 *keyp, __u64 *ptrp)
1285{ 1328{
1286 struct nilfs_btree_node *root, *child; 1329 struct nilfs_btree_node *root, *child;
1287 int n; 1330 int n, ncblk;
1288 1331
1289 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1332 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1290 1333
1291 root = nilfs_btree_get_root(btree); 1334 root = nilfs_btree_get_root(btree);
1292 child = nilfs_btree_get_nonroot_node(path, level); 1335 child = nilfs_btree_get_nonroot_node(path, level);
1336 ncblk = nilfs_btree_nchildren_per_block(btree);
1293 1337
1294 nilfs_btree_node_delete(btree, root, NULL, NULL, 0); 1338 nilfs_btree_node_delete(root, 0, NULL, NULL,
1339 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1295 nilfs_btree_node_set_level(root, level); 1340 nilfs_btree_node_set_level(root, level);
1296 n = nilfs_btree_node_get_nchildren(child); 1341 n = nilfs_btree_node_get_nchildren(child);
1297 nilfs_btree_node_move_left(btree, root, child, n); 1342 nilfs_btree_node_move_left(root, child, n,
1343 NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
1298 1344
1299 nilfs_btnode_delete(path[level].bp_bh); 1345 nilfs_btnode_delete(path[level].bp_bh);
1300 path[level].bp_bh = NULL; 1346 path[level].bp_bh = NULL;
1301} 1347}
1302 1348
1303 1349
1304static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, 1350static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
1305 struct nilfs_btree_path *path, 1351 struct nilfs_btree_path *path,
1306 int *levelp, 1352 int *levelp,
1307 struct nilfs_bmap_stats *stats, 1353 struct nilfs_bmap_stats *stats,
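nilfs_btree_shrink() is the inverse of nilfs_btree_grow(): when the root is down to a single child whose entries all fit in the root itself, the child's contents are pulled up, the root takes over the child's level, and the child's block is deleted, reducing the tree height by one. In model form:

#include <string.h>
#include <stdint.h>

#define MODEL_ROOT_MAX 3	/* tiny capacity of an inode-embedded root */

struct model_node {
	int level;
	int nchildren;
	uint64_t keys[MODEL_ROOT_MAX];
	uint64_t ptrs[MODEL_ROOT_MAX];
};

/* Collapse one level: the root's lone child pointer is replaced by
 * the child's own entries; the caller then frees the child's block. */
static void model_shrink(struct model_node *root,
			 const struct model_node *child)
{
	root->level = child->level;
	memcpy(root->keys, child->keys, child->nchildren * sizeof(uint64_t));
	memcpy(root->ptrs, child->ptrs, child->nchildren * sizeof(uint64_t));
	root->nchildren = child->nchildren;
}

MODEL_ROOT_MAX = 3 is only an illustration of why the root and block capacities differ; the actual NILFS_BTREE_ROOT_NCHILDREN_MAX derives from the bmap area inside the on-disk inode.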
@@ -1310,42 +1356,43 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1310 struct buffer_head *bh; 1356 struct buffer_head *bh;
1311 struct nilfs_btree_node *node, *parent, *sib; 1357 struct nilfs_btree_node *node, *parent, *sib;
1312 __u64 sibptr; 1358 __u64 sibptr;
1313 int pindex, level, ret; 1359 int pindex, level, ncmin, ncmax, ncblk, ret;
1314 1360
1315 ret = 0; 1361 ret = 0;
1316 stats->bs_nblocks = 0; 1362 stats->bs_nblocks = 0;
1363 ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
1364 ncblk = nilfs_btree_nchildren_per_block(btree);
1365
1317 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1366 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1318 level < nilfs_btree_height(btree) - 1; 1367 level < nilfs_btree_height(btree) - 1;
1319 level++) { 1368 level++) {
1320 node = nilfs_btree_get_nonroot_node(path, level); 1369 node = nilfs_btree_get_nonroot_node(path, level);
1321 path[level].bp_oldreq.bpr_ptr = 1370 path[level].bp_oldreq.bpr_ptr =
1322 nilfs_btree_node_get_ptr(btree, node, 1371 nilfs_btree_node_get_ptr(node, path[level].bp_index,
1323 path[level].bp_index); 1372 ncblk);
1324 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1373 ret = nilfs_bmap_prepare_end_ptr(btree,
1325 &path[level].bp_oldreq, dat); 1374 &path[level].bp_oldreq, dat);
1326 if (ret < 0) 1375 if (ret < 0)
1327 goto err_out_child_node; 1376 goto err_out_child_node;
1328 1377
1329 if (nilfs_btree_node_get_nchildren(node) > 1378 if (nilfs_btree_node_get_nchildren(node) > ncmin) {
1330 nilfs_btree_node_nchildren_min(node, btree)) {
1331 path[level].bp_op = nilfs_btree_do_delete; 1379 path[level].bp_op = nilfs_btree_do_delete;
1332 stats->bs_nblocks++; 1380 stats->bs_nblocks++;
1333 goto out; 1381 goto out;
1334 } 1382 }
1335 1383
1336 parent = nilfs_btree_get_node(btree, path, level + 1); 1384 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1337 pindex = path[level + 1].bp_index; 1385 pindex = path[level + 1].bp_index;
1338 1386
1339 if (pindex > 0) { 1387 if (pindex > 0) {
1340 /* left sibling */ 1388 /* left sibling */
1341 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1389 sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
1342 pindex - 1); 1390 ncmax);
1343 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1391 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1344 if (ret < 0) 1392 if (ret < 0)
1345 goto err_out_curr_node; 1393 goto err_out_curr_node;
1346 sib = (struct nilfs_btree_node *)bh->b_data; 1394 sib = (struct nilfs_btree_node *)bh->b_data;
1347 if (nilfs_btree_node_get_nchildren(sib) > 1395 if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
1348 nilfs_btree_node_nchildren_min(sib, btree)) {
1349 path[level].bp_sib_bh = bh; 1396 path[level].bp_sib_bh = bh;
1350 path[level].bp_op = nilfs_btree_borrow_left; 1397 path[level].bp_op = nilfs_btree_borrow_left;
1351 stats->bs_nblocks++; 1398 stats->bs_nblocks++;
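Deletion prepares a mirrored ladder with a lower bound ncmin, computed above via NILFS_BTREE_NODE_NCHILDREN_MIN() from the node size: delete in place while the node stays above the minimum, otherwise borrow from a sibling that has entries to spare, otherwise concatenate with it. A compact model (the real code, as the hunks show, prefers the left sibling and only considers the right one when no left neighbor exists):

enum delete_op { DO_DELETE, BORROW_LEFT, BORROW_RIGHT,
		 CONCAT_LEFT, CONCAT_RIGHT };

/* ncmin: minimum entries a non-root node must keep, roughly half of
 * its capacity; a sibling count of -1 marks a missing sibling. */
static enum delete_op pick_delete_op(int nchildren, int ncmin,
				     int left_nchildren,
				     int right_nchildren)
{
	if (nchildren > ncmin)
		return DO_DELETE;	/* removal keeps the node legal */
	if (left_nchildren >= 0)
		return left_nchildren > ncmin ? BORROW_LEFT : CONCAT_LEFT;
	if (right_nchildren >= 0)
		return right_nchildren > ncmin ? BORROW_RIGHT : CONCAT_RIGHT;
	return DO_DELETE;	/* no siblings: the root-child case */
}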
@@ -1359,14 +1406,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1359 } else if (pindex < 1406 } else if (pindex <
1360 nilfs_btree_node_get_nchildren(parent) - 1) { 1407 nilfs_btree_node_get_nchildren(parent) - 1) {
1361 /* right sibling */ 1408 /* right sibling */
1362 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1409 sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
1363 pindex + 1); 1410 ncmax);
1364 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1411 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1365 if (ret < 0) 1412 if (ret < 0)
1366 goto err_out_curr_node; 1413 goto err_out_curr_node;
1367 sib = (struct nilfs_btree_node *)bh->b_data; 1414 sib = (struct nilfs_btree_node *)bh->b_data;
1368 if (nilfs_btree_node_get_nchildren(sib) > 1415 if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
1369 nilfs_btree_node_nchildren_min(sib, btree)) {
1370 path[level].bp_sib_bh = bh; 1416 path[level].bp_sib_bh = bh;
1371 path[level].bp_op = nilfs_btree_borrow_right; 1417 path[level].bp_op = nilfs_btree_borrow_right;
1372 stats->bs_nblocks++; 1418 stats->bs_nblocks++;
@@ -1397,10 +1443,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1397 1443
1398 node = nilfs_btree_get_root(btree); 1444 node = nilfs_btree_get_root(btree);
1399 path[level].bp_oldreq.bpr_ptr = 1445 path[level].bp_oldreq.bpr_ptr =
1400 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); 1446 nilfs_btree_node_get_ptr(node, path[level].bp_index,
1447 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1401 1448
1402 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1449 ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
1403 &path[level].bp_oldreq, dat);
1404 if (ret < 0) 1450 if (ret < 0)
1405 goto err_out_child_node; 1451 goto err_out_child_node;
1406 1452
@@ -1415,99 +1461,87 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1415 1461
1416 /* error */ 1462 /* error */
1417 err_out_curr_node: 1463 err_out_curr_node:
1418 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); 1464 nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
1419 err_out_child_node: 1465 err_out_child_node:
1420 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { 1466 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1421 brelse(path[level].bp_sib_bh); 1467 brelse(path[level].bp_sib_bh);
1422 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, 1468 nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
1423 &path[level].bp_oldreq, dat);
1424 } 1469 }
1425 *levelp = level; 1470 *levelp = level;
1426 stats->bs_nblocks = 0; 1471 stats->bs_nblocks = 0;
1427 return ret; 1472 return ret;
1428} 1473}
1429 1474
1430static void nilfs_btree_commit_delete(struct nilfs_btree *btree, 1475static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
1431 struct nilfs_btree_path *path, 1476 struct nilfs_btree_path *path,
1432 int maxlevel, struct inode *dat) 1477 int maxlevel, struct inode *dat)
1433{ 1478{
1434 int level; 1479 int level;
1435 1480
1436 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1481 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1437 nilfs_bmap_commit_end_ptr(&btree->bt_bmap, 1482 nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
1438 &path[level].bp_oldreq, dat);
1439 path[level].bp_op(btree, path, level, NULL, NULL); 1483 path[level].bp_op(btree, path, level, NULL, NULL);
1440 } 1484 }
1441 1485
1442 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 1486 if (!nilfs_bmap_dirty(btree))
1443 nilfs_bmap_set_dirty(&btree->bt_bmap); 1487 nilfs_bmap_set_dirty(btree);
1444} 1488}
1445 1489
1446static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) 1490static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
1447 1491
1448{ 1492{
1449 struct nilfs_btree *btree;
1450 struct nilfs_btree_path *path; 1493 struct nilfs_btree_path *path;
1451 struct nilfs_bmap_stats stats; 1494 struct nilfs_bmap_stats stats;
1452 struct inode *dat; 1495 struct inode *dat;
1453 int level, ret; 1496 int level, ret;
1454 1497
1455 btree = (struct nilfs_btree *)bmap;
1456 path = nilfs_btree_alloc_path(); 1498 path = nilfs_btree_alloc_path();
1457 if (path == NULL) 1499 if (path == NULL)
1458 return -ENOMEM; 1500 return -ENOMEM;
1459 nilfs_btree_init_path(path); 1501
1460 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1502 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1461 NILFS_BTREE_LEVEL_NODE_MIN); 1503 NILFS_BTREE_LEVEL_NODE_MIN, 0);
1462 if (ret < 0) 1504 if (ret < 0)
1463 goto out; 1505 goto out;
1464 1506
1465 1507
1466 dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? 1508 dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
1467 nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
1468 1509
1469 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); 1510 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
1470 if (ret < 0) 1511 if (ret < 0)
1471 goto out; 1512 goto out;
1472 nilfs_btree_commit_delete(btree, path, level, dat); 1513 nilfs_btree_commit_delete(btree, path, level, dat);
1473 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1514 nilfs_bmap_sub_blocks(btree, stats.bs_nblocks);
1474 1515
1475out: 1516out:
1476 nilfs_btree_release_path(path);
1477 nilfs_btree_free_path(path); 1517 nilfs_btree_free_path(path);
1478 return ret; 1518 return ret;
1479} 1519}
1480 1520
1481static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 1521static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
1482{ 1522{
1483 struct nilfs_btree *btree;
1484 struct nilfs_btree_path *path; 1523 struct nilfs_btree_path *path;
1485 int ret; 1524 int ret;
1486 1525
1487 btree = (struct nilfs_btree *)bmap;
1488 path = nilfs_btree_alloc_path(); 1526 path = nilfs_btree_alloc_path();
1489 if (path == NULL) 1527 if (path == NULL)
1490 return -ENOMEM; 1528 return -ENOMEM;
1491 nilfs_btree_init_path(path);
1492 1529
1493 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1530 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1494 1531
1495 nilfs_btree_release_path(path);
1496 nilfs_btree_free_path(path); 1532 nilfs_btree_free_path(path);
1497 1533
1498 return ret; 1534 return ret;
1499} 1535}
1500 1536
1501static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) 1537static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
1502{ 1538{
1503 struct buffer_head *bh; 1539 struct buffer_head *bh;
1504 struct nilfs_btree *btree;
1505 struct nilfs_btree_node *root, *node; 1540 struct nilfs_btree_node *root, *node;
1506 __u64 maxkey, nextmaxkey; 1541 __u64 maxkey, nextmaxkey;
1507 __u64 ptr; 1542 __u64 ptr;
1508 int nchildren, ret; 1543 int nchildren, ret;
1509 1544
1510 btree = (struct nilfs_btree *)bmap;
1511 root = nilfs_btree_get_root(btree); 1545 root = nilfs_btree_get_root(btree);
1512 switch (nilfs_btree_height(btree)) { 1546 switch (nilfs_btree_height(btree)) {
1513 case 2: 1547 case 2:
@@ -1518,7 +1552,8 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1518 nchildren = nilfs_btree_node_get_nchildren(root); 1552 nchildren = nilfs_btree_node_get_nchildren(root);
1519 if (nchildren > 1) 1553 if (nchildren > 1)
1520 return 0; 1554 return 0;
1521 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1555 ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
1556 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1522 ret = nilfs_btree_get_block(btree, ptr, &bh); 1557 ret = nilfs_btree_get_block(btree, ptr, &bh);
1523 if (ret < 0) 1558 if (ret < 0)
1524 return ret; 1559 return ret;
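One pattern change runs through every entry point in this hunk: the nilfs_btree_init_path() and nilfs_btree_release_path() calls disappear from callers, which implies that initialization and teardown were folded into nilfs_btree_alloc_path() and nilfs_btree_free_path(). A sketch of what the consolidated allocator might look like (illustrative; the real one presumably draws from a kmem cache rather than the heap):

#include <stdlib.h>

#define MODEL_LEVEL_MAX 14	/* illustrative level bound */

struct model_path_level {
	void *bh, *sib_bh;	/* buffer references, NULL when unused */
	int index;
};

/* Allocate and initialize in one step so no caller can forget or
 * misplace the init/release bracket. */
static struct model_path_level *model_alloc_path(void)
{
	/* calloc zeroes every level: NULL buffers, index 0 */
	return calloc(MODEL_LEVEL_MAX, sizeof(struct model_path_level));
}

static void model_free_path(struct model_path_level *path)
{
	/* a real implementation must also drop buffer references that
	 * an error path left behind (the old release step) */
	free(path);
}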
@@ -1538,32 +1573,33 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1538 return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); 1573 return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
1539} 1574}
1540 1575
1541static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, 1576static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
1542 __u64 *keys, __u64 *ptrs, int nitems) 1577 __u64 *keys, __u64 *ptrs, int nitems)
1543{ 1578{
1544 struct buffer_head *bh; 1579 struct buffer_head *bh;
1545 struct nilfs_btree *btree;
1546 struct nilfs_btree_node *node, *root; 1580 struct nilfs_btree_node *node, *root;
1547 __le64 *dkeys; 1581 __le64 *dkeys;
1548 __le64 *dptrs; 1582 __le64 *dptrs;
1549 __u64 ptr; 1583 __u64 ptr;
1550 int nchildren, i, ret; 1584 int nchildren, ncmax, i, ret;
1551 1585
1552 btree = (struct nilfs_btree *)bmap;
1553 root = nilfs_btree_get_root(btree); 1586 root = nilfs_btree_get_root(btree);
1554 switch (nilfs_btree_height(btree)) { 1587 switch (nilfs_btree_height(btree)) {
1555 case 2: 1588 case 2:
1556 bh = NULL; 1589 bh = NULL;
1557 node = root; 1590 node = root;
1591 ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
1558 break; 1592 break;
1559 case 3: 1593 case 3:
1560 nchildren = nilfs_btree_node_get_nchildren(root); 1594 nchildren = nilfs_btree_node_get_nchildren(root);
1561 WARN_ON(nchildren > 1); 1595 WARN_ON(nchildren > 1);
1562 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1596 ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
1597 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1563 ret = nilfs_btree_get_block(btree, ptr, &bh); 1598 ret = nilfs_btree_get_block(btree, ptr, &bh);
1564 if (ret < 0) 1599 if (ret < 0)
1565 return ret; 1600 return ret;
1566 node = (struct nilfs_btree_node *)bh->b_data; 1601 node = (struct nilfs_btree_node *)bh->b_data;
1602 ncmax = nilfs_btree_nchildren_per_block(btree);
1567 break; 1603 break;
1568 default: 1604 default:
1569 node = NULL; 1605 node = NULL;
@@ -1574,10 +1610,10 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1574 if (nchildren < nitems) 1610 if (nchildren < nitems)
1575 nitems = nchildren; 1611 nitems = nchildren;
1576 dkeys = nilfs_btree_node_dkeys(node); 1612 dkeys = nilfs_btree_node_dkeys(node);
1577 dptrs = nilfs_btree_node_dptrs(node, btree); 1613 dptrs = nilfs_btree_node_dptrs(node, ncmax);
1578 for (i = 0; i < nitems; i++) { 1614 for (i = 0; i < nitems; i++) {
1579 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); 1615 keys[i] = le64_to_cpu(dkeys[i]);
1580 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); 1616 ptrs[i] = le64_to_cpu(dptrs[i]);
1581 } 1617 }
1582 1618
1583 if (bh != NULL) 1619 if (bh != NULL)
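In nilfs_btree_gather_data() the bmap-private conversion wrappers give way to plain le64_to_cpu(), making the on-disk byte order explicit at the call site. The layout being walked is two parallel arrays behind a small header: keys first, then pointers starting at the capacity offset, which is why nilfs_btree_node_dptrs() now needs ncmax while dkeys does not. A model of the two accessors:

#include <stdint.h>

struct model_node_hdr {
	uint8_t  flags, level;
	uint16_t nchildren;
	uint32_t pad;
};

/* keys[0..ncmax-1] immediately follow the header; on disk both
 * arrays hold little-endian 64-bit values, hence le64_to_cpu(). */
static uint64_t *model_dkeys(struct model_node_hdr *node)
{
	return (uint64_t *)(node + 1);
}

/* ptrs[0..ncmax-1] follow the full key array, so their base depends
 * on the node's capacity, never on its current fill level. */
static uint64_t *model_dptrs(struct model_node_hdr *node, int ncmax)
{
	return model_dkeys(node) + ncmax;
}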
@@ -1587,14 +1623,13 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1587} 1623}
1588 1624
1589static int 1625static int
1590nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, 1626nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
1591 union nilfs_bmap_ptr_req *dreq, 1627 union nilfs_bmap_ptr_req *dreq,
1592 union nilfs_bmap_ptr_req *nreq, 1628 union nilfs_bmap_ptr_req *nreq,
1593 struct buffer_head **bhp, 1629 struct buffer_head **bhp,
1594 struct nilfs_bmap_stats *stats) 1630 struct nilfs_bmap_stats *stats)
1595{ 1631{
1596 struct buffer_head *bh; 1632 struct buffer_head *bh;
1597 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1598 struct inode *dat = NULL; 1633 struct inode *dat = NULL;
1599 int ret; 1634 int ret;
1600 1635
@@ -1602,12 +1637,12 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1602 1637
1603 /* for data */ 1638 /* for data */
1604 /* cannot find near ptr */ 1639 /* cannot find near ptr */
1605 if (NILFS_BMAP_USE_VBN(bmap)) { 1640 if (NILFS_BMAP_USE_VBN(btree)) {
1606 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); 1641 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
1607 dat = nilfs_bmap_get_dat(bmap); 1642 dat = nilfs_bmap_get_dat(btree);
1608 } 1643 }
1609 1644
1610 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); 1645 ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
1611 if (ret < 0) 1646 if (ret < 0)
1612 return ret; 1647 return ret;
1613 1648
@@ -1615,7 +1650,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1615 stats->bs_nblocks++; 1650 stats->bs_nblocks++;
1616 if (nreq != NULL) { 1651 if (nreq != NULL) {
1617 nreq->bpr_ptr = dreq->bpr_ptr + 1; 1652 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1618 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); 1653 ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
1619 if (ret < 0) 1654 if (ret < 0)
1620 goto err_out_dreq; 1655 goto err_out_dreq;
1621 1656
@@ -1632,16 +1667,16 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1632 1667
1633 /* error */ 1668 /* error */
1634 err_out_nreq: 1669 err_out_nreq:
1635 nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); 1670 nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
1636 err_out_dreq: 1671 err_out_dreq:
1637 nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); 1672 nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
1638 stats->bs_nblocks = 0; 1673 stats->bs_nblocks = 0;
1639 return ret; 1674 return ret;
1640 1675
1641} 1676}
1642 1677
1643static void 1678static void
1644nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, 1679nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
1645 __u64 key, __u64 ptr, 1680 __u64 key, __u64 ptr,
1646 const __u64 *keys, const __u64 *ptrs, 1681 const __u64 *keys, const __u64 *ptrs,
1647 int n, 1682 int n,
@@ -1649,57 +1684,59 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1649 union nilfs_bmap_ptr_req *nreq, 1684 union nilfs_bmap_ptr_req *nreq,
1650 struct buffer_head *bh) 1685 struct buffer_head *bh)
1651{ 1686{
1652 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1653 struct nilfs_btree_node *node; 1687 struct nilfs_btree_node *node;
1654 struct inode *dat; 1688 struct inode *dat;
1655 __u64 tmpptr; 1689 __u64 tmpptr;
1690 int ncblk;
1656 1691
1657 /* free resources */ 1692 /* free resources */
1658 if (bmap->b_ops->bop_clear != NULL) 1693 if (btree->b_ops->bop_clear != NULL)
1659 bmap->b_ops->bop_clear(bmap); 1694 btree->b_ops->bop_clear(btree);
1660 1695
1661 /* ptr must be a pointer to a buffer head. */ 1696 /* ptr must be a pointer to a buffer head. */
1662 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1697 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1663 1698
1664 /* convert and insert */ 1699 /* convert and insert */
1665 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; 1700 dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
1666 nilfs_btree_init(bmap); 1701 nilfs_btree_init(btree);
1667 if (nreq != NULL) { 1702 if (nreq != NULL) {
1668 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); 1703 nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
1669 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); 1704 nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
1670 1705
1671 /* create child node at level 1 */ 1706 /* create child node at level 1 */
1672 node = (struct nilfs_btree_node *)bh->b_data; 1707 node = (struct nilfs_btree_node *)bh->b_data;
1673 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); 1708 ncblk = nilfs_btree_nchildren_per_block(btree);
1674 nilfs_btree_node_insert(btree, node, 1709 nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
1675 key, dreq->bpr_ptr, n); 1710 nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
1676 if (!buffer_dirty(bh)) 1711 if (!buffer_dirty(bh))
1677 nilfs_btnode_mark_dirty(bh); 1712 nilfs_btnode_mark_dirty(bh);
1678 if (!nilfs_bmap_dirty(bmap)) 1713 if (!nilfs_bmap_dirty(btree))
1679 nilfs_bmap_set_dirty(bmap); 1714 nilfs_bmap_set_dirty(btree);
1680 1715
1681 brelse(bh); 1716 brelse(bh);
1682 1717
1683 /* create root node at level 2 */ 1718 /* create root node at level 2 */
1684 node = nilfs_btree_get_root(btree); 1719 node = nilfs_btree_get_root(btree);
1685 tmpptr = nreq->bpr_ptr; 1720 tmpptr = nreq->bpr_ptr;
1686 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1721 nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
1687 2, 1, &keys[0], &tmpptr); 1722 NILFS_BTREE_ROOT_NCHILDREN_MAX,
1723 &keys[0], &tmpptr);
1688 } else { 1724 } else {
1689 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); 1725 nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
1690 1726
1691 /* create root node at level 1 */ 1727 /* create root node at level 1 */
1692 node = nilfs_btree_get_root(btree); 1728 node = nilfs_btree_get_root(btree);
1693 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1729 nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
1694 1, n, keys, ptrs); 1730 NILFS_BTREE_ROOT_NCHILDREN_MAX,
1695 nilfs_btree_node_insert(btree, node, 1731 keys, ptrs);
1696 key, dreq->bpr_ptr, n); 1732 nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
1697 if (!nilfs_bmap_dirty(bmap)) 1733 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1698 nilfs_bmap_set_dirty(bmap); 1734 if (!nilfs_bmap_dirty(btree))
1735 nilfs_bmap_set_dirty(btree);
1699 } 1736 }
1700 1737
1701 if (NILFS_BMAP_USE_VBN(bmap)) 1738 if (NILFS_BMAP_USE_VBN(btree))
1702 nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr); 1739 nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
1703} 1740}
1704 1741
1705/** 1742/**
@@ -1711,7 +1748,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1711 * @ptrs: 1748 * @ptrs:
1712 * @n: 1749 * @n:
1713 */ 1750 */
1714int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, 1751int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1715 __u64 key, __u64 ptr, 1752 __u64 key, __u64 ptr,
1716 const __u64 *keys, const __u64 *ptrs, int n) 1753 const __u64 *keys, const __u64 *ptrs, int n)
1717{ 1754{
@@ -1724,7 +1761,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1724 di = &dreq; 1761 di = &dreq;
1725 ni = NULL; 1762 ni = NULL;
1726 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( 1763 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1727 1 << bmap->b_inode->i_blkbits)) { 1764 1 << btree->b_inode->i_blkbits)) {
1728 di = &dreq; 1765 di = &dreq;
1729 ni = &nreq; 1766 ni = &nreq;
1730 } else { 1767 } else {
@@ -1733,17 +1770,17 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1733 BUG(); 1770 BUG();
1734 } 1771 }
1735 1772
1736 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, 1773 ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
1737 &stats); 1774 &stats);
1738 if (ret < 0) 1775 if (ret < 0)
1739 return ret; 1776 return ret;
1740 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, 1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1741 di, ni, bh); 1778 di, ni, bh);
1742 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1779 nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
1743 return 0; 1780 return 0;
1744} 1781}
1745 1782
1746static int nilfs_btree_propagate_p(struct nilfs_btree *btree, 1783static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
1747 struct nilfs_btree_path *path, 1784 struct nilfs_btree_path *path,
1748 int level, 1785 int level,
1749 struct buffer_head *bh) 1786 struct buffer_head *bh)
@@ -1755,17 +1792,17 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1755 return 0; 1792 return 0;
1756} 1793}
1757 1794
1758static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, 1795static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
1759 struct nilfs_btree_path *path, 1796 struct nilfs_btree_path *path,
1760 int level, struct inode *dat) 1797 int level, struct inode *dat)
1761{ 1798{
1762 struct nilfs_btree_node *parent; 1799 struct nilfs_btree_node *parent;
1763 int ret; 1800 int ncmax, ret;
1764 1801
1765 parent = nilfs_btree_get_node(btree, path, level + 1); 1802 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1766 path[level].bp_oldreq.bpr_ptr = 1803 path[level].bp_oldreq.bpr_ptr =
1767 nilfs_btree_node_get_ptr(btree, parent, 1804 nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
1768 path[level + 1].bp_index); 1805 ncmax);
1769 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; 1806 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1770 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, 1807 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
1771 &path[level].bp_newreq.bpr_req); 1808 &path[level].bp_newreq.bpr_req);
@@ -1777,7 +1814,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1777 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; 1814 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1778 path[level].bp_ctxt.bh = path[level].bp_bh; 1815 path[level].bp_ctxt.bh = path[level].bp_bh;
1779 ret = nilfs_btnode_prepare_change_key( 1816 ret = nilfs_btnode_prepare_change_key(
1780 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1817 &NILFS_BMAP_I(btree)->i_btnode_cache,
1781 &path[level].bp_ctxt); 1818 &path[level].bp_ctxt);
1782 if (ret < 0) { 1819 if (ret < 0) {
1783 nilfs_dat_abort_update(dat, 1820 nilfs_dat_abort_update(dat,
@@ -1790,30 +1827,31 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1790 return 0; 1827 return 0;
1791} 1828}
1792 1829
1793static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, 1830static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
1794 struct nilfs_btree_path *path, 1831 struct nilfs_btree_path *path,
1795 int level, struct inode *dat) 1832 int level, struct inode *dat)
1796{ 1833{
1797 struct nilfs_btree_node *parent; 1834 struct nilfs_btree_node *parent;
1835 int ncmax;
1798 1836
1799 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, 1837 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
1800 &path[level].bp_newreq.bpr_req, 1838 &path[level].bp_newreq.bpr_req,
1801 btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); 1839 btree->b_ptr_type == NILFS_BMAP_PTR_VS);
1802 1840
1803 if (buffer_nilfs_node(path[level].bp_bh)) { 1841 if (buffer_nilfs_node(path[level].bp_bh)) {
1804 nilfs_btnode_commit_change_key( 1842 nilfs_btnode_commit_change_key(
1805 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1843 &NILFS_BMAP_I(btree)->i_btnode_cache,
1806 &path[level].bp_ctxt); 1844 &path[level].bp_ctxt);
1807 path[level].bp_bh = path[level].bp_ctxt.bh; 1845 path[level].bp_bh = path[level].bp_ctxt.bh;
1808 } 1846 }
1809 set_buffer_nilfs_volatile(path[level].bp_bh); 1847 set_buffer_nilfs_volatile(path[level].bp_bh);
1810 1848
1811 parent = nilfs_btree_get_node(btree, path, level + 1); 1849 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1812 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, 1850 nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
1813 path[level].bp_newreq.bpr_ptr); 1851 path[level].bp_newreq.bpr_ptr, ncmax);
1814} 1852}
1815 1853
1816static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, 1854static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
1817 struct nilfs_btree_path *path, 1855 struct nilfs_btree_path *path,
1818 int level, struct inode *dat) 1856 int level, struct inode *dat)
1819{ 1857{
@@ -1821,11 +1859,11 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1821 &path[level].bp_newreq.bpr_req); 1859 &path[level].bp_newreq.bpr_req);
1822 if (buffer_nilfs_node(path[level].bp_bh)) 1860 if (buffer_nilfs_node(path[level].bp_bh))
1823 nilfs_btnode_abort_change_key( 1861 nilfs_btnode_abort_change_key(
1824 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1862 &NILFS_BMAP_I(btree)->i_btnode_cache,
1825 &path[level].bp_ctxt); 1863 &path[level].bp_ctxt);
1826} 1864}
1827 1865
1828static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, 1866static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
1829 struct nilfs_btree_path *path, 1867 struct nilfs_btree_path *path,
1830 int minlevel, int *maxlevelp, 1868 int minlevel, int *maxlevelp,
1831 struct inode *dat) 1869 struct inode *dat)
@@ -1860,7 +1898,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1860 return ret; 1898 return ret;
1861} 1899}
1862 1900
1863static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, 1901static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree,
1864 struct nilfs_btree_path *path, 1902 struct nilfs_btree_path *path,
1865 int minlevel, int maxlevel, 1903 int minlevel, int maxlevel,
1866 struct buffer_head *bh, 1904 struct buffer_head *bh,
@@ -1875,14 +1913,15 @@ static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1875 nilfs_btree_commit_update_v(btree, path, level, dat); 1913 nilfs_btree_commit_update_v(btree, path, level, dat);
1876} 1914}
1877 1915
1878static int nilfs_btree_propagate_v(struct nilfs_btree *btree, 1916static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
1879 struct nilfs_btree_path *path, 1917 struct nilfs_btree_path *path,
1880 int level, struct buffer_head *bh) 1918 int level, struct buffer_head *bh)
1881{ 1919{
1882 int maxlevel = 0, ret; 1920 int maxlevel = 0, ret;
1883 struct nilfs_btree_node *parent; 1921 struct nilfs_btree_node *parent;
1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1922 struct inode *dat = nilfs_bmap_get_dat(btree);
1885 __u64 ptr; 1923 __u64 ptr;
1924 int ncmax;
1886 1925
1887 get_bh(bh); 1926 get_bh(bh);
1888 path[level].bp_bh = bh; 1927 path[level].bp_bh = bh;
@@ -1892,9 +1931,10 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1892 goto out; 1931 goto out;
1893 1932
1894 if (buffer_nilfs_volatile(path[level].bp_bh)) { 1933 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1895 parent = nilfs_btree_get_node(btree, path, level + 1); 1934 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1896 ptr = nilfs_btree_node_get_ptr(btree, parent, 1935 ptr = nilfs_btree_node_get_ptr(parent,
1897 path[level + 1].bp_index); 1936 path[level + 1].bp_index,
1937 ncmax);
1898 ret = nilfs_dat_mark_dirty(dat, ptr); 1938 ret = nilfs_dat_mark_dirty(dat, ptr);
1899 if (ret < 0) 1939 if (ret < 0)
1900 goto out; 1940 goto out;
@@ -1908,10 +1948,9 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1908 return ret; 1948 return ret;
1909} 1949}
1910 1950
1911static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, 1951static int nilfs_btree_propagate(struct nilfs_bmap *btree,
1912 struct buffer_head *bh) 1952 struct buffer_head *bh)
1913{ 1953{
1914 struct nilfs_btree *btree;
1915 struct nilfs_btree_path *path; 1954 struct nilfs_btree_path *path;
1916 struct nilfs_btree_node *node; 1955 struct nilfs_btree_node *node;
1917 __u64 key; 1956 __u64 key;
@@ -1919,22 +1958,20 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1919 1958
1920 WARN_ON(!buffer_dirty(bh)); 1959 WARN_ON(!buffer_dirty(bh));
1921 1960
1922 btree = (struct nilfs_btree *)bmap;
1923 path = nilfs_btree_alloc_path(); 1961 path = nilfs_btree_alloc_path();
1924 if (path == NULL) 1962 if (path == NULL)
1925 return -ENOMEM; 1963 return -ENOMEM;
1926 nilfs_btree_init_path(path);
1927 1964
1928 if (buffer_nilfs_node(bh)) { 1965 if (buffer_nilfs_node(bh)) {
1929 node = (struct nilfs_btree_node *)bh->b_data; 1966 node = (struct nilfs_btree_node *)bh->b_data;
1930 key = nilfs_btree_node_get_key(node, 0); 1967 key = nilfs_btree_node_get_key(node, 0);
1931 level = nilfs_btree_node_get_level(node); 1968 level = nilfs_btree_node_get_level(node);
1932 } else { 1969 } else {
1933 key = nilfs_bmap_data_get_key(bmap, bh); 1970 key = nilfs_bmap_data_get_key(btree, bh);
1934 level = NILFS_BTREE_LEVEL_DATA; 1971 level = NILFS_BTREE_LEVEL_DATA;
1935 } 1972 }
1936 1973
1937 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); 1974 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
1938 if (ret < 0) { 1975 if (ret < 0) {
1939 if (unlikely(ret == -ENOENT)) 1976 if (unlikely(ret == -ENOENT))
1940 printk(KERN_CRIT "%s: key = %llu, level == %d\n", 1977 printk(KERN_CRIT "%s: key = %llu, level == %d\n",
@@ -1942,24 +1979,23 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1942 goto out; 1979 goto out;
1943 } 1980 }
1944 1981
1945 ret = NILFS_BMAP_USE_VBN(bmap) ? 1982 ret = NILFS_BMAP_USE_VBN(btree) ?
1946 nilfs_btree_propagate_v(btree, path, level, bh) : 1983 nilfs_btree_propagate_v(btree, path, level, bh) :
1947 nilfs_btree_propagate_p(btree, path, level, bh); 1984 nilfs_btree_propagate_p(btree, path, level, bh);
1948 1985
1949 out: 1986 out:
1950 nilfs_btree_release_path(path);
1951 nilfs_btree_free_path(path); 1987 nilfs_btree_free_path(path);
1952 1988
1953 return ret; 1989 return ret;
1954} 1990}
1955 1991
1956static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, 1992static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree,
1957 struct buffer_head *bh) 1993 struct buffer_head *bh)
1958{ 1994{
1959 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); 1995 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr);
1960} 1996}
1961 1997
1962static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, 1998static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
1963 struct list_head *lists, 1999 struct list_head *lists,
1964 struct buffer_head *bh) 2000 struct buffer_head *bh)
1965{ 2001{
@@ -1973,6 +2009,18 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1973 node = (struct nilfs_btree_node *)bh->b_data; 2009 node = (struct nilfs_btree_node *)bh->b_data;
1974 key = nilfs_btree_node_get_key(node, 0); 2010 key = nilfs_btree_node_get_key(node, 0);
1975 level = nilfs_btree_node_get_level(node); 2011 level = nilfs_btree_node_get_level(node);
2012 if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
2013 level >= NILFS_BTREE_LEVEL_MAX) {
2014 dump_stack();
2015 printk(KERN_WARNING
2016 "%s: invalid btree level: %d (key=%llu, ino=%lu, "
2017 "blocknr=%llu)\n",
2018 __func__, level, (unsigned long long)key,
2019 NILFS_BMAP_I(btree)->vfs_inode.i_ino,
2020 (unsigned long long)bh->b_blocknr);
2021 return;
2022 }
2023
1976 list_for_each(head, &lists[level]) { 2024 list_for_each(head, &lists[level]) {
1977 cbh = list_entry(head, struct buffer_head, b_assoc_buffers); 2025 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1978 cnode = (struct nilfs_btree_node *)cbh->b_data; 2026 cnode = (struct nilfs_btree_node *)cbh->b_data;
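The block added at the top of this hunk is new behavior, not just refactoring: nilfs_btree_add_dirty_buffer() now validates the level field read from the on-disk node before using it, so a corrupted node produces a stack dump and a warning instead of an out-of-bounds access of the on-stack lists[level] array in the caller below. The guard in isolation, with illustrative bounds:

#include <stdio.h>

#define MODEL_LEVEL_NODE_MIN 1	/* illustrative; real bounds come from */
#define MODEL_LEVEL_MAX      14	/* the nilfs2 on-disk format headers   */

/* Returns nonzero when a node's self-reported level is unsafe to use
 * as an index into a lists[MODEL_LEVEL_MAX] array. */
static int model_check_level(int level, unsigned long long blocknr)
{
	if (level < MODEL_LEVEL_NODE_MIN || level >= MODEL_LEVEL_MAX) {
		fprintf(stderr, "invalid btree level: %d (blocknr=%llu)\n",
			level, blocknr);
		return -1;
	}
	return 0;
}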
@@ -1983,11 +2031,10 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1983 list_add_tail(&bh->b_assoc_buffers, head); 2031 list_add_tail(&bh->b_assoc_buffers, head);
1984} 2032}
1985 2033
1986static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, 2034static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
1987 struct list_head *listp) 2035 struct list_head *listp)
1988{ 2036{
1989 struct nilfs_btree *btree = (struct nilfs_btree *)bmap; 2037 struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
1990 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1991 struct list_head lists[NILFS_BTREE_LEVEL_MAX]; 2038 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1992 struct pagevec pvec; 2039 struct pagevec pvec;
1993 struct buffer_head *bh, *head; 2040 struct buffer_head *bh, *head;
@@ -2021,7 +2068,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
2021 list_splice_tail(&lists[level], listp); 2068 list_splice_tail(&lists[level], listp);
2022} 2069}
2023 2070
2024static int nilfs_btree_assign_p(struct nilfs_btree *btree, 2071static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
2025 struct nilfs_btree_path *path, 2072 struct nilfs_btree_path *path,
2026 int level, 2073 int level,
2027 struct buffer_head **bh, 2074 struct buffer_head **bh,
@@ -2031,38 +2078,38 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2031 struct nilfs_btree_node *parent; 2078 struct nilfs_btree_node *parent;
2032 __u64 key; 2079 __u64 key;
2033 __u64 ptr; 2080 __u64 ptr;
2034 int ret; 2081 int ncmax, ret;
2035 2082
2036 parent = nilfs_btree_get_node(btree, path, level + 1); 2083 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2037 ptr = nilfs_btree_node_get_ptr(btree, parent, 2084 ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
2038 path[level + 1].bp_index); 2085 ncmax);
2039 if (buffer_nilfs_node(*bh)) { 2086 if (buffer_nilfs_node(*bh)) {
2040 path[level].bp_ctxt.oldkey = ptr; 2087 path[level].bp_ctxt.oldkey = ptr;
2041 path[level].bp_ctxt.newkey = blocknr; 2088 path[level].bp_ctxt.newkey = blocknr;
2042 path[level].bp_ctxt.bh = *bh; 2089 path[level].bp_ctxt.bh = *bh;
2043 ret = nilfs_btnode_prepare_change_key( 2090 ret = nilfs_btnode_prepare_change_key(
2044 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 2091 &NILFS_BMAP_I(btree)->i_btnode_cache,
2045 &path[level].bp_ctxt); 2092 &path[level].bp_ctxt);
2046 if (ret < 0) 2093 if (ret < 0)
2047 return ret; 2094 return ret;
2048 nilfs_btnode_commit_change_key( 2095 nilfs_btnode_commit_change_key(
2049 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 2096 &NILFS_BMAP_I(btree)->i_btnode_cache,
2050 &path[level].bp_ctxt); 2097 &path[level].bp_ctxt);
2051 *bh = path[level].bp_ctxt.bh; 2098 *bh = path[level].bp_ctxt.bh;
2052 } 2099 }
2053 2100
2054 nilfs_btree_node_set_ptr(btree, parent, 2101 nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr,
2055 path[level + 1].bp_index, blocknr); 2102 ncmax);
2056 2103
2057 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); 2104 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2058 /* on-disk format */ 2105 /* on-disk format */
2059 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2106 binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
2060 binfo->bi_dat.bi_level = level; 2107 binfo->bi_dat.bi_level = level;
2061 2108
2062 return 0; 2109 return 0;
2063} 2110}
2064 2111
2065static int nilfs_btree_assign_v(struct nilfs_btree *btree, 2112static int nilfs_btree_assign_v(struct nilfs_bmap *btree,
2066 struct nilfs_btree_path *path, 2113 struct nilfs_btree_path *path,
2067 int level, 2114 int level,
2068 struct buffer_head **bh, 2115 struct buffer_head **bh,
@@ -2070,15 +2117,15 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2070 union nilfs_binfo *binfo) 2117 union nilfs_binfo *binfo)
2071{ 2118{
2072 struct nilfs_btree_node *parent; 2119 struct nilfs_btree_node *parent;
2073 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 2120 struct inode *dat = nilfs_bmap_get_dat(btree);
2074 __u64 key; 2121 __u64 key;
2075 __u64 ptr; 2122 __u64 ptr;
2076 union nilfs_bmap_ptr_req req; 2123 union nilfs_bmap_ptr_req req;
2077 int ret; 2124 int ncmax, ret;
2078 2125
2079 parent = nilfs_btree_get_node(btree, path, level + 1); 2126 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2080 ptr = nilfs_btree_node_get_ptr(btree, parent, 2127 ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
2081 path[level + 1].bp_index); 2128 ncmax);
2082 req.bpr_ptr = ptr; 2129 req.bpr_ptr = ptr;
2083 ret = nilfs_dat_prepare_start(dat, &req.bpr_req); 2130 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
2084 if (ret < 0) 2131 if (ret < 0)
@@ -2087,56 +2134,52 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2087 2134
2088 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); 2135 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2089 /* on-disk format */ 2136 /* on-disk format */
2090 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 2137 binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
2091 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2138 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2092 2139
2093 return 0; 2140 return 0;
2094} 2141}
2095 2142
2096static int nilfs_btree_assign(struct nilfs_bmap *bmap, 2143static int nilfs_btree_assign(struct nilfs_bmap *btree,
2097 struct buffer_head **bh, 2144 struct buffer_head **bh,
2098 sector_t blocknr, 2145 sector_t blocknr,
2099 union nilfs_binfo *binfo) 2146 union nilfs_binfo *binfo)
2100{ 2147{
2101 struct nilfs_btree *btree;
2102 struct nilfs_btree_path *path; 2148 struct nilfs_btree_path *path;
2103 struct nilfs_btree_node *node; 2149 struct nilfs_btree_node *node;
2104 __u64 key; 2150 __u64 key;
2105 int level, ret; 2151 int level, ret;
2106 2152
2107 btree = (struct nilfs_btree *)bmap;
2108 path = nilfs_btree_alloc_path(); 2153 path = nilfs_btree_alloc_path();
2109 if (path == NULL) 2154 if (path == NULL)
2110 return -ENOMEM; 2155 return -ENOMEM;
2111 nilfs_btree_init_path(path);
2112 2156
2113 if (buffer_nilfs_node(*bh)) { 2157 if (buffer_nilfs_node(*bh)) {
2114 node = (struct nilfs_btree_node *)(*bh)->b_data; 2158 node = (struct nilfs_btree_node *)(*bh)->b_data;
2115 key = nilfs_btree_node_get_key(node, 0); 2159 key = nilfs_btree_node_get_key(node, 0);
2116 level = nilfs_btree_node_get_level(node); 2160 level = nilfs_btree_node_get_level(node);
2117 } else { 2161 } else {
2118 key = nilfs_bmap_data_get_key(bmap, *bh); 2162 key = nilfs_bmap_data_get_key(btree, *bh);
2119 level = NILFS_BTREE_LEVEL_DATA; 2163 level = NILFS_BTREE_LEVEL_DATA;
2120 } 2164 }
2121 2165
2122 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); 2166 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
2123 if (ret < 0) { 2167 if (ret < 0) {
2124 WARN_ON(ret == -ENOENT); 2168 WARN_ON(ret == -ENOENT);
2125 goto out; 2169 goto out;
2126 } 2170 }
2127 2171
2128 ret = NILFS_BMAP_USE_VBN(bmap) ? 2172 ret = NILFS_BMAP_USE_VBN(btree) ?
2129 nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : 2173 nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
2130 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2174 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2131 2175
2132 out: 2176 out:
2133 nilfs_btree_release_path(path);
2134 nilfs_btree_free_path(path); 2177 nilfs_btree_free_path(path);
2135 2178
2136 return ret; 2179 return ret;
2137} 2180}
2138 2181
2139static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, 2182static int nilfs_btree_assign_gc(struct nilfs_bmap *btree,
2140 struct buffer_head **bh, 2183 struct buffer_head **bh,
2141 sector_t blocknr, 2184 sector_t blocknr,
2142 union nilfs_binfo *binfo) 2185 union nilfs_binfo *binfo)
@@ -2145,7 +2188,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2145 __u64 key; 2188 __u64 key;
2146 int ret; 2189 int ret;
2147 2190
2148 ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, 2191 ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr,
2149 blocknr); 2192 blocknr);
2150 if (ret < 0) 2193 if (ret < 0)
2151 return ret; 2194 return ret;
@@ -2154,30 +2197,27 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2154 node = (struct nilfs_btree_node *)(*bh)->b_data; 2197 node = (struct nilfs_btree_node *)(*bh)->b_data;
2155 key = nilfs_btree_node_get_key(node, 0); 2198 key = nilfs_btree_node_get_key(node, 0);
2156 } else 2199 } else
2157 key = nilfs_bmap_data_get_key(bmap, *bh); 2200 key = nilfs_bmap_data_get_key(btree, *bh);
2158 2201
2159 /* on-disk format */ 2202 /* on-disk format */
2160 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); 2203 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2161 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2204 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2162 2205
2163 return 0; 2206 return 0;
2164} 2207}
2165 2208
2166static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) 2209static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
2167{ 2210{
2168 struct buffer_head *bh; 2211 struct buffer_head *bh;
2169 struct nilfs_btree *btree;
2170 struct nilfs_btree_path *path; 2212 struct nilfs_btree_path *path;
2171 __u64 ptr; 2213 __u64 ptr;
2172 int ret; 2214 int ret;
2173 2215
2174 btree = (struct nilfs_btree *)bmap;
2175 path = nilfs_btree_alloc_path(); 2216 path = nilfs_btree_alloc_path();
2176 if (path == NULL) 2217 if (path == NULL)
2177 return -ENOMEM; 2218 return -ENOMEM;
2178 nilfs_btree_init_path(path);
2179 2219
2180 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2220 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0);
2181 if (ret < 0) { 2221 if (ret < 0) {
2182 WARN_ON(ret == -ENOENT); 2222 WARN_ON(ret == -ENOENT);
2183 goto out; 2223 goto out;
@@ -2191,11 +2231,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2191 if (!buffer_dirty(bh)) 2231 if (!buffer_dirty(bh))
2192 nilfs_btnode_mark_dirty(bh); 2232 nilfs_btnode_mark_dirty(bh);
2193 brelse(bh); 2233 brelse(bh);
2194 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 2234 if (!nilfs_bmap_dirty(btree))
2195 nilfs_bmap_set_dirty(&btree->bt_bmap); 2235 nilfs_bmap_set_dirty(btree);
2196 2236
2197 out: 2237 out:
2198 nilfs_btree_release_path(path);
2199 nilfs_btree_free_path(path); 2238 nilfs_btree_free_path(path);
2200 return ret; 2239 return ret;
2201} 2240}
@@ -2243,10 +2282,14 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2243int nilfs_btree_init(struct nilfs_bmap *bmap) 2282int nilfs_btree_init(struct nilfs_bmap *bmap)
2244{ 2283{
2245 bmap->b_ops = &nilfs_btree_ops; 2284 bmap->b_ops = &nilfs_btree_ops;
2285 bmap->b_nchildren_per_block =
2286 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2246 return 0; 2287 return 0;
2247} 2288}
2248 2289
2249void nilfs_btree_init_gc(struct nilfs_bmap *bmap) 2290void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2250{ 2291{
2251 bmap->b_ops = &nilfs_btree_ops_gc; 2292 bmap->b_ops = &nilfs_btree_ops_gc;
2293 bmap->b_nchildren_per_block =
2294 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2252} 2295}
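Note on the btree.c hunks above: with struct nilfs_btree gone, the node accessors also grow an explicit ncmax argument (the children-per-block capacity now cached in b_nchildren_per_block). The capacity is needed because pointers in a node are stored after a key array sized by that capacity, so a pointer cannot be located from the node alone. A minimal user-space model of this layout; the node_hdr struct and all names below are simplified stand-ins, not the kernel structures:

#include <stdint.h>
#include <stdio.h>

/* simplified stand-in for struct nilfs_btree_node */
struct node_hdr {
	uint8_t  flags;
	uint8_t  level;
	uint16_t nchildren;
	uint32_t pad;
};

/* keys start right after the header */
static uint64_t *node_dkeys(struct node_hdr *node)
{
	return (uint64_t *)(node + 1);
}

/* pointers start after ncmax keys, so the capacity must be passed in */
static uint64_t *node_dptrs(struct node_hdr *node, int ncmax)
{
	return node_dkeys(node) + ncmax;
}

static uint64_t node_get_ptr(struct node_hdr *node, int index, int ncmax)
{
	return node_dptrs(node, ncmax)[index];
}

int main(void)
{
	unsigned char block[4096] = {0};
	struct node_hdr *node = (struct node_hdr *)block;
	int ncmax = (4096 - sizeof(*node)) / (2 * sizeof(uint64_t)); /* 255 */

	node_dkeys(node)[0] = 10;
	node_dptrs(node, ncmax)[0] = 12345;
	printf("key 10 -> blk %llu (ncmax=%d)\n",
	       (unsigned long long)node_get_ptr(node, 0, ncmax), ncmax);
	return 0;
}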
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..22c02e35b6ef 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,18 +30,26 @@
 #include "btnode.h"
 #include "bmap.h"
 
-struct nilfs_btree;
-struct nilfs_btree_path;
-
 /**
- * struct nilfs_btree - B-tree structure
- * @bt_bmap: bmap base structure
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
  */
-struct nilfs_btree {
-	struct nilfs_bmap bt_bmap;
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;
+	void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
 };
 
-
 #define NILFS_BTREE_ROOT_SIZE		NILFS_BMAP_SIZE
 #define NILFS_BTREE_ROOT_NCHILDREN_MAX \
 	((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \
@@ -57,12 +65,13 @@ struct nilfs_btree {
 #define NILFS_BTREE_KEY_MIN	((__u64)0)
 #define NILFS_BTREE_KEY_MAX	(~(__u64)0)
 
+extern struct kmem_cache *nilfs_btree_path_cache;
 
-int nilfs_btree_path_cache_init(void);
-void nilfs_btree_path_cache_destroy(void);
 int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
 				   const __u64 *, const __u64 *, int);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
 
+int nilfs_btree_broken_node_block(struct buffer_head *bh);
+
 #endif	/* _NILFS_BTREE_H */
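The header now exports the slab cache for path objects instead of private init/destroy hooks, so allocating a lookup path becomes a plain kmem_cache_alloc() plus per-level initialization. A rough kernel-context sketch of what nilfs_btree_alloc_path() then looks like, reconstructed from the fields above rather than copied from btree.c:

struct kmem_cache *nilfs_btree_path_cache;

static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
	struct nilfs_btree_path *path;
	int level = NILFS_BTREE_LEVEL_DATA;

	/* one array of NILFS_BTREE_LEVEL_MAX entries per lookup */
	path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
	if (path == NULL)
		return NULL;

	for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
		path[level].bp_bh = NULL;
		path[level].bp_sib_bh = NULL;
		path[level].bp_index = 0;
		path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
		path[level].bp_op = NULL;
	}
	return path;
}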
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 85c89dfc71f0..cb003c8ee1f6 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -80,23 +80,10 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
 	return last_byte;
 }
 
-static int nilfs_prepare_chunk_uninterruptible(struct page *page,
-					       struct address_space *mapping,
-					       unsigned from, unsigned to)
+static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
 {
 	loff_t pos = page_offset(page) + from;
-	return block_write_begin(NULL, mapping, pos, to - from,
-				 AOP_FLAG_UNINTERRUPTIBLE, &page,
-				 NULL, nilfs_get_block);
-}
-
-static int nilfs_prepare_chunk(struct page *page,
-			       struct address_space *mapping,
-			       unsigned from, unsigned to)
-{
-	loff_t pos = page_offset(page) + from;
-	return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
-				 NULL, nilfs_get_block);
+	return __block_write_begin(page, pos, to - from, nilfs_get_block);
 }
 
 static void nilfs_commit_chunk(struct page *page,
@@ -141,7 +128,7 @@ static void nilfs_check_page(struct page *page)
 	}
 	for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
 		p = (struct nilfs_dir_entry *)(kaddr + offs);
-		rec_len = le16_to_cpu(p->rec_len);
+		rec_len = nilfs_rec_len_from_disk(p->rec_len);
 
 		if (rec_len < NILFS_DIR_REC_LEN(1))
 			goto Eshort;
@@ -199,13 +186,10 @@ fail:
 static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t *)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
+
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			nilfs_check_page(page);
 		if (PageError(page))
@@ -238,7 +222,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
  */
static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
{
-	return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+	return (struct nilfs_dir_entry *)((char *)p +
+					  nilfs_rec_len_from_disk(p->rec_len));
}
 
 static unsigned char
@@ -329,7 +314,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 					goto success;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
 		}
 		nilfs_put_page(page);
 	}
@@ -444,12 +429,12 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 		    struct page *page, struct inode *inode)
 {
 	unsigned from = (char *) de - (char *) page_address(page);
-	unsigned to = from + le16_to_cpu(de->rec_len);
+	unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
 	struct address_space *mapping = page->mapping;
 	int err;
 
 	lock_page(page);
-	err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	BUG_ON(err);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
@@ -500,7 +485,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 			/* We hit i_size */
 			name_len = 0;
 			rec_len = chunk_size;
-			de->rec_len = cpu_to_le16(chunk_size);
+			de->rec_len = nilfs_rec_len_to_disk(chunk_size);
 			de->inode = 0;
 			goto got_it;
 		}
@@ -514,7 +499,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 		if (nilfs_match(namelen, name, de))
 			goto out_unlock;
 		name_len = NILFS_DIR_REC_LEN(de->name_len);
-		rec_len = le16_to_cpu(de->rec_len);
+		rec_len = nilfs_rec_len_from_disk(de->rec_len);
 		if (!de->inode && rec_len >= reclen)
 			goto got_it;
 		if (rec_len >= name_len + reclen)
@@ -530,15 +515,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 got_it:
 	from = (char *)de - (char *)page_address(page);
 	to = from + rec_len;
-	err = nilfs_prepare_chunk(page, page->mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	if (err)
 		goto out_unlock;
 	if (de->inode) {
 		struct nilfs_dir_entry *de1;
 
 		de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
-		de1->rec_len = cpu_to_le16(rec_len - name_len);
-		de->rec_len = cpu_to_le16(name_len);
+		de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len);
+		de->rec_len = nilfs_rec_len_to_disk(name_len);
 		de = de1;
 	}
 	de->name_len = namelen;
@@ -569,7 +554,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	struct inode *inode = mapping->host;
 	char *kaddr = page_address(page);
 	unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
-	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	unsigned to = ((char *)dir - kaddr) +
+			nilfs_rec_len_from_disk(dir->rec_len);
 	struct nilfs_dir_entry *pde = NULL;
 	struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
 	int err;
@@ -587,10 +573,10 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	if (pde)
 		from = (char *)pde - (char *)page_address(page);
 	lock_page(page);
-	err = nilfs_prepare_chunk(page, mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	BUG_ON(err);
 	if (pde)
-		pde->rec_len = cpu_to_le16(to - from);
+		pde->rec_len = nilfs_rec_len_to_disk(to - from);
 	dir->inode = 0;
 	nilfs_commit_chunk(page, mapping, from, to);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -615,7 +601,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	if (!page)
 		return -ENOMEM;
 
-	err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
+	err = nilfs_prepare_chunk(page, 0, chunk_size);
 	if (unlikely(err)) {
 		unlock_page(page);
 		goto fail;
@@ -624,14 +610,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	memset(kaddr, 0, chunk_size);
 	de = (struct nilfs_dir_entry *)kaddr;
 	de->name_len = 1;
-	de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
+	de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
 	memcpy(de->name, ".\0\0", 4);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
 
 	de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
 	de->name_len = 2;
-	de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
+	de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
 	de->inode = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, "..\0", 4);
 	nilfs_set_de_type(de, inode);
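The dir.c hunks replace raw le16 conversions of rec_len with nilfs_rec_len_from_disk()/nilfs_rec_len_to_disk(). The point of the indirection is the ext2-style encoding needed once a directory chunk can be a full 64 KiB: a record length of 65536 does not fit in 16 bits and is stored as 0xffff. A plausible user-space sketch of such helpers (the kernel versions additionally do the le16 byte-swapping and guard the special case behind page-size ifdefs):

#include <assert.h>
#include <stdint.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* 0xffff encodes a 64 KiB record */

static unsigned rec_len_from_disk(uint16_t dlen)
{
	unsigned len = dlen;		/* le16_to_cpu() in the kernel */

	if (len == MAX_REC_LEN)
		return 1 << 16;
	return len;
}

static uint16_t rec_len_to_disk(unsigned len)
{
	if (len == (1 << 16))
		return MAX_REC_LEN;	/* cpu_to_le16() in the kernel */
	assert(len < (1 << 16));
	return (uint16_t)len;
}

int main(void)
{
	assert(rec_len_from_disk(rec_len_to_disk(1 << 16)) == 1 << 16);
	assert(rec_len_from_disk(rec_len_to_disk(12)) == 12);
	return 0;
}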
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 236753df5cdf..324d80c57518 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -27,47 +27,43 @@
 #include "alloc.h"
 #include "dat.h"
 
-static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
+static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct)
 {
 	return (__le64 *)
-		((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
+		((struct nilfs_direct_node *)direct->b_u.u_data + 1);
 }
 
 static inline __u64
-nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key)
 {
-	return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
+	return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key));
 }
 
-static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
+static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct,
 					__u64 key, __u64 ptr)
 {
-	*(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
+	*(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr);
 }
 
-static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
 			       __u64 key, int level, __u64 *ptrp)
 {
-	struct nilfs_direct *direct;
 	__u64 ptr;
 
-	direct = (struct nilfs_direct *)bmap;	/* XXX: use macro for level 1 */
 	if (key > NILFS_DIRECT_KEY_MAX || level != 1)
 		return -ENOENT;
 	ptr = nilfs_direct_get_ptr(direct, key);
 	if (ptr == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	if (ptrp != NULL)
-		*ptrp = ptr;
+	*ptrp = ptr;
 	return 0;
 }
 
-static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 				      __u64 key, __u64 *ptrp,
 				      unsigned maxblocks)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	struct inode *dat = NULL;
 	__u64 ptr, ptr2;
 	sector_t blocknr;
@@ -79,8 +75,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 	if (ptr == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	if (NILFS_BMAP_USE_VBN(bmap)) {
-		dat = nilfs_bmap_get_dat(bmap);
+	if (NILFS_BMAP_USE_VBN(direct)) {
+		dat = nilfs_bmap_get_dat(direct);
 		ret = nilfs_dat_translate(dat, ptr, &blocknr);
 		if (ret < 0)
 			return ret;
@@ -106,29 +102,21 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 }
 
 static __u64
-nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
 {
 	__u64 ptr;
 
-	ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
+	ptr = nilfs_bmap_find_target_seq(direct, key);
 	if (ptr != NILFS_BMAP_INVALID_PTR)
 		/* sequential access */
 		return ptr;
 	else
 		/* block group */
-		return nilfs_bmap_find_target_in_group(&direct->d_bmap);
-}
-
-static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
-				      __u64 key, __u64 ptr)
-{
-	direct->d_bmap.b_last_allocated_key = key;
-	direct->d_bmap.b_last_allocated_ptr = ptr;
+		return nilfs_bmap_find_target_in_group(direct);
 }
 
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
 	struct inode *dat = NULL;
 	struct buffer_head *bh;
@@ -136,11 +124,11 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 
 	if (key > NILFS_DIRECT_KEY_MAX)
 		return -ENOENT;
-	if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
+	if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR)
 		return -EEXIST;
 
 	if (NILFS_BMAP_USE_VBN(bmap)) {
-		req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+		req.bpr_ptr = nilfs_direct_find_target_v(bmap, key);
 		dat = nilfs_bmap_get_dat(bmap);
 	}
 	ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
@@ -150,13 +138,13 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 		set_buffer_nilfs_volatile(bh);
 
 		nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
-		nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
+		nilfs_direct_set_ptr(bmap, key, req.bpr_ptr);
 
 		if (!nilfs_bmap_dirty(bmap))
 			nilfs_bmap_set_dirty(bmap);
 
 		if (NILFS_BMAP_USE_VBN(bmap))
-			nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
+			nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
 
 		nilfs_bmap_add_blocks(bmap, 1);
 	}
@@ -165,33 +153,30 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 
 static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
 	struct inode *dat;
 	int ret;
 
 	if (key > NILFS_DIRECT_KEY_MAX ||
-	    nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
+	    nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
 	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
-	req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
+	req.bpr_ptr = nilfs_direct_get_ptr(bmap, key);
 
 	ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
 	if (!ret) {
 		nilfs_bmap_commit_end_ptr(bmap, &req, dat);
-		nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
 		nilfs_bmap_sub_blocks(bmap, 1);
 	}
 	return ret;
 }
 
-static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
 {
-	struct nilfs_direct *direct;
 	__u64 key, lastkey;
 
-	direct = (struct nilfs_direct *)bmap;
 	lastkey = NILFS_DIRECT_KEY_MAX + 1;
 	for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
 		if (nilfs_direct_get_ptr(direct, key) !=
@@ -211,15 +196,13 @@ static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
 	return key > NILFS_DIRECT_KEY_MAX;
 }
 
-static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
+static int nilfs_direct_gather_data(struct nilfs_bmap *direct,
 				    __u64 *keys, __u64 *ptrs, int nitems)
 {
-	struct nilfs_direct *direct;
 	__u64 key;
 	__u64 ptr;
 	int n;
 
-	direct = (struct nilfs_direct *)bmap;
 	if (nitems > NILFS_DIRECT_NBLOCKS)
 		nitems = NILFS_DIRECT_NBLOCKS;
 	n = 0;
@@ -237,7 +220,6 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 				    __u64 key, __u64 *keys, __u64 *ptrs, int n)
 {
-	struct nilfs_direct *direct;
 	__le64 *dptrs;
 	int ret, i, j;
 
@@ -253,12 +235,11 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	bmap->b_ops->bop_clear(bmap);
 
 	/* convert */
-	direct = (struct nilfs_direct *)bmap;
-	dptrs = nilfs_direct_dptrs(direct);
+	dptrs = nilfs_direct_dptrs(bmap);
 	for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
 		if ((j < n) && (i == keys[j])) {
 			dptrs[i] = (i != key) ?
-				nilfs_bmap_ptr_to_dptr(ptrs[j]) :
+				cpu_to_le64(ptrs[j]) :
 				NILFS_BMAP_INVALID_PTR;
 			j++;
 		} else
@@ -269,10 +250,9 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	return 0;
 }
 
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
 				  struct buffer_head *bh)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	struct nilfs_palloc_req oldreq, newreq;
 	struct inode *dat;
 	__u64 key;
@@ -284,7 +264,7 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 
 	dat = nilfs_bmap_get_dat(bmap);
 	key = nilfs_bmap_data_get_key(bmap, bh);
-	ptr = nilfs_direct_get_ptr(direct, key);
+	ptr = nilfs_direct_get_ptr(bmap, key);
 	if (!buffer_nilfs_volatile(bh)) {
 		oldreq.pr_entry_nr = ptr;
 		newreq.pr_entry_nr = ptr;
@@ -294,20 +274,20 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 		nilfs_dat_commit_update(dat, &oldreq, &newreq,
 					bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 		set_buffer_nilfs_volatile(bh);
-		nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
+		nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr);
 	} else
 		ret = nilfs_dat_mark_dirty(dat, ptr);
 
 	return ret;
 }
 
-static int nilfs_direct_assign_v(struct nilfs_direct *direct,
+static int nilfs_direct_assign_v(struct nilfs_bmap *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
 {
-	struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
+	struct inode *dat = nilfs_bmap_get_dat(direct);
 	union nilfs_bmap_ptr_req req;
 	int ret;
 
@@ -315,13 +295,13 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
 	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
 	if (!ret) {
 		nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
-		binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
-		binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+		binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
+		binfo->bi_v.bi_blkoff = cpu_to_le64(key);
 	}
 	return ret;
 }
 
-static int nilfs_direct_assign_p(struct nilfs_direct *direct,
+static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
@@ -329,7 +309,7 @@ static int nilfs_direct_assign_p(struct nilfs_direct *direct,
 {
 	nilfs_direct_set_ptr(direct, key, blocknr);
 
-	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
 	binfo->bi_dat.bi_level = 0;
 
 	return 0;
@@ -340,18 +320,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 			       sector_t blocknr,
 			       union nilfs_binfo *binfo)
 {
-	struct nilfs_direct *direct;
 	__u64 key;
 	__u64 ptr;
 
-	direct = (struct nilfs_direct *)bmap;
 	key = nilfs_bmap_data_get_key(bmap, *bh);
 	if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
 		printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
 		       (unsigned long long)key);
 		return -EINVAL;
 	}
-	ptr = nilfs_direct_get_ptr(direct, key);
+	ptr = nilfs_direct_get_ptr(bmap, key);
 	if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
 		printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
 		       (unsigned long long)ptr);
@@ -359,8 +337,8 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 	}
 
 	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
-		nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
+		nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) :
+		nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo);
 }
 
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
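With struct nilfs_direct gone, the direct mapping is just an interpretation of the bmap's inline data area: a small node header followed by one little-endian pointer per file block key. A rough user-space model of that layout (the array size and invalid-pointer value below are placeholders; the real NILFS_BMAP_SIZE and NILFS_BMAP_INVALID_PTR are defined elsewhere in the tree):

#include <stdint.h>
#include <stdio.h>

#define BMAP_SIZE	56			/* placeholder for NILFS_BMAP_SIZE */
#define NBLOCKS		(BMAP_SIZE / 8 - 1)	/* one slot taken by the header */
#define INVALID_PTR	(~(uint64_t)0)		/* placeholder invalid pointer */

struct direct_node { uint8_t dn_flags; uint8_t pad[7]; };

struct bmap { uint64_t u_data[BMAP_SIZE / 8]; };

static uint64_t *direct_dptrs(struct bmap *bmap)
{
	/* pointer array starts right after the one-word node header */
	return (uint64_t *)((struct direct_node *)bmap->u_data + 1);
}

int main(void)
{
	struct bmap b;
	int i;

	for (i = 0; i < NBLOCKS; i++)
		direct_dptrs(&b)[i] = INVALID_PTR;
	direct_dptrs(&b)[2] = 4711;	/* file block 2 -> disk block 4711 */
	printf("lookup(2) = %llu\n",
	       (unsigned long long)direct_dptrs(&b)[2]);
	return 0;
}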
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index a5ffd66e25d0..dc643de20a25 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -28,8 +28,6 @@
 #include "bmap.h"
 
 
-struct nilfs_direct;
-
 /**
  * struct nilfs_direct_node - direct node
  * @dn_flags: flags
@@ -40,15 +38,6 @@ struct nilfs_direct_node {
 	__u8 pad[7];
 };
 
-/**
- * struct nilfs_direct - direct mapping
- * @d_bmap: bmap structure
- */
-struct nilfs_direct {
-	struct nilfs_bmap d_bmap;
-};
-
-
 #define NILFS_DIRECT_NBLOCKS	(NILFS_BMAP_SIZE / sizeof(__le64) - 1)
 #define NILFS_DIRECT_KEY_MIN	0
 #define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
 #include "nilfs.h"
 #include "segment.h"
 
-int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int nilfs_sync_file(struct file *file, int datasync)
 {
 	/*
 	 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 * This function should be implemented when the writeback function
 	 * will be implemented.
 	 */
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	int err;
 
 	if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index dd5f7e0a95f6..84a45d1d5464 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -78,7 +78,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
 	struct inode *gcdat = nilfs->ns_gc_dat;
 	struct nilfs_inode_info *gii = NILFS_I(gcdat);
 
-	gcdat->i_state = I_CLEAR;
+	gcdat->i_state = I_FREEING | I_CLEAR;
 	gii->i_flags = 0;
 
 	nilfs_palloc_clear_cache(gcdat);
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 145f03cd7d3e..bed3a783129b 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -48,6 +48,8 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "mdt.h"
 #include "dat.h"
@@ -149,8 +151,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
 				   __u64 vbn, struct buffer_head **out_bh)
 {
-	int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-					    vbn ? : pbn, pbn, out_bh);
+	int ret;
+
+	ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+					vbn ? : pbn, pbn, READ, out_bh, &pbn);
 	if (ret == -EEXIST) /* internal code (cache hit) */
 		ret = 0;
 	return ret;
@@ -164,10 +168,15 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 	if (buffer_dirty(bh))
 		return -EEXIST;
 
-	if (buffer_nilfs_node(bh))
+	if (buffer_nilfs_node(bh)) {
+		if (nilfs_btree_broken_node_block(bh)) {
+			clear_buffer_uptodate(bh);
+			return -EIO;
+		}
 		nilfs_btnode_mark_dirty(bh);
-	else
+	} else {
 		nilfs_mdt_mark_buffer_dirty(bh);
+	}
 	return 0;
 }
 
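Two details in the gcinode.c hunks are worth spelling out. First, a GC-read btree node whose header fails nilfs_btree_broken_node_block() is now rejected (and its uptodate flag dropped) before it can ever be marked dirty. Second, "vbn ? : pbn" is GNU C's two-operand conditional: it yields vbn when nonzero, otherwise pbn, evaluating vbn only once. Its portable equivalent, as a tiny self-contained check:

#include <assert.h>
#include <stdint.h>

static uint64_t pick_blocknr(uint64_t vbn, uint64_t pbn)
{
	return vbn ? vbn : pbn;	/* what "vbn ? : pbn" expands to */
}

int main(void)
{
	assert(pick_blocknr(0, 42) == 42);	/* no virtual block number */
	assert(pick_blocknr(7, 42) == 7);	/* virtual number wins */
	return 0;
}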
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 0957b58f909d..eccb2f2e2315 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/uio.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
@@ -197,11 +198,15 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(err))
 		return err;
 
-	*pagep = NULL;
-	err = block_write_begin(file, mapping, pos, len, flags, pagep,
-				fsdata, nilfs_get_block);
-	if (unlikely(err))
+	err = block_write_begin(mapping, pos, len, flags, pagep,
+				nilfs_get_block);
+	if (unlikely(err)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+
 		nilfs_transaction_abort(inode->i_sb);
+	}
 	return err;
 }
 
@@ -237,6 +242,19 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	/* Needs synchronization with the cleaner */
 	size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, nilfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && size < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
 	return size;
 }
 
@@ -280,16 +298,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
 	atomic_inc(&sbi->s_inodes_count);
-
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
@@ -346,7 +355,6 @@ void nilfs_free_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 
-	clear_inode(inode);
 	/* XXX: check error code? Is there any thing I can do? */
 	(void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
 	atomic_dec(&sbi->s_inodes_count);
@@ -451,7 +459,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 		inode->i_op = &nilfs_special_inode_operations;
 		init_special_inode(
 			inode, inode->i_mode,
-			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
 	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
 	brelse(bh);
@@ -511,7 +519,7 @@ void nilfs_write_inode_common(struct inode *inode,
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_device_code =
-			cpu_to_le64(new_encode_dev(inode->i_rdev));
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/* When extending inode, nilfs->ns_inode_size should be checked
 	   for substitutions of appended fields */
 }
@@ -606,16 +614,34 @@ void nilfs_truncate(struct inode *inode)
 	   But truncate has no return value. */
 }
 
-void nilfs_delete_inode(struct inode *inode)
+static void nilfs_clear_inode(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	/*
+	 * Free resources allocated in nilfs_read_inode(), here.
+	 */
+	BUG_ON(!list_empty(&ii->i_dirty));
+	brelse(ii->i_bh);
+	ii->i_bh = NULL;
+
+	if (test_bit(NILFS_I_BMAP, &ii->i_state))
+		nilfs_bmap_clear(ii->i_bmap);
+
+	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+}
+
+void nilfs_evict_inode(struct inode *inode)
 {
 	struct nilfs_transaction_info ti;
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 
-	if (unlikely(is_bad_inode(inode))) {
+	if (inode->i_nlink || unlikely(is_bad_inode(inode))) {
 		if (inode->i_data.nrpages)
 			truncate_inode_pages(&inode->i_data, 0);
-		clear_inode(inode);
+		end_writeback(inode);
+		nilfs_clear_inode(inode);
 		return;
 	}
 	nilfs_transaction_begin(sb, &ti, 0); /* never fails */
@@ -625,6 +651,8 @@ void nilfs_delete_inode(struct inode *inode)
 
 	nilfs_truncate_bmap(ii, 0);
 	nilfs_mark_inode_dirty(inode);
+	end_writeback(inode);
+	nilfs_clear_inode(inode);
 	nilfs_free_inode(inode);
 	/* nilfs_free_inode() marks inode buffer dirty */
 	if (IS_SYNC(inode))
@@ -648,14 +676,27 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	err = nilfs_transaction_begin(sb, &ti, 0);
 	if (unlikely(err))
 		return err;
-	err = inode_setattr(inode, iattr);
-	if (!err && (iattr->ia_valid & ATTR_MODE))
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		err = vmtruncate(inode, iattr->ia_size);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	if (iattr->ia_valid & ATTR_MODE) {
 		err = nilfs_acl_chmod(inode);
-	if (likely(!err))
-		err = nilfs_transaction_commit(sb);
-	else
-		nilfs_transaction_abort(sb);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	return nilfs_transaction_commit(sb);
 
+out_err:
+	nilfs_transaction_abort(sb);
 	return err;
 }
 
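The inode.c hunks track the 2.6.36 VFS change that folded ->delete_inode and ->clear_inode into a single ->evict_inode, with end_writeback() marking the point after which the VFS considers writeback on the inode finished. A stubbed user-space model of the ordering the new nilfs_evict_inode preserves; every function here is a stand-in, not a kernel call:

#include <stdio.h>

struct inode { int i_nlink; int is_bad; };

static void truncate_pages(struct inode *i)	{ puts("truncate page cache"); }
static void end_writeback_(struct inode *i)	{ puts("end writeback (was clear_inode)"); }
static void clear_fs_state(struct inode *i)	{ puts("free bmap + btnode cache"); }
static void free_disk_inode(struct inode *i)	{ puts("release ifile entry"); }

static void evict_inode_model(struct inode *inode)
{
	if (inode->i_nlink || inode->is_bad) {
		/* still-linked or bad inodes: tear down in-memory state only */
		truncate_pages(inode);
		end_writeback_(inode);
		clear_fs_state(inode);
		return;
	}
	/* unlinked inodes additionally give back their on-disk resources */
	truncate_pages(inode);
	end_writeback_(inode);
	clear_fs_state(inode);
	free_disk_inode(inode);
}

int main(void)
{
	struct inode unlinked = { 0, 0 };
	evict_inode_model(&unlinked);
	return 0;
}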
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 024be8c35bb6..d01aff4957d9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -28,6 +28,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..d3d54046e5f8 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -32,7 +32,6 @@
 #include "the_nilfs.h"
 #include "sb.h"
 #include "bmap.h"
-#include "bmap_union.h"
 
 /*
  * nilfs inode data in memory
@@ -41,7 +40,7 @@ struct nilfs_inode_info {
 	__u32 i_flags;
 	unsigned long i_state;		/* Dynamic state flags */
 	struct nilfs_bmap *i_bmap;
-	union nilfs_bmap_union i_bmap_union;
+	struct nilfs_bmap i_bmap_data;
 	__u64 i_xattr;	/* sector_t ??? */
 	__u32 i_dir_start_lookup;
 	__u64 i_cno;		/* check point number for GC inode */
@@ -71,9 +70,7 @@ static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
 static inline struct nilfs_inode_info *
 NILFS_BMAP_I(const struct nilfs_bmap *bmap)
 {
-	return container_of((union nilfs_bmap_union *)bmap,
-			    struct nilfs_inode_info,
-			    i_bmap_union);
+	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
 }
 
 static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
@@ -107,6 +104,14 @@ enum {
 };
 
 /*
+ * commit flags for nilfs_commit_super and nilfs_sync_super
+ */
+enum {
+	NILFS_SB_COMMIT = 0,	/* Commit a super block alternately */
+	NILFS_SB_COMMIT_ALL	/* Commit both super blocks */
+};
+
+/*
  * Macros to check inode numbers
  */
 #define NILFS_MDT_INO_BITS \
@@ -228,7 +233,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
 			   struct page *, struct inode *);
 
 /* file.c */
-extern int nilfs_sync_file(struct file *, struct dentry *, int);
+extern int nilfs_sync_file(struct file *, int);
 
 /* ioctl.c */
 long nilfs_ioctl(struct file *, unsigned int, unsigned long);
@@ -245,7 +250,7 @@ extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
 extern struct inode *nilfs_iget(struct super_block *, unsigned long);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
-extern void nilfs_delete_inode(struct inode *);
+extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
 extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
 				  struct buffer_head **);
@@ -270,7 +275,14 @@ extern struct nilfs_super_block *
 nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
 extern int nilfs_store_magic_and_option(struct super_block *,
 					struct nilfs_super_block *, char *);
+extern int nilfs_check_feature_compatibility(struct super_block *,
+					     struct nilfs_super_block *);
+extern void nilfs_set_log_cursor(struct nilfs_super_block *,
+				 struct the_nilfs *);
+extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
+						      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
+extern int nilfs_cleanup_super(struct nilfs_sb_info *);
 extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
 extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
 
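The simplified NILFS_BMAP_I() above is the piece that makes the whole nilfs_btree/nilfs_direct removal work: because the bmap is now embedded directly in nilfs_inode_info, container_of() recovers the owning inode info from any bmap pointer without casts through a union. A self-contained model of the idiom, with the struct fields trimmed to the essentials:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct nilfs_bmap { int b_state; };

struct nilfs_inode_info {
	unsigned long i_flags;
	struct nilfs_bmap i_bmap_data;	/* embedded, as in the hunk above */
};

static struct nilfs_inode_info *BMAP_I(struct nilfs_bmap *bmap)
{
	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
}

int main(void)
{
	struct nilfs_inode_info ii = { .i_flags = 0xabc };
	printf("flags via bmap: %#lx\n", BMAP_I(&ii.i_bmap_data)->i_flags);
	return 0;
}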
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 8de3e1e48130..aab11db2cb08 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,7 +37,8 @@
 
 #define NILFS_BUFFER_INHERENT_BITS \
 	((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
-	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
+	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \
+	 (1UL << BH_NILFS_Checked))
 
 static struct buffer_head *
 __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -129,6 +130,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
 
 	lock_buffer(bh);
 	clear_buffer_nilfs_volatile(bh);
+	clear_buffer_nilfs_checked(bh);
 	clear_buffer_dirty(bh);
 	if (nilfs_page_buffers_clean(page))
 		__nilfs_clear_page_dirty(page);
@@ -480,6 +482,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 			lock_buffer(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
+			clear_buffer_nilfs_checked(bh);
 			clear_buffer_uptodate(bh);
 			clear_buffer_mapped(bh);
 			unlock_buffer(bh);
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 8abca4d1c1f8..f53d8da41ed7 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -34,11 +34,13 @@ enum {
 	BH_NILFS_Allocated = BH_PrivateStart,
 	BH_NILFS_Node,
 	BH_NILFS_Volatile,
+	BH_NILFS_Checked,
 };
 
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
+BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
 
 
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
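For reference, BUFFER_FNS(NILFS_Checked, nilfs_checked) expands to the usual buffer-state accessor trio, roughly as sketched below (simplified from include/linux/buffer_head.h; the real set/clear helpers use atomic bitops on bh->b_state):

static inline void set_buffer_nilfs_checked(struct buffer_head *bh)
{
	set_bit(BH_NILFS_Checked, &bh->b_state);
}

static inline void clear_buffer_nilfs_checked(struct buffer_head *bh)
{
	clear_bit(BH_NILFS_Checked, &bh->b_state);
}

static inline int buffer_nilfs_checked(const struct buffer_head *bh)
{
	return test_bit(BH_NILFS_Checked, &bh->b_state);
}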
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ba43146f3c30..d0c35ef39f6a 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -91,25 +91,9 @@ static int nilfs_warn_segment_error(int err)
 	return -EINVAL;
 }
 
-static void store_segsum_info(struct nilfs_segsum_info *ssi,
-			      struct nilfs_segment_summary *sum,
-			      unsigned int blocksize)
-{
-	ssi->flags = le16_to_cpu(sum->ss_flags);
-	ssi->seg_seq = le64_to_cpu(sum->ss_seq);
-	ssi->ctime = le64_to_cpu(sum->ss_create);
-	ssi->next = le64_to_cpu(sum->ss_next);
-	ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
-	ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
-	ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
-
-	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
-	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
-}
-
 /**
- * calc_crc_cont - check CRC of blocks continuously
- * @sbi: nilfs_sb_info
+ * nilfs_compute_checksum - compute checksum of blocks continuously
+ * @nilfs: nilfs object
  * @bhs: buffer head of start block
  * @sum: place to store result
  * @offset: offset bytes in the first block
@@ -117,23 +101,25 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
  * @start: DBN of start block
  * @nblock: number of blocks to be checked
  */
-static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
-			 u32 *sum, unsigned long offset, u64 check_bytes,
-			 sector_t start, unsigned long nblock)
+static int nilfs_compute_checksum(struct the_nilfs *nilfs,
+				  struct buffer_head *bhs, u32 *sum,
+				  unsigned long offset, u64 check_bytes,
+				  sector_t start, unsigned long nblock)
 {
-	unsigned long blocksize = sbi->s_super->s_blocksize;
+	unsigned int blocksize = nilfs->ns_blocksize;
 	unsigned long size;
 	u32 crc;
 
 	BUG_ON(offset >= blocksize);
 	check_bytes -= offset;
 	size = min_t(u64, check_bytes, blocksize - offset);
-	crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
+	crc = crc32_le(nilfs->ns_crc_seed,
 		       (unsigned char *)bhs->b_data + offset, size);
 	if (--nblock > 0) {
 		do {
-			struct buffer_head *bh
-				= sb_bread(sbi->s_super, ++start);
+			struct buffer_head *bh;
+
+			bh = __bread(nilfs->ns_bdev, ++start, blocksize);
 			if (!bh)
 				return -EIO;
 			check_bytes -= size;
@@ -148,12 +134,12 @@ static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
 
 /**
  * nilfs_read_super_root_block - read super root block
- * @sb: super_block
+ * @nilfs: nilfs object
  * @sr_block: disk block number of the super root block
 * @pbh: address of a buffer_head pointer to return super root buffer
 * @check: CRC check flag
 */
-int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
+int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
 				struct buffer_head **pbh, int check)
 {
 	struct buffer_head *bh_sr;
@@ -162,7 +148,7 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 	int ret;
 
 	*pbh = NULL;
-	bh_sr = sb_bread(sb, sr_block);
+	bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize);
 	if (unlikely(!bh_sr)) {
 		ret = NILFS_SEG_FAIL_IO;
 		goto failed;
@@ -172,12 +158,13 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 	if (check) {
 		unsigned bytes = le16_to_cpu(sr->sr_bytes);
 
-		if (bytes == 0 || bytes > sb->s_blocksize) {
+		if (bytes == 0 || bytes > nilfs->ns_blocksize) {
 			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
 			goto failed_bh;
 		}
-		if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
-				  sizeof(sr->sr_sum), bytes, sr_block, 1)) {
+		if (nilfs_compute_checksum(
+			    nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes,
+			    sr_block, 1)) {
 			ret = NILFS_SEG_FAIL_IO;
 			goto failed_bh;
 		}
@@ -197,64 +184,76 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 }
 
 /**
- * load_segment_summary - read segment summary of the specified partial segment
- * @sbi: nilfs_sb_info
- * @pseg_start: start disk block number of partial segment
- * @seg_seq: sequence number requested
- * @ssi: pointer to nilfs_segsum_info struct to store information
+ * nilfs_read_log_header - read summary header of the specified log
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: pointer to return segment summary structure
 */
-static int
-load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
-		     u64 seg_seq, struct nilfs_segsum_info *ssi)
+static struct buffer_head *
+nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr,
+		      struct nilfs_segment_summary **sum)
 {
 	struct buffer_head *bh_sum;
-	struct nilfs_segment_summary *sum;
+
+	bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
+	if (bh_sum)
+		*sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+	return bh_sum;
+}
+
+/**
+ * nilfs_validate_log - verify consistency of log
+ * @nilfs: nilfs object
+ * @seg_seq: sequence number of segment
+ * @bh_sum: buffer head of summary block
+ * @sum: segment summary struct
+ */
+static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq,
+			      struct buffer_head *bh_sum,
+			      struct nilfs_segment_summary *sum)
+{
 	unsigned long nblock;
 	u32 crc;
-	int ret = NILFS_SEG_FAIL_IO;
+	int ret;
 
-	bh_sum = sb_bread(sbi->s_super, pseg_start);
-	if (!bh_sum)
+	ret = NILFS_SEG_FAIL_MAGIC;
+	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC)
 		goto out;
 
-	sum = (struct nilfs_segment_summary *)bh_sum->b_data;
-
-	/* Check consistency of segment summary */
-	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
-		ret = NILFS_SEG_FAIL_MAGIC;
-		goto failed;
-	}
-	store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
-	if (seg_seq != ssi->seg_seq) {
-		ret = NILFS_SEG_FAIL_SEQ;
-		goto failed;
-	}
+	ret = NILFS_SEG_FAIL_SEQ;
+	if (le64_to_cpu(sum->ss_seq) != seg_seq)
		goto out;
 
-	nblock = ssi->nblocks;
-	if (unlikely(nblock == 0 ||
-		     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
+	nblock = le32_to_cpu(sum->ss_nblocks);
+	ret = NILFS_SEG_FAIL_CONSISTENCY;
+	if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment))
 		/* This limits the number of blocks read in the CRC check */
-		ret = NILFS_SEG_FAIL_CONSISTENCY;
-		goto failed;
-	}
-	if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
-			  ((u64)nblock << sbi->s_super->s_blocksize_bits),
-			  pseg_start, nblock)) {
-		ret = NILFS_SEG_FAIL_IO;
-		goto failed;
-	}
-	if (crc == le32_to_cpu(sum->ss_datasum))
-		ret = 0;
-	else
-		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
- failed:
-	brelse(bh_sum);
+		goto out;
+
+	ret = NILFS_SEG_FAIL_IO;
+	if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum),
+				   ((u64)nblock << nilfs->ns_blocksize_bits),
+				   bh_sum->b_blocknr, nblock))
+		goto out;
+
+	ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
+	if (crc != le32_to_cpu(sum->ss_datasum))
+		goto out;
+	ret = 0;
 out:
 	return ret;
 }
 
-static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
-			unsigned int *offset, unsigned int bytes)
+/**
+ * nilfs_read_summary_info - read an item on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be read
+ */
+static void *nilfs_read_summary_info(struct the_nilfs *nilfs,
+				     struct buffer_head **pbh,
+				     unsigned int *offset, unsigned int bytes)
 {
 	void *ptr;
 	sector_t blocknr;
@@ -263,7 +262,8 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
 	if (bytes > (*pbh)->b_size - *offset) {
 		blocknr = (*pbh)->b_blocknr;
 		brelse(*pbh);
-		*pbh = sb_bread(sb, blocknr + 1);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + 1,
+			       nilfs->ns_blocksize);
 		if (unlikely(!*pbh))
 			return NULL;
 		*offset = 0;
@@ -273,9 +273,18 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
 	return ptr;
 }
 
-static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
-			unsigned int *offset, unsigned int bytes,
-			unsigned long count)
+/**
+ * nilfs_skip_summary_info - skip items on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be skipped
+ * @count: number of items to be skipped
+ */
+static void nilfs_skip_summary_info(struct the_nilfs *nilfs,
+				    struct buffer_head **pbh,
+				    unsigned int *offset, unsigned int bytes,
+				    unsigned long count)
 {
 	unsigned int rest_item_in_current_block
 		= ((*pbh)->b_size - *offset) / bytes;
@@ -292,36 +301,46 @@ static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
 		*offset = bytes * (count - (bcnt - 1) * nitem_per_block);
 
 		brelse(*pbh);
-		*pbh = sb_bread(sb, blocknr + bcnt);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + bcnt,
+			       nilfs->ns_blocksize);
 	}
 }
 
-static int
-collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
-			   struct nilfs_segsum_info *ssi,
-			   struct list_head *head)
+/**
+ * nilfs_scan_dsync_log - get block information of a log written for data sync
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: log summary information
+ * @head: list head to add nilfs_recovery_block struct
+ */
+static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
+				struct nilfs_segment_summary *sum,
+				struct list_head *head)
 {
 	struct buffer_head *bh;
 	unsigned int offset;
-	unsigned long nfinfo = ssi->nfinfo;
-	sector_t blocknr = sum_blocknr + ssi->nsumblk;
+	u32 nfinfo, sumbytes;
+	sector_t blocknr;
 	ino_t ino;
 	int err = -EIO;
 
+	nfinfo = le32_to_cpu(sum->ss_nfinfo);
 	if (!nfinfo)
 		return 0;
 
-	bh = sb_bread(sbi->s_super, sum_blocknr);
+	sumbytes = le32_to_cpu(sum->ss_sumbytes);
+	blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize);
+	bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
 	if (unlikely(!bh))
 		goto out;
 
-	offset = le16_to_cpu(
-		((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
+	offset = le16_to_cpu(sum->ss_bytes);
 	for (;;) {
 		unsigned long nblocks, ndatablk, nnodeblk;
 		struct nilfs_finfo *finfo;
 
-		finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
+		finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
						sizeof(*finfo));
 		if (unlikely(!finfo))
 			goto out;
 
@@ -334,8 +353,8 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
 			struct nilfs_recovery_block *rb;
 			struct nilfs_binfo_v *binfo;
 
-			binfo = segsum_get(sbi->s_super, &bh, &offset,
-					   sizeof(*binfo));
+			binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+							sizeof(*binfo));
 			if (unlikely(!binfo))
 				goto out;
 
@@ -353,9 +372,9 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
 		}
 		if (--nfinfo == 0)
 			break;
-		blocknr += nnodeblk; /* always 0 for the data sync segments */
-		segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
-			    nnodeblk);
+		blocknr += nnodeblk; /* always 0 for data sync logs */
+		nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64),
+					nnodeblk);
 		if (unlikely(!bh))
 			goto out;
 	}
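The walk above depends on how a log's summary area is laid out: a run of nilfs_finfo headers, each followed by its per-block binfo entries, with nilfs_read_summary_info() hopping to the next summary block whenever the current one cannot hold a whole item. A simplified sketch of the loop, with error handling and the node-block skip elided (not verbatim from the patch):

	nfinfo = le32_to_cpu(sum->ss_nfinfo);
	while (nfinfo-- > 0) {
		struct nilfs_finfo *finfo;
		u32 nblk;

		finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
						sizeof(*finfo));
		nblk = le32_to_cpu(finfo->fi_ndatablk);
		while (nblk-- > 0) {
			struct nilfs_binfo_v *binfo;

			binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
							sizeof(*binfo));
			/* queue binfo on the recovery list for roll forward */
		}
	}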
@@ -465,14 +484,14 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	return err;
 }
 
-static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
+static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 				     struct nilfs_recovery_block *rb,
 				     struct page *page)
 {
 	struct buffer_head *bh_org;
 	void *kaddr;
 
-	bh_org = sb_bread(sbi->s_super, rb->blocknr);
+	bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
 	if (unlikely(!bh_org))
 		return -EIO;
 
@@ -483,13 +502,14 @@ static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
 	return 0;
 }
 
-static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
-				struct list_head *head,
-				unsigned long *nr_salvaged_blocks)
+static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
+				      struct nilfs_sb_info *sbi,
+				      struct list_head *head,
+				      unsigned long *nr_salvaged_blocks)
 {
 	struct inode *inode;
 	struct nilfs_recovery_block *rb, *n;
-	unsigned blocksize = sbi->s_super->s_blocksize;
+	unsigned blocksize = nilfs->ns_blocksize;
 	struct page *page;
 	loff_t pos;
 	int err = 0, err2 = 0;
@@ -503,13 +523,16 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 		}
 
 		pos = rb->blkoff << inode->i_blkbits;
-		page = NULL;
-		err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
-					0, &page, NULL, nilfs_get_block);
-		if (unlikely(err))
+		err = block_write_begin(inode->i_mapping, pos, blocksize,
+					0, &page, nilfs_get_block);
+		if (unlikely(err)) {
+			loff_t isize = inode->i_size;
+			if (pos + blocksize > isize)
+				vmtruncate(inode, isize);
 			goto failed_inode;
+		}
 
-		err = nilfs_recovery_copy_block(sbi, rb, page);
+		err = nilfs_recovery_copy_block(nilfs, rb, page);
 		if (unlikely(err))
 			goto failed_page;
 
@@ -549,18 +572,20 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 /**
 * nilfs_do_roll_forward - salvage logical segments newer than the latest
 * checkpoint
+ * @nilfs: nilfs object
 * @sbi: nilfs_sb_info
- * @nilfs: the_nilfs
 * @ri: pointer to a nilfs_recovery_info
 */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				 struct nilfs_sb_info *sbi,
 				 struct nilfs_recovery_info *ri)
 {
-	struct nilfs_segsum_info ssi;
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
 	sector_t pseg_start;
 	sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
 	unsigned long nsalvaged_blocks = 0;
+	unsigned int flags;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	int empty_seg = 0;
@@ -579,8 +604,14 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
 
 	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
+		brelse(bh_sum);
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum) {
+			err = -EIO;
+			goto failed;
+		}
 
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO) {
 				err = -EIO;
@@ -588,33 +619,38 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 			}
 			goto strayed;
 		}
-		if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
+
+		flags = le16_to_cpu(sum->ss_flags);
+		if (flags & NILFS_SS_SR)
 			goto confused;
 
 		/* Found a valid partial segment; do recovery actions */
-		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
 		empty_seg = 0;
-		nilfs->ns_ctime = ssi.ctime;
-		if (!(ssi.flags & NILFS_SS_GC))
-			nilfs->ns_nongc_ctime = ssi.ctime;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
+		if (!(flags & NILFS_SS_GC))
+			nilfs->ns_nongc_ctime = nilfs->ns_ctime;
 
 		switch (state) {
 		case RF_INIT_ST:
-			if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
+			if (!(flags & NILFS_SS_LOGBGN) ||
+			    !(flags & NILFS_SS_SYNDT))
 				goto try_next_pseg;
 			state = RF_DSYNC_ST;
 			/* Fall through */
 		case RF_DSYNC_ST:
-			if (!NILFS_SEG_DSYNC(&ssi))
+			if (!(flags & NILFS_SS_SYNDT))
 				goto confused;
 
-			err = collect_blocks_from_segsum(
-				sbi, pseg_start, &ssi, &dsync_blocks);
+			err = nilfs_scan_dsync_log(nilfs, pseg_start, sum,
+						   &dsync_blocks);
 			if (unlikely(err))
 				goto failed;
-			if (NILFS_SEG_LOGEND(&ssi)) {
-				err = recover_dsync_blocks(
-					sbi, &dsync_blocks, &nsalvaged_blocks);
+			if (flags & NILFS_SS_LOGEND) {
+				err = nilfs_recover_dsync_blocks(
+					nilfs, sbi, &dsync_blocks,
+					&nsalvaged_blocks);
 				if (unlikely(err))
 					goto failed;
 				state = RF_INIT_ST;
@@ -625,7 +661,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 try_next_pseg:
 		if (pseg_start == ri->ri_lsegs_end)
 			break;
-		pseg_start += ssi.nblocks;
+		pseg_start += le32_to_cpu(sum->ss_nblocks);
 		if (pseg_start < seg_end)
 			continue;
 		goto feed_segment;
@@ -650,8 +686,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 		ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
 	}
 out:
+	brelse(bh_sum);
 	dispose_recovery_list(&dsync_blocks);
-	nilfs_detach_writer(sbi->s_nilfs, sbi);
+	nilfs_detach_writer(nilfs, sbi);
 	return err;
 
 confused:
@@ -665,7 +702,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 }
 
 static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
-				      struct nilfs_sb_info *sbi,
 				      struct nilfs_recovery_info *ri)
 {
 	struct buffer_head *bh;
@@ -675,7 +711,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 	    nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
 		return;
 
-	bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
+	bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize);
 	BUG_ON(!bh);
 	memset(bh->b_data, 0, bh->b_size);
 	set_buffer_dirty(bh);
@@ -688,9 +724,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 }
 
 /**
- * nilfs_recover_logical_segments - salvage logical segments written after
- * the latest super root
- * @nilfs: the_nilfs
+ * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
+ * @nilfs: nilfs object
 * @sbi: nilfs_sb_info
 * @ri: pointer to a nilfs_recovery_info struct to store search results.
 *
@@ -707,9 +742,9 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 *
 * %-ENOMEM - Insufficient memory available.
 */
-int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
-				   struct nilfs_sb_info *sbi,
-				   struct nilfs_recovery_info *ri)
+int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
+			      struct nilfs_sb_info *sbi,
+			      struct nilfs_recovery_info *ri)
 {
 	int err;
 
@@ -749,7 +784,7 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 			goto failed;
 		}
 
-		nilfs_finish_roll_forward(nilfs, sbi, ri);
+		nilfs_finish_roll_forward(nilfs, ri);
 	}
 
  failed:
@@ -760,7 +795,6 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 /**
 * nilfs_search_super_root - search the latest valid super root
 * @nilfs: the_nilfs
- * @sbi: nilfs_sb_info
 * @ri: pointer to a nilfs_recovery_info struct to store search results.
 *
 * nilfs_search_super_root() looks for the latest super-root from a partial
@@ -773,14 +807,19 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 * %-EINVAL - No valid segment found
 *
 * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
 */
-int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
-			    struct nilfs_recovery_info *ri)
+int nilfs_search_super_root(struct the_nilfs *nilfs,
+			    struct nilfs_recovery_info *ri)
 {
-	struct nilfs_segsum_info ssi;
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
 	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
 	sector_t seg_start, seg_end;  /* range of full segment (block number) */
 	sector_t b, end;
+	unsigned long nblocks;
+	unsigned int flags;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
@@ -799,17 +838,24 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 	/* Read ahead segment */
 	b = seg_start;
 	while (b <= seg_end)
-		sb_breadahead(sbi->s_super, b++);
+		__breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize);
 
 	for (;;) {
-		/* Load segment summary */
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+		brelse(bh_sum);
+		ret = NILFS_SEG_FAIL_IO;
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum)
+			goto failed;
+
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO)
 				goto failed;
 			goto strayed;
 		}
-		pseg_end = pseg_start + ssi.nblocks - 1;
+
+		nblocks = le32_to_cpu(sum->ss_nblocks);
+		pseg_end = pseg_start + nblocks - 1;
 		if (unlikely(pseg_end > seg_end)) {
 			ret = NILFS_SEG_FAIL_CONSISTENCY;
 			goto strayed;
@@ -819,11 +865,13 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 		ri->ri_pseg_start = pseg_start;
 		ri->ri_seq = seg_seq;
 		ri->ri_segnum = segnum;
-		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
 		ri->ri_nextnum = nextnum;
 		empty_seg = 0;
 
-		if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+		flags = le16_to_cpu(sum->ss_flags);
+		if (!(flags & NILFS_SS_SR) && !scan_newer) {
 			/* This will never happen because a superblock
 			   (last_segment) always points to a pseg
 			   having a super root. */
@@ -834,14 +882,15 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 		if (pseg_start == seg_start) {
 			nilfs_get_segment_range(nilfs, nextnum, &b, &end);
 			while (b <= end)
-				sb_breadahead(sbi->s_super, b++);
+				__breadahead(nilfs->ns_bdev, b++,
+					     nilfs->ns_blocksize);
 		}
-		if (!NILFS_SEG_HAS_SR(&ssi)) {
-			if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
+		if (!(flags & NILFS_SS_SR)) {
+			if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) {
 				ri->ri_lsegs_start = pseg_start;
 				ri->ri_lsegs_start_seq = seg_seq;
 			}
-			if (NILFS_SEG_LOGEND(&ssi))
+			if (flags & NILFS_SS_LOGEND)
 				ri->ri_lsegs_end = pseg_start;
 			goto try_next_pseg;
 		}
@@ -852,12 +901,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 			ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
 
 		nilfs_dispose_segment_list(&segments);
-		nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
-			+ ssi.nblocks - seg_start;
+		sr_pseg_start = pseg_start;
+		nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start;
 		nilfs->ns_seg_seq = seg_seq;
 		nilfs->ns_segnum = segnum;
 		nilfs->ns_cno = cno;  /* nilfs->ns_cno = ri->ri_cno + 1 */
-		nilfs->ns_ctime = ssi.ctime;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
 		nilfs->ns_nextnum = nextnum;
 
 		if (scan_newer)
@@ -868,15 +917,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 			scan_newer = 1;
 	}
 
-		/* reset region for roll-forward */
-		pseg_start += ssi.nblocks;
-		if (pseg_start < seg_end)
-			continue;
-		goto feed_segment;
-
 try_next_pseg:
 		/* Standing on a course, or met an inconsistent state */
-		pseg_start += ssi.nblocks;
+		pseg_start += nblocks;
 		if (pseg_start < seg_end)
 			continue;
 		goto feed_segment;
@@ -907,6 +950,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 
 super_root_found:
 	/* Updating pointers relating to the latest checkpoint */
+	brelse(bh_sum);
 	list_splice_tail(&segments, &ri->ri_used_segments);
 	nilfs->ns_last_pseg = sr_pseg_start;
 	nilfs->ns_last_seq = nilfs->ns_seg_seq;
@@ -914,6 +958,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 	return 0;
 
 failed:
+	brelse(bh_sum);
 	nilfs_dispose_segment_list(&segments);
 	return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
 }
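Taken together, these recovery.c hunks split the old load_segment_summary() into a read step and a pure validation step, and the summary buffer now stays pinned across loop iterations instead of being re-read for every check. The resulting calling pattern, condensed from the hunks above (error paths abbreviated):

	bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
	if (!bh_sum)
		return -EIO;	/* summary block unreadable */

	ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
	if (ret) {
		brelse(bh_sum);
		/* positive NILFS_SEG_FAIL_* codes are mapped to errno
		 * values by nilfs_warn_segment_error() */
		return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
	}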
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 17851f77f739..4588fb9e93df 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -40,35 +40,10 @@ struct nilfs_write_info {
 	sector_t		blocknr;
 };
 
-
 static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			      struct the_nilfs *nilfs);
 static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
-
-static struct kmem_cache *nilfs_segbuf_cachep;
-
-static void nilfs_segbuf_init_once(void *obj)
-{
-	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
-}
-
-int __init nilfs_init_segbuf_cache(void)
-{
-	nilfs_segbuf_cachep =
-		kmem_cache_create("nilfs2_segbuf_cache",
-				  sizeof(struct nilfs_segment_buffer),
-				  0, SLAB_RECLAIM_ACCOUNT,
-				  nilfs_segbuf_init_once);
-
-	return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
-}
-
-void nilfs_destroy_segbuf_cache(void)
-{
-	kmem_cache_destroy(nilfs_segbuf_cachep);
-}
-
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 {
 	struct nilfs_segment_buffer *segbuf;
@@ -81,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 
 	init_completion(&segbuf->sb_bio_event);
 	atomic_set(&segbuf->sb_err, 0);
@@ -158,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 }
 
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
-		       time_t ctime)
+		       time_t ctime, __u64 cno)
 {
 	int err;
 
@@ -171,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
+	segbuf->sb_sum.cno = cno;
 	return 0;
 }
 
@@ -196,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 	raw_sum->ss_nfinfo   = cpu_to_le32(segbuf->sb_sum.nfinfo);
 	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
 	raw_sum->ss_pad      = 0;
+	raw_sum->ss_cno      = cpu_to_le64(segbuf->sb_sum.cno);
 }
 
 /*
 * CRC calculation routines
 */
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
-				     u32 seed)
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -229,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_sumsum = cpu_to_le32(crc);
 }
 
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
-				   u32 seed)
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+					  u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -256,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
 
+static void
+nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
+				    u32 seed)
+{
+	struct nilfs_super_root *raw_sr;
+	u32 crc;
+
+	raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
+		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
+	raw_sr->sr_sum = cpu_to_le32(crc);
+}
+
 static void nilfs_release_buffers(struct list_head *list)
 {
 	struct buffer_head *bh, *n;
@@ -282,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
 {
 	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
 	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 }
 
 /*
@@ -334,6 +327,23 @@ int nilfs_wait_on_logs(struct list_head *logs)
 	return ret;
 }
 
+/**
+ * nilfs_add_checksums_on_logs - add checksums on the logs
+ * @logs: list of segment buffers storing target logs
+ * @seed: checksum seed value
+ */
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		if (segbuf->sb_super_root)
+			nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
+	}
+}
+
 /*
 * BIO operations
 */
@@ -498,7 +508,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			 * Last BIO is always sent through the following
 			 * submission.
 			 */
-			rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+			rw |= REQ_SYNC | REQ_UNPLUG;
 			res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 		}
 
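The checksum helpers all rely on the chaining property of crc32_le(): the running CRC of one chunk is passed back as the seed for the next, so a range spanning several buffer heads checksums to the same value as one contiguous buffer. A minimal sketch of the idiom (crc_of_two_chunks() is an illustrative name, not part of the patch):

	#include <linux/crc32.h>

	/* CRC over two discontiguous chunks, chained the way consecutive
	 * blocks are folded into one running checksum. */
	static u32 crc_of_two_chunks(u32 seed,
				     const unsigned char *a, size_t alen,
				     const unsigned char *b, size_t blen)
	{
		u32 crc = crc32_le(seed, a, alen);	/* first chunk */
		return crc32_le(crc, b, blen);		/* continues the sum */
	}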
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd3517bc0..b04f08cc2397 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
 * @sumbytes: Byte count of segment summary
 * @nfileblk: Total number of file blocks
 * @seg_seq: Segment sequence number
+ * @cno: Checkpoint number
 * @ctime: Creation time
 * @next: Block number of the next full segment
 */
@@ -48,21 +49,11 @@ struct nilfs_segsum_info {
 	unsigned long		sumbytes;
 	unsigned long		nfileblk;
 	u64			seg_seq;
+	__u64			cno;
 	time_t			ctime;
 	sector_t		next;
 };
 
-/* macro for the flags */
-#define NILFS_SEG_HAS_SR(sum)    ((sum)->flags & NILFS_SS_SR)
-#define NILFS_SEG_LOGBGN(sum)    ((sum)->flags & NILFS_SS_LOGBGN)
-#define NILFS_SEG_LOGEND(sum)    ((sum)->flags & NILFS_SS_LOGEND)
-#define NILFS_SEG_DSYNC(sum)     ((sum)->flags & NILFS_SS_SYNDT)
-#define NILFS_SEG_SIMPLEX(sum)						\
-	(((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==	\
-	 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
-
-#define NILFS_SEG_EMPTY(sum)	((sum)->nblocks == (sum)->nsumblk)
-
 /**
 * struct nilfs_segment_buffer - Segment buffer
 * @sb_super: back pointer to a superblock struct
@@ -76,6 +67,7 @@ struct nilfs_segsum_info {
 * @sb_rest_blocks: Number of residual blocks in the current segment
 * @sb_segsum_buffers: List of buffers for segment summaries
 * @sb_payload_buffers: List of buffers for segment payload
+ * @sb_super_root: Pointer to buffer storing a super root block (if exists)
 * @sb_nbio: Number of flying bio requests
 * @sb_err: I/O error status
 * @sb_bio_event: Completion event of log writing
@@ -95,6 +87,7 @@ struct nilfs_segment_buffer {
 	/* Buffers */
 	struct list_head	sb_segsum_buffers;
 	struct list_head	sb_payload_buffers; /* including super root */
+	struct buffer_head     *sb_super_root;
 
 	/* io status */
 	int			sb_nbio;
@@ -121,9 +114,8 @@ struct nilfs_segment_buffer {
 			    b_assoc_buffers))
 #define NILFS_SEGBUF_BH_IS_LAST(bh, head)  ((bh)->b_assoc_buffers.next == head)
 
+extern struct kmem_cache *nilfs_segbuf_cachep;
 
-int __init nilfs_init_segbuf_cache(void);
-void nilfs_destroy_segbuf_cache(void);
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
 void nilfs_segbuf_free(struct nilfs_segment_buffer *);
 void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
@@ -132,13 +124,24 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
 			  struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
 int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
 int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
 				struct buffer_head **);
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
+
+static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf)
+{
+	unsigned int flags = segbuf->sb_sum.flags;
+
+	return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==
+		(NILFS_SS_LOGBGN | NILFS_SS_LOGEND);
+}
+
+static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf)
+{
+	return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk;
+}
 
 static inline void
 nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
 			 struct nilfs_segment_buffer *last);
 int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
 int nilfs_wait_on_logs(struct list_head *logs);
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
 
 static inline void nilfs_destroy_logs(struct list_head *logs)
 {
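The removed NILFS_SEG_* macros operated on a bare nilfs_segsum_info; their replacements are typed inline functions on the segment buffer, which the segment.c hunks below switch over to. The call-site change, in brief (handle_open_log() is a placeholder, not a real function):

	/* before: flag macro on the embedded summary info */
	if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum))
		handle_open_log(segbuf);

	/* after: typed inline helper on the segment buffer itself */
	if (!nilfs_segbuf_simplex(segbuf))
		handle_open_log(segbuf);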
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6a7dbd8451db..9fd051a33c4f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -116,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
 #define nilfs_cnt32_lt(a, b)  nilfs_cnt32_gt(b, a)
 #define nilfs_cnt32_le(a, b)  nilfs_cnt32_ge(b, a)
 
-/*
- * Transaction
- */
-static struct kmem_cache *nilfs_transaction_cachep;
-
-/**
- * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
- *
- * nilfs_init_transaction_cache() creates a slab cache for the struct
- * nilfs_transaction_info.
- *
- * Return Value: On success, it returns 0. On error, one of the following
- * negative error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- */
-int nilfs_init_transaction_cache(void)
-{
-	nilfs_transaction_cachep =
-		kmem_cache_create("nilfs2_transaction_cache",
-				  sizeof(struct nilfs_transaction_info),
-				  0, SLAB_RECLAIM_ACCOUNT, NULL);
-	return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
-}
-
-/**
- * nilfs_destroy_transaction_cache - destroy the cache for transaction info
- *
- * nilfs_destroy_transaction_cache() frees the slab cache for the struct
- * nilfs_transaction_info.
- */
-void nilfs_destroy_transaction_cache(void)
-{
-	kmem_cache_destroy(nilfs_transaction_cachep);
-}
-
 static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
 {
 	struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -402,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 
 	if (nilfs_doing_gc())
 		flags = NILFS_SS_GC;
-	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
+				 sci->sc_sbi->s_nilfs->ns_cno);
 	if (unlikely(err))
 		return err;
 
@@ -435,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
 			return err;
 		segbuf = sci->sc_curseg;
 	}
-	err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
+	err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
 	if (likely(!err))
 		segbuf->sb_sum.flags |= NILFS_SS_SR;
 	return err;
@@ -599,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
 	*vblocknr = binfo->bi_v.bi_vblocknr;
 }
 
-struct nilfs_sc_operations nilfs_sc_file_ops = {
+static struct nilfs_sc_operations nilfs_sc_file_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_file_bmap,
@@ -649,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
 	*binfo_dat = binfo->bi_dat;
 }
 
-struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.collect_data = nilfs_collect_dat_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_dat_bmap,
@@ -657,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.write_node_binfo = nilfs_write_dat_node_binfo,
 };
 
-struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = NULL,
 	.collect_bmap = NULL,
@@ -932,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
 	}
 }
 
-/*
- * CRC calculation routines
- */
-static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
-{
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
-	u32 crc;
-
-	crc = crc32_le(seed,
-		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
-		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
-	raw_sr->sr_sum = cpu_to_le32(crc);
-}
-
-static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
-					    u32 seed)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	if (sci->sc_super_root)
-		nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
-
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
-		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
-	}
-}
-
 static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
-	struct buffer_head *bh_sr = sci->sc_super_root;
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *raw_sr;
 	unsigned isz = nilfs->ns_inode_size;
 
+	bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+
 	raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
 	raw_sr->sr_nongc_ctime
 		= cpu_to_le64(nilfs_doing_gc() ?
@@ -1491,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 	/* Collection retry loop */
 	for (;;) {
-		sci->sc_super_root = NULL;
 		sci->sc_nblk_this_inc = 0;
 		sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
 
@@ -1568,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
 	ssp.offset = sizeof(struct nilfs_segment_summary);
 
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		if (bh == sci->sc_super_root)
+		if (bh == segbuf->sb_super_root)
 			break;
 		if (!finfo) {
 			finfo = nilfs_segctor_map_segsum_entry(
@@ -1729,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					lock_page(bd_page);
 					clear_page_dirty_for_io(bd_page);
@@ -1848,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
 }
 
 static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
-			     struct buffer_head *bh_sr, int err)
+			     int err)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
@@ -1869,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == bh_sr) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
@@ -1898,7 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 
 	list_splice_tail_init(&sci->sc_write_logs, &logs);
 	ret = nilfs_wait_on_logs(&logs);
-	nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
+	nilfs_abort_logs(&logs, NULL, ret ? : err);
 
 	list_splice_tail_init(&sci->sc_segbufs, &logs);
 	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 	}
 
 	nilfs_destroy_logs(&logs);
-	sci->sc_super_root = NULL;
 }
 
 static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
 	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	int update_sr = (sci->sc_super_root != NULL);
+	int update_sr = false;
 
 	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
 		struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
 				}
+				update_sr = true;
 				break;
 			}
 			if (bh->b_page != fs_page) {
@@ -1977,12 +1914,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			}
 		}
 
-		if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
-			if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
+		if (!nilfs_segbuf_simplex(segbuf)) {
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) {
 				set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
 				sci->sc_lseg_stime = jiffies;
 			}
-			if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGEND)
 				clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
 		}
 	}
@@ -2014,7 +1951,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	if (update_sr) {
 		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
 				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
-		set_nilfs_sb_dirty(nilfs);
 
 		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2115,7 +2051,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct page *failed_page;
-	int err, has_sr = 0;
+	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
 
@@ -2143,11 +2079,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (unlikely(err))
 			goto failed;
 
-		has_sr = (sci->sc_super_root != NULL);
-
 		/* Avoid empty segment */
 		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
-		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
+		    nilfs_segbuf_empty(sci->sc_curseg)) {
 			nilfs_segctor_abort_construction(sci, nilfs, 1);
 			goto out;
 		}
@@ -2159,7 +2093,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
 			nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
 
-		if (has_sr) {
+		if (mode == SC_LSEG_SR &&
+		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
 			err = nilfs_segctor_fill_in_checkpoint(sci);
 			if (unlikely(err))
 				goto failed_to_write;
@@ -2171,11 +2106,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		/* Write partial segments */
 		err = nilfs_segctor_prepare_write(sci, &failed_page);
 		if (err) {
-			nilfs_abort_logs(&sci->sc_segbufs, failed_page,
-					 sci->sc_super_root, err);
+			nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
 			goto failed_to_write;
 		}
-		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
+
+		nilfs_add_checksums_on_logs(&sci->sc_segbufs,
+					    nilfs->ns_crc_seed);
 
 		err = nilfs_segctor_write(sci, nilfs);
 		if (unlikely(err))
@@ -2196,8 +2132,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		}
 	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
 
-	sci->sc_super_root = NULL;
-
 out:
 	nilfs_segctor_check_out_files(sci, sbi);
 	return err;
@@ -2224,9 +2158,9 @@ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
 static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
 {
 	spin_lock(&sci->sc_state_lock);
-	if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
-		sci->sc_timer->expires = jiffies + sci->sc_interval;
-		add_timer(sci->sc_timer);
+	if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
+		sci->sc_timer.expires = jiffies + sci->sc_interval;
+		add_timer(&sci->sc_timer);
 		sci->sc_state |= NILFS_SEGCTOR_COMMIT;
 	}
 	spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2365,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
 	spin_lock(&sci->sc_state_lock);
 	sci->sc_seq_accepted = sci->sc_seq_request;
 	spin_unlock(&sci->sc_state_lock);
-
-	if (sci->sc_timer)
-		del_timer_sync(sci->sc_timer);
+	del_timer_sync(&sci->sc_timer);
 }
 
 /**
@@ -2459,9 +2391,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
 		sci->sc_flush_request &= ~FLUSH_DAT_BIT;
 
 		/* re-enable timer if checkpoint creation was not done */
-		if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
-		    time_before(jiffies, sci->sc_timer->expires))
-			add_timer(sci->sc_timer);
+		if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+		    time_before(jiffies, sci->sc_timer.expires))
+			add_timer(&sci->sc_timer);
 	}
 	spin_unlock(&sci->sc_state_lock);
 }
@@ -2475,6 +2407,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
 	int err = 0;
 
 	nilfs_segctor_accept(sci);
@@ -2490,8 +2423,13 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2490 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2423 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2491 nilfs_discontinued(nilfs)) { 2424 nilfs_discontinued(nilfs)) {
2492 down_write(&nilfs->ns_sem); 2425 down_write(&nilfs->ns_sem);
2493 err = nilfs_commit_super( 2426 err = -EIO;
2494 sbi, nilfs_altsb_need_update(nilfs)); 2427 sbp = nilfs_prepare_super(sbi,
2428 nilfs_sb_will_flip(nilfs));
2429 if (likely(sbp)) {
2430 nilfs_set_log_cursor(sbp[0], nilfs);
2431 err = nilfs_commit_super(sbi, NILFS_SB_COMMIT);
2432 }
2495 up_write(&nilfs->ns_sem); 2433 up_write(&nilfs->ns_sem);
2496 } 2434 }
2497 } 2435 }
@@ -2640,13 +2578,10 @@ static int nilfs_segctor_thread(void *arg)
2640{ 2578{
2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2579 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2580 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2643 struct timer_list timer;
2644 int timeout = 0; 2581 int timeout = 0;
2645 2582
2646 init_timer(&timer); 2583 sci->sc_timer.data = (unsigned long)current;
2647 timer.data = (unsigned long)current; 2584 sci->sc_timer.function = nilfs_construction_timeout;
2648 timer.function = nilfs_construction_timeout;
2649 sci->sc_timer = &timer;
2650 2585
2651 /* start sync. */ 2586 /* start sync. */
2652 sci->sc_task = current; 2587 sci->sc_task = current;
@@ -2695,7 +2630,7 @@ static int nilfs_segctor_thread(void *arg)
2695 should_sleep = 0; 2630 should_sleep = 0;
2696 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) 2631 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2697 should_sleep = time_before(jiffies, 2632 should_sleep = time_before(jiffies,
2698 sci->sc_timer->expires); 2633 sci->sc_timer.expires);
2699 2634
2700 if (should_sleep) { 2635 if (should_sleep) {
2701 spin_unlock(&sci->sc_state_lock); 2636 spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2639,7 @@ static int nilfs_segctor_thread(void *arg)
2704 } 2639 }
2705 finish_wait(&sci->sc_wait_daemon, &wait); 2640 finish_wait(&sci->sc_wait_daemon, &wait);
2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2641 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2707 time_after_eq(jiffies, sci->sc_timer->expires)); 2642 time_after_eq(jiffies, sci->sc_timer.expires));
2708 2643
2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) 2644 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2710 set_nilfs_discontinued(nilfs); 2645 set_nilfs_discontinued(nilfs);
@@ -2713,8 +2648,6 @@ static int nilfs_segctor_thread(void *arg)
2713 2648
2714 end_thread: 2649 end_thread:
2715 spin_unlock(&sci->sc_state_lock); 2650 spin_unlock(&sci->sc_state_lock);
2716 del_timer_sync(sci->sc_timer);
2717 sci->sc_timer = NULL;
2718 2651
2719 /* end sync. */ 2652 /* end sync. */
2720 sci->sc_task = NULL; 2653 sci->sc_task = NULL;
@@ -2750,13 +2683,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2750 } 2683 }
2751} 2684}
2752 2685
2753static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2754{
2755 sci->sc_seq_done = sci->sc_seq_request;
2756
2757 return nilfs_segctor_start_thread(sci);
2758}
2759
2760/* 2686/*
2761 * Setup & clean-up functions 2687 * Setup & clean-up functions
2762 */ 2688 */
@@ -2780,6 +2706,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2780 INIT_LIST_HEAD(&sci->sc_write_logs); 2706 INIT_LIST_HEAD(&sci->sc_write_logs);
2781 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2707 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2782 INIT_LIST_HEAD(&sci->sc_copied_buffers); 2708 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2709 init_timer(&sci->sc_timer);
2783 2710
2784 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2711 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2785 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2712 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2773,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2846 2773
2847 down_write(&sbi->s_nilfs->ns_segctor_sem); 2774 down_write(&sbi->s_nilfs->ns_segctor_sem);
2848 2775
2776 del_timer_sync(&sci->sc_timer);
2849 kfree(sci); 2777 kfree(sci);
2850} 2778}
2851 2779
@@ -2880,7 +2808,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2880 return -ENOMEM; 2808 return -ENOMEM;
2881 2809
2882 nilfs_attach_writer(nilfs, sbi); 2810 nilfs_attach_writer(nilfs, sbi);
2883 err = nilfs_segctor_init(NILFS_SC(sbi)); 2811 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2884 if (err) { 2812 if (err) {
2885 nilfs_detach_writer(nilfs, sbi); 2813 nilfs_detach_writer(nilfs, sbi);
2886 kfree(sbi->s_sc_info); 2814 kfree(sbi->s_sc_info);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 82dfd6a686b9..17c487bd8152 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
  * @sc_write_logs: List of segment buffers to hold logs under writing
  * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
  * @sc_curseg: Current segment buffer
- * @sc_super_root: Pointer to the super root buffer
  * @sc_stage: Collection stage
  * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
  * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
 	struct list_head	sc_write_logs;
 	unsigned long		sc_segbuf_nblocks;
 	struct nilfs_segment_buffer *sc_curseg;
-	struct buffer_head     *sc_super_root;
 
 	struct nilfs_cstage	sc_stage;
 
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
 	unsigned long		sc_lseg_stime;	/* in 1/HZ seconds */
 	unsigned long		sc_watermark;
 
-	struct timer_list      *sc_timer;
+	struct timer_list	sc_timer;
 	struct task_struct     *sc_task;
 };
 
@@ -219,10 +217,10 @@ enum {
  */
 #define NILFS_SC_DEFAULT_WATERMARK  3600
 
+/* super.c */
+extern struct kmem_cache *nilfs_transaction_cachep;
 
 /* segment.c */
-extern int nilfs_init_transaction_cache(void);
-extern void nilfs_destroy_transaction_cache(void);
 extern void nilfs_relax_pressure_in_lock(struct super_block *);
 
 extern int nilfs_construct_segment(struct super_block *);
@@ -236,13 +234,13 @@ extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
 
 /* recovery.c */
-extern int nilfs_read_super_root_block(struct super_block *, sector_t,
+extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
 				       struct buffer_head **, int);
-extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
+extern int nilfs_search_super_root(struct the_nilfs *,
 				   struct nilfs_recovery_info *);
-extern int nilfs_recover_logical_segments(struct the_nilfs *,
-					  struct nilfs_sb_info *,
-					  struct nilfs_recovery_info *);
+extern int nilfs_salvage_orphan_logs(struct the_nilfs *,
+				     struct nilfs_sb_info *,
+				     struct nilfs_recovery_info *);
 extern void nilfs_dispose_segment_list(struct list_head *);
 
 #endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 48145f505a6a..922263393c76 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -55,6 +55,8 @@
 #include "nilfs.h"
 #include "mdt.h"
 #include "alloc.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "cpfile.h"
 #include "ifile.h"
@@ -67,8 +69,32 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
+struct kmem_cache *nilfs_inode_cachep;
+struct kmem_cache *nilfs_transaction_cachep;
+struct kmem_cache *nilfs_segbuf_cachep;
+struct kmem_cache *nilfs_btree_path_cache;
+
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
+static void nilfs_set_error(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
+
+	down_write(&nilfs->ns_sem);
+	if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
+		nilfs->ns_mount_state |= NILFS_ERROR_FS;
+		sbp = nilfs_prepare_super(sbi, 0);
+		if (likely(sbp)) {
+			sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+			if (sbp[1])
+				sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+			nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
+		}
+	}
+	up_write(&nilfs->ns_sem);
+}
+
 /**
  * nilfs_error() - report failure condition on a filesystem
  *
@@ -94,16 +120,7 @@ void nilfs_error(struct super_block *sb, const char *function,
 	va_end(args);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		struct the_nilfs *nilfs = sbi->s_nilfs;
-
-		down_write(&nilfs->ns_sem);
-		if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
-			nilfs->ns_mount_state |= NILFS_ERROR_FS;
-			nilfs->ns_sbp[0]->s_state |=
-				cpu_to_le16(NILFS_ERROR_FS);
-			nilfs_commit_super(sbi, 1);
-		}
-		up_write(&nilfs->ns_sem);
+		nilfs_set_error(sbi);
 
 		if (nilfs_test_opt(sbi, ERRORS_RO)) {
 			printk(KERN_CRIT "Remounting filesystem read-only\n");
@@ -129,7 +146,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
 	va_end(args);
 }
 
-static struct kmem_cache *nilfs_inode_cachep;
 
 struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 {
@@ -155,83 +171,46 @@ void nilfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
-static void init_once(void *obj)
-{
-	struct nilfs_inode_info *ii = obj;
-
-	INIT_LIST_HEAD(&ii->i_dirty);
-#ifdef CONFIG_NILFS_XATTR
-	init_rwsem(&ii->xattr_sem);
-#endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
-	ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
-	inode_init_once(&ii->vfs_inode);
-}
-
-static int nilfs_init_inode_cache(void)
-{
-	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
-					       sizeof(struct nilfs_inode_info),
-					       0, SLAB_RECLAIM_ACCOUNT,
-					       init_once);
-
-	return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
-}
-
-static inline void nilfs_destroy_inode_cache(void)
-{
-	kmem_cache_destroy(nilfs_inode_cachep);
-}
-
-static void nilfs_clear_inode(struct inode *inode)
-{
-	struct nilfs_inode_info *ii = NILFS_I(inode);
-
-	/*
-	 * Free resources allocated in nilfs_read_inode(), here.
-	 */
-	BUG_ON(!list_empty(&ii->i_dirty));
-	brelse(ii->i_bh);
-	ii->i_bh = NULL;
-
-	if (test_bit(NILFS_I_BMAP, &ii->i_state))
-		nilfs_bmap_clear(ii->i_bmap);
-
-	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
-}
-
-static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
+static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
-	int barrier_done = 0;
 
-	if (nilfs_test_opt(sbi, BARRIER)) {
-		set_buffer_ordered(nilfs->ns_sbh[0]);
-		barrier_done = 1;
-	}
  retry:
 	set_buffer_dirty(nilfs->ns_sbh[0]);
-	err = sync_dirty_buffer(nilfs->ns_sbh[0]);
-	if (err == -EOPNOTSUPP && barrier_done) {
-		nilfs_warning(sbi->s_super, __func__,
-			      "barrier-based sync failed. "
-			      "disabling barriers\n");
-		nilfs_clear_opt(sbi, BARRIER);
-		barrier_done = 0;
-		clear_buffer_ordered(nilfs->ns_sbh[0]);
-		goto retry;
+
+	if (nilfs_test_opt(sbi, BARRIER)) {
+		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
+					  WRITE_SYNC | WRITE_BARRIER);
+		if (err == -EOPNOTSUPP) {
+			nilfs_warning(sbi->s_super, __func__,
+				      "barrier-based sync failed. "
+				      "disabling barriers\n");
+			nilfs_clear_opt(sbi, BARRIER);
+			goto retry;
+		}
+	} else {
+		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
 	}
+
 	if (unlikely(err)) {
 		printk(KERN_ERR
 		       "NILFS: unable to write superblock (err=%d)\n", err);
 		if (err == -EIO && nilfs->ns_sbh[1]) {
+			/*
+			 * sbp[0] points to newer log than sbp[1],
+			 * so copy sbp[0] to sbp[1] to take over sbp[0].
+			 */
+			memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0],
+			       nilfs->ns_sbsize);
 			nilfs_fall_back_super_block(nilfs);
 			goto retry;
 		}
 	} else {
 		struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
 
+		nilfs->ns_sbwcount++;
+
 		/*
 		 * The latest segment becomes trailable from the position
 		 * written in superblock.
@@ -240,66 +219,122 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
 
 		/* update GC protection for recent segments */
 		if (nilfs->ns_sbh[1]) {
-			sbp = NULL;
-			if (dupsb) {
+			if (flag == NILFS_SB_COMMIT_ALL) {
 				set_buffer_dirty(nilfs->ns_sbh[1]);
-				if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
-					sbp = nilfs->ns_sbp[1];
+				if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0)
+					goto out;
 			}
+			if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) <
+			    le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno))
+				sbp = nilfs->ns_sbp[1];
 		}
-		if (sbp) {
-			spin_lock(&nilfs->ns_last_segment_lock);
-			nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
-			spin_unlock(&nilfs->ns_last_segment_lock);
-		}
-	}
-
+
+		spin_lock(&nilfs->ns_last_segment_lock);
+		nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
+		spin_unlock(&nilfs->ns_last_segment_lock);
+	}
+ out:
 	return err;
 }
 
-int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
+void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
+			  struct the_nilfs *nilfs)
+{
+	sector_t nfreeblocks;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	nilfs_count_free_blocks(nilfs, &nfreeblocks);
+	sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
+	sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
+	sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
+	spin_unlock(&nilfs->ns_last_segment_lock);
+}
+
+struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
+					       int flip)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
-	sector_t nfreeblocks;
-	time_t t;
-	int err;
 
-	/* nilfs->sem must be locked by the caller. */
-	if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
-		if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
-			nilfs_swap_super_block(nilfs);
-		else {
+	/* nilfs->ns_sem must be locked by the caller. */
+	if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		if (sbp[1] &&
+		    sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
+			memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+		} else {
 			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
 			       sbi->s_super->s_id);
-			return -EIO;
+			return NULL;
 		}
+	} else if (sbp[1] &&
+		   sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
 	}
-	err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
-	if (unlikely(err)) {
-		printk(KERN_ERR "NILFS: failed to count free blocks\n");
-		return err;
-	}
-	spin_lock(&nilfs->ns_last_segment_lock);
-	sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
-	sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
-	sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
-	spin_unlock(&nilfs->ns_last_segment_lock);
 
+	if (flip && sbp[1])
+		nilfs_swap_super_block(nilfs);
+
+	return sbp;
+}
+
+int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	time_t t;
+
+	/* nilfs->ns_sem must be locked by the caller. */
 	t = get_seconds();
-	nilfs->ns_sbwtime[0] = t;
-	sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+	nilfs->ns_sbwtime = t;
 	sbp[0]->s_wtime = cpu_to_le64(t);
 	sbp[0]->s_sum = 0;
 	sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
 					     (unsigned char *)sbp[0],
 					     nilfs->ns_sbsize));
-	if (dupsb && sbp[1]) {
-		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
-		nilfs->ns_sbwtime[1] = t;
+	if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) {
+		sbp[1]->s_wtime = sbp[0]->s_wtime;
+		sbp[1]->s_sum = 0;
+		sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
+					    (unsigned char *)sbp[1],
+					    nilfs->ns_sbsize));
 	}
 	clear_nilfs_sb_dirty(nilfs);
-	return nilfs_sync_super(sbi, dupsb);
+	return nilfs_sync_super(sbi, flag);
+}
+
+/**
+ * nilfs_cleanup_super() - write filesystem state for cleanup
+ * @sbi: nilfs_sb_info to be unmounted or degraded to read-only
+ *
+ * This function restores state flags in the on-disk super block.
+ * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
+ * filesystem was not clean previously.
+ */
+int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
+{
+	struct nilfs_super_block **sbp;
+	int flag = NILFS_SB_COMMIT;
+	int ret = -EIO;
+
+	sbp = nilfs_prepare_super(sbi, 0);
+	if (sbp) {
+		sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state);
+		nilfs_set_log_cursor(sbp[0], sbi->s_nilfs);
+		if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
+			/*
+			 * make the "clean" flag also to the opposite
+			 * super block if both super blocks point to
+			 * the same checkpoint.
+			 */
+			sbp[1]->s_state = sbp[0]->s_state;
+			flag = NILFS_SB_COMMIT_ALL;
+		}
+		ret = nilfs_commit_super(sbi, flag);
+	}
+	return ret;
 }
 
 static void nilfs_put_super(struct super_block *sb)
@@ -313,8 +348,7 @@ static void nilfs_put_super(struct super_block *sb)
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_write(&nilfs->ns_sem);
-		nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
-		nilfs_commit_super(sbi, 1);
+		nilfs_cleanup_super(sbi);
 		up_write(&nilfs->ns_sem);
 	}
 	down_write(&nilfs->ns_super_sem);
@@ -335,6 +369,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
 	int err = 0;
 
 	/* This function is called when super block should be written back */
@@ -342,8 +377,13 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 		err = nilfs_construct_segment(sb);
 
 	down_write(&nilfs->ns_sem);
-	if (nilfs_sb_dirty(nilfs))
-		nilfs_commit_super(sbi, 1);
+	if (nilfs_sb_dirty(nilfs)) {
+		sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs));
+		if (likely(sbp)) {
+			nilfs_set_log_cursor(sbp[0], nilfs);
+			nilfs_commit_super(sbi, NILFS_SB_COMMIT);
+		}
+	}
 	up_write(&nilfs->ns_sem);
 
 	return err;
@@ -360,9 +400,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	up_write(&nilfs->ns_super_sem);
 
+	err = -ENOMEM;
 	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
 	if (!sbi->s_ifile)
-		return -ENOMEM;
+		goto delist;
 
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -393,6 +434,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
 
+ delist:
 	down_write(&nilfs->ns_super_sem);
 	list_del_init(&sbi->s_list);
 	up_write(&nilfs->ns_super_sem);
@@ -466,20 +508,20 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 
 	if (!nilfs_test_opt(sbi, BARRIER))
-		seq_printf(seq, ",nobarrier");
+		seq_puts(seq, ",nobarrier");
 	if (nilfs_test_opt(sbi, SNAPSHOT))
 		seq_printf(seq, ",cp=%llu",
 			   (unsigned long long int)sbi->s_snapshot_cno);
-	if (nilfs_test_opt(sbi, ERRORS_RO))
-		seq_printf(seq, ",errors=remount-ro");
 	if (nilfs_test_opt(sbi, ERRORS_PANIC))
-		seq_printf(seq, ",errors=panic");
+		seq_puts(seq, ",errors=panic");
+	if (nilfs_test_opt(sbi, ERRORS_CONT))
+		seq_puts(seq, ",errors=continue");
 	if (nilfs_test_opt(sbi, STRICT_ORDER))
-		seq_printf(seq, ",order=strict");
+		seq_puts(seq, ",order=strict");
 	if (nilfs_test_opt(sbi, NORECOVERY))
-		seq_printf(seq, ",norecovery");
+		seq_puts(seq, ",norecovery");
 	if (nilfs_test_opt(sbi, DISCARD))
-		seq_printf(seq, ",discard");
+		seq_puts(seq, ",discard");
 
 	return 0;
 }
@@ -491,7 +533,7 @@ static const struct super_operations nilfs_sops = {
 	/* .write_inode    = nilfs_write_inode, */
 	/* .put_inode      = nilfs_put_inode, */
 	/* .drop_inode	   = nilfs_drop_inode, */
-	.delete_inode   = nilfs_delete_inode,
+	.evict_inode    = nilfs_evict_inode,
 	.put_super      = nilfs_put_super,
 	/* .write_super    = nilfs_write_super, */
 	.sync_fs        = nilfs_sync_fs,
@@ -499,7 +541,6 @@ static const struct super_operations nilfs_sops = {
 	/* .unlockfs */
 	.statfs         = nilfs_statfs,
 	.remount_fs     = nilfs_remount,
-	.clear_inode    = nilfs_clear_inode,
 	/* .umount_begin */
 	.show_options = nilfs_show_options
 };
@@ -548,23 +589,25 @@ static const struct export_operations nilfs_export_ops = {
 
 enum {
 	Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
-	Opt_discard, Opt_err,
+	Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
+	Opt_discard, Opt_nodiscard, Opt_err,
};
 
 static match_table_t tokens = {
 	{Opt_err_cont, "errors=continue"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_barrier, "barrier"},
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_snapshot, "cp=%u"},
 	{Opt_order, "order=%s"},
 	{Opt_norecovery, "norecovery"},
 	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL}
 };
 
-static int parse_options(char *options, struct super_block *sb)
+static int parse_options(char *options, struct super_block *sb, int is_remount)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	char *p;
@@ -581,6 +624,9 @@ static int parse_options(char *options, struct super_block *sb)
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		case Opt_barrier:
+			nilfs_set_opt(sbi, BARRIER);
+			break;
 		case Opt_nobarrier:
 			nilfs_clear_opt(sbi, BARRIER);
 			break;
@@ -606,8 +652,26 @@ static int parse_options(char *options, struct super_block *sb)
 		case Opt_snapshot:
 			if (match_int(&args[0], &option) || option <= 0)
 				return 0;
-			if (!(sb->s_flags & MS_RDONLY))
+			if (is_remount) {
+				if (!nilfs_test_opt(sbi, SNAPSHOT)) {
+					printk(KERN_ERR
+					       "NILFS: cannot change regular "
+					       "mount to snapshot.\n");
+					return 0;
+				} else if (option != sbi->s_snapshot_cno) {
+					printk(KERN_ERR
+					       "NILFS: cannot remount to a "
+					       "different snapshot.\n");
+					return 0;
+				}
+				break;
+			}
+			if (!(sb->s_flags & MS_RDONLY)) {
+				printk(KERN_ERR "NILFS: cannot mount snapshot "
+				       "read/write. A read-only option is "
+				       "required.\n");
 				return 0;
+			}
 			sbi->s_snapshot_cno = option;
 			nilfs_set_opt(sbi, SNAPSHOT);
 			break;
@@ -617,6 +681,9 @@ static int parse_options(char *options, struct super_block *sb)
 		case Opt_discard:
 			nilfs_set_opt(sbi, DISCARD);
 			break;
+		case Opt_nodiscard:
+			nilfs_clear_opt(sbi, DISCARD);
+			break;
 		default:
 			printk(KERN_ERR
 			       "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -631,17 +698,24 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
 			  struct nilfs_super_block *sbp)
 {
 	sbi->s_mount_opt =
-		NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
+		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
 static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
-	struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
-	int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
-	int mnt_count = le16_to_cpu(sbp->s_mnt_count);
+	struct nilfs_super_block **sbp;
+	int max_mnt_count;
+	int mnt_count;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	sbp = nilfs_prepare_super(sbi, 0);
+	if (!sbp)
+		return -EIO;
+
+	max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
+	mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
 
-	/* nilfs->sem must be locked by the caller. */
 	if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
 		printk(KERN_WARNING
 		       "NILFS warning: mounting fs with errors\n");
@@ -652,12 +726,15 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 #endif
 	}
 	if (!max_mnt_count)
-		sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
+		sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
 
-	sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
-	sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
-	sbp->s_mtime = cpu_to_le64(get_seconds());
-	return nilfs_commit_super(sbi, 1);
+	sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
+	sbp[0]->s_state =
+		cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
+	sbp[0]->s_mtime = cpu_to_le64(get_seconds());
+	/* synchronize sbp[1] with sbp[0] */
+	memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+	return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
 }
 
 struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -694,7 +771,31 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 	sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
 	sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
 
-	return !parse_options(data, sb) ? -EINVAL : 0 ;
+	return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
+}
+
+int nilfs_check_feature_compatibility(struct super_block *sb,
+				      struct nilfs_super_block *sbp)
+{
+	__u64 features;
+
+	features = le64_to_cpu(sbp->s_feature_incompat) &
+		~NILFS_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "optional features (%llx)\n",
+		       (unsigned long long)features);
+		return -EINVAL;
+	}
+	features = le64_to_cpu(sbp->s_feature_compat_ro) &
+		~NILFS_FEATURE_COMPAT_RO_SUPP;
+	if (!(sb->s_flags & MS_RDONLY) && features) {
+		printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
+		       "unsupported optional features (%llx)\n",
+		       (unsigned long long)features);
+		return -EINVAL;
+	}
+	return 0;
 }
 
 /**
@@ -778,9 +879,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 				goto failed_sbi;
 			}
 			cno = sbi->s_snapshot_cno;
-		} else
-			/* Read-only mount */
-			sbi->s_snapshot_cno = cno;
+		}
 	}
 
 	err = nilfs_attach_checkpoint(sbi, cno);
@@ -845,11 +944,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct nilfs_super_block *sbp;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	unsigned long old_sb_flags;
 	struct nilfs_mount_options old_opts;
-	int err;
+	int was_snapshot, err;
 
 	lock_kernel();
 
@@ -857,19 +955,18 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
+	was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
 
-	if (!parse_options(data, sb)) {
+	if (!parse_options(data, sb, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
 
-	if ((*flags & MS_RDONLY) &&
-	    sbi->s_snapshot_cno != old_opts.snapshot_cno) {
-		printk(KERN_WARNING "NILFS (device %s): couldn't "
-		       "remount to a different snapshot.\n",
-		       sb->s_id);
-		err = -EINVAL;
+	err = -EINVAL;
+	if (was_snapshot && !(*flags & MS_RDONLY)) {
+		printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
+		       "read/write.\n", sb->s_id);
 		goto restore_opts;
 	}
 
@@ -877,7 +974,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		printk(KERN_WARNING "NILFS (device %s): couldn't "
 		       "remount because the filesystem is in an "
 		       "incomplete recovery state.\n", sb->s_id);
-		err = -EINVAL;
 		goto restore_opts;
 	}
 
@@ -888,45 +984,35 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		nilfs_detach_segment_constructor(sbi);
 		sb->s_flags |= MS_RDONLY;
 
-		sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
-		/* nilfs_set_opt(sbi, SNAPSHOT); */
-
 		/*
 		 * Remounting a valid RW partition RDONLY, so set
 		 * the RDONLY flag and then mark the partition as valid again.
 		 */
 		down_write(&nilfs->ns_sem);
-		sbp = nilfs->ns_sbp[0];
-		if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
-		    (nilfs->ns_mount_state & NILFS_VALID_FS))
-			sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
-		sbp->s_mtime = cpu_to_le64(get_seconds());
-		nilfs_commit_super(sbi, 1);
+		nilfs_cleanup_super(sbi);
 		up_write(&nilfs->ns_sem);
 	} else {
+		__u64 features;
+
 		/*
 		 * Mounting a RDONLY partition read-write, so reread and
 		 * store the current valid flag.  (It may have been changed
 		 * by fsck since we originally mounted the partition.)
 		 */
-		if (nilfs->ns_current && nilfs->ns_current != sbi) {
-			printk(KERN_WARNING "NILFS (device %s): couldn't "
-			       "remount because an RW-mount exists.\n",
-			       sb->s_id);
-			err = -EBUSY;
-			goto restore_opts;
-		}
-		if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
+		down_read(&nilfs->ns_sem);
+		features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+			~NILFS_FEATURE_COMPAT_RO_SUPP;
+		up_read(&nilfs->ns_sem);
+		if (features) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
-			       "remount because the current RO-mount is not "
-			       "the latest one.\n",
-			       sb->s_id);
-			err = -EINVAL;
+			       "remount RDWR because of unsupported optional "
+			       "features (%llx)\n",
+			       sb->s_id, (unsigned long long)features);
+			err = -EROFS;
 			goto restore_opts;
 		}
+
 		sb->s_flags &= ~MS_RDONLY;
-		nilfs_clear_opt(sbi, SNAPSHOT);
-		sbi->s_snapshot_cno = 0;
 
 		err = nilfs_attach_segment_constructor(sbi);
 		if (err)
@@ -935,8 +1021,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		down_write(&nilfs->ns_sem);
 		nilfs_setup_super(sbi);
 		up_write(&nilfs->ns_sem);
-
-		nilfs->ns_current = sbi;
 	}
  out:
 	up_write(&nilfs->ns_super_sem);
@@ -1022,10 +1106,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 {
 	struct nilfs_super_data sd;
 	struct super_block *s;
+	fmode_t mode = FMODE_READ;
 	struct the_nilfs *nilfs;
 	int err, need_to_close = 1;
 
-	sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(sd.bdev))
 		return PTR_ERR(sd.bdev);
 
@@ -1092,10 +1180,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
 	/* New superblock instance created */
 	s->s_flags = flags;
+	s->s_mode = mode;
 	strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
 	sb_set_blocksize(s, block_size(sd.bdev));
 
-	err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
+	err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
+			       nilfs);
 	if (err)
 		goto cancel_new;
 
@@ -1106,7 +1196,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
 	if (need_to_close)
-		close_bdev_exclusive(sd.bdev, flags);
+		close_bdev_exclusive(sd.bdev, mode);
 	simple_set_mnt(mnt, s);
 	return 0;
 
@@ -1114,7 +1204,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
  failed:
-	close_bdev_exclusive(sd.bdev, flags);
+	close_bdev_exclusive(sd.bdev, mode);
 
 	return err;
 
@@ -1124,7 +1214,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	put_nilfs(nilfs);
 	deactivate_locked_super(s);
 	/*
-	 * deactivate_super() invokes close_bdev_exclusive().
+	 * deactivate_locked_super() invokes close_bdev_exclusive().
 	 * We must finish all post-cleaning before this call;
 	 * put_nilfs() needs the block device.
 	 */
@@ -1139,54 +1229,93 @@ struct file_system_type nilfs_fs_type = {
 	.fs_flags = FS_REQUIRES_DEV,
 };
 
-static int __init init_nilfs_fs(void)
+static void nilfs_inode_init_once(void *obj)
 {
-	int err;
-
-	err = nilfs_init_inode_cache();
-	if (err)
-		goto failed;
+	struct nilfs_inode_info *ii = obj;
 
-	err = nilfs_init_transaction_cache();
-	if (err)
-		goto failed_inode_cache;
+	INIT_LIST_HEAD(&ii->i_dirty);
+#ifdef CONFIG_NILFS_XATTR
+	init_rwsem(&ii->xattr_sem);
+#endif
+	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	ii->i_bmap = &ii->i_bmap_data;
+	inode_init_once(&ii->vfs_inode);
+}
 
-	err = nilfs_init_segbuf_cache();
-	if (err)
-		goto failed_transaction_cache;
+static void nilfs_segbuf_init_once(void *obj)
+{
+	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
+}
 
-	err = nilfs_btree_path_cache_init();
-	if (err)
-		goto failed_segbuf_cache;
+static void nilfs_destroy_cachep(void)
+{
+	if (nilfs_inode_cachep)
+		kmem_cache_destroy(nilfs_inode_cachep);
+	if (nilfs_transaction_cachep)
+		kmem_cache_destroy(nilfs_transaction_cachep);
+	if (nilfs_segbuf_cachep)
+		kmem_cache_destroy(nilfs_segbuf_cachep);
+	if (nilfs_btree_path_cache)
+		kmem_cache_destroy(nilfs_btree_path_cache);
+}
 
-	err = register_filesystem(&nilfs_fs_type);
-	if (err)
-		goto failed_btree_path_cache;
+static int __init nilfs_init_cachep(void)
+{
+	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
+			sizeof(struct nilfs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+	if (!nilfs_inode_cachep)
+		goto fail;
+
+	nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
+			sizeof(struct nilfs_transaction_info), 0,
+			SLAB_RECLAIM_ACCOUNT, NULL);
+	if (!nilfs_transaction_cachep)
+		goto fail;
+
+	nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
+			sizeof(struct nilfs_segment_buffer), 0,
+			SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
+	if (!nilfs_segbuf_cachep)
+		goto fail;
+
+	nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
+			sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
+			0, 0, NULL);
+	if (!nilfs_btree_path_cache)
+		goto fail;
 
 	return 0;
 
- failed_btree_path_cache:
-	nilfs_btree_path_cache_destroy();
+fail:
+	nilfs_destroy_cachep();
+	return -ENOMEM;
+}
 
- failed_segbuf_cache:
-	nilfs_destroy_segbuf_cache();
+static int __init init_nilfs_fs(void)
+{
+	int err;
 
- failed_transaction_cache:
-	nilfs_destroy_transaction_cache();
+	err = nilfs_init_cachep();
+	if (err)
+		goto fail;
 
- failed_inode_cache:
-	nilfs_destroy_inode_cache();
+	err = register_filesystem(&nilfs_fs_type);
+	if (err)
+		goto free_cachep;
 
- failed:
+	printk(KERN_INFO "NILFS version 2 loaded\n");
+	return 0;
+
+free_cachep:
+	nilfs_destroy_cachep();
+fail:
 	return err;
 }
 
 static void __exit exit_nilfs_fs(void)
 {
-	nilfs_destroy_segbuf_cache();
-	nilfs_destroy_transaction_cache();
-	nilfs_destroy_inode_cache();
-	nilfs_btree_path_cache_destroy();
+	nilfs_destroy_cachep();
 	unregister_filesystem(&nilfs_fs_type);
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7e4f01..ba7c10c917fc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -38,6 +38,8 @@
 static LIST_HEAD(nilfs_objects);
 static DEFINE_SPINLOCK(nilfs_lock);
 
+static int nilfs_valid_sb(struct nilfs_super_block *sbp);
+
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
 			    sector_t start_blocknr, u64 seq, __u64 cno)
 {
@@ -45,6 +47,16 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
 	nilfs->ns_last_pseg = start_blocknr;
 	nilfs->ns_last_seq = seq;
 	nilfs->ns_last_cno = cno;
+
+	if (!nilfs_sb_dirty(nilfs)) {
+		if (nilfs->ns_prev_seq == nilfs->ns_last_seq)
+			goto stay_cursor;
+
+		set_nilfs_sb_dirty(nilfs);
+	}
+	nilfs->ns_prev_seq = nilfs->ns_last_seq;
+
+ stay_cursor:
 	spin_unlock(&nilfs->ns_last_segment_lock);
 }
 
@@ -159,8 +171,7 @@ void put_nilfs(struct the_nilfs *nilfs)
 	kfree(nilfs);
 }
 
-static int nilfs_load_super_root(struct the_nilfs *nilfs,
-				 struct nilfs_sb_info *sbi, sector_t sr_block)
+static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
 {
 	struct buffer_head *bh_sr;
 	struct nilfs_super_root *raw_sr;
@@ -169,7 +180,7 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
 	unsigned inode_size;
 	int err;
 
-	err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
+	err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
 	if (unlikely(err))
 		return err;
 
@@ -248,6 +259,37 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
 }
 
 /**
+ * nilfs_store_log_cursor - load log cursor from a super block
+ * @nilfs: nilfs object
+ * @sbp: buffer storing super block to be read
+ *
+ * nilfs_store_log_cursor() reads the last position of the log
+ * containing a super root from a given super block, and initializes
+ * relevant information on the nilfs object preparatory for log
+ * scanning and recovery.
+ */
+static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
+				  struct nilfs_super_block *sbp)
+{
+	int ret = 0;
+
+	nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
+	nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
+	nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
+
+	nilfs->ns_prev_seq = nilfs->ns_last_seq;
+	nilfs->ns_seg_seq = nilfs->ns_last_seq;
+	nilfs->ns_segnum =
+		nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
+	nilfs->ns_cno = nilfs->ns_last_cno + 1;
+	if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
+		printk(KERN_ERR "NILFS invalid last segment number.\n");
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/**
  * load_nilfs - load and recover the nilfs
  * @nilfs: the_nilfs structure to be released
  * @sbi: nilfs_sb_info used to recover past segment
@@ -285,13 +327,55 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 
 	nilfs_init_recovery_info(&ri);
 
-	err = nilfs_search_super_root(nilfs, sbi, &ri);
+	err = nilfs_search_super_root(nilfs, &ri);
 	if (unlikely(err)) {
-		printk(KERN_ERR "NILFS: error searching super root.\n");
-		goto failed;
+		struct nilfs_super_block **sbp = nilfs->ns_sbp;
+		int blocksize;
+
+		if (err != -EINVAL)
+			goto scan_error;
+
+		if (!nilfs_valid_sb(sbp[1])) {
+			printk(KERN_WARNING
+			       "NILFS warning: unable to fall back to spare"
+			       "super block\n");
+			goto scan_error;
+		}
+		printk(KERN_INFO
+		       "NILFS: try rollback from an earlier position\n");
+
+		/*
+		 * restore super block with its spare and reconfigure
+		 * relevant states of the nilfs object.
+		 */
+		memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+		nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed);
+		nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
+
+		/* verify consistency between two super blocks */
+		blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
+		if (blocksize != nilfs->ns_blocksize) {
+			printk(KERN_WARNING
+			       "NILFS warning: blocksize differs between "
+			       "two super blocks (%d != %d)\n",
+			       blocksize, nilfs->ns_blocksize);
+			goto scan_error;
+		}
+
+		err = nilfs_store_log_cursor(nilfs, sbp[0]);
+		if (err)
+			goto scan_error;
+
+		/* drop clean flag to allow roll-forward and recovery */
+		nilfs->ns_mount_state &= ~NILFS_VALID_FS;
+		valid_fs = 0;
+
+		err = nilfs_search_super_root(nilfs, &ri);
+		if (err)
+			goto scan_error;
 	}
 
-	err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
+	err = nilfs_load_super_root(nilfs, ri.ri_super_root);
 	if (unlikely(err)) {
 		printk(KERN_ERR "NILFS: error loading super root.\n");
 		goto failed;
@@ -301,11 +385,23 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 		goto skip_recovery;
 
 	if (s_flags & MS_RDONLY) {
+		__u64 features;
+
 		if (nilfs_test_opt(sbi, NORECOVERY)) {
 			printk(KERN_INFO "NILFS: norecovery option specified. "
 			       "skipping roll-forward recovery\n");
 			goto skip_recovery;
 		}
+		features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+			~NILFS_FEATURE_COMPAT_RO_SUPP;
+		if (features) {
+			printk(KERN_ERR "NILFS: couldn't proceed with "
+			       "recovery because of unsupported optional "
+			       "features (%llx)\n",
+			       (unsigned long long)features);
+			err = -EROFS;
+			goto failed_unload;
+		}
 		if (really_read_only) {
 			printk(KERN_ERR "NILFS: write access "
			       "unavailable, cannot proceed.\n");
@@ -320,14 +416,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 		goto failed_unload;
 	}
 
-	err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+	err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri);
 	if (err)
 		goto failed_unload;
 
 	down_write(&nilfs->ns_sem);
-	nilfs->ns_mount_state |= NILFS_VALID_FS;
-	nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
-	err = nilfs_commit_super(sbi, 1);
+	nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
+	err = nilfs_cleanup_super(sbi);
 	up_write(&nilfs->ns_sem);
 
 	if (err) {
@@ -343,10 +438,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	sbi->s_super->s_flags = s_flags;
 	return 0;
 
+ scan_error:
+	printk(KERN_ERR "NILFS: error searching super root.\n");
+	goto failed;
+
  failed_unload:
 	nilfs_mdt_destroy(nilfs->ns_cpfile);
 	nilfs_mdt_destroy(nilfs->ns_sufile);
 	nilfs_mdt_destroy(nilfs->ns_dat);
+	nilfs_mdt_destroy(nilfs->ns_gc_dat);
 
  failed:
 	nilfs_clear_recovery_info(&ri);
@@ -486,11 +586,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 		printk(KERN_WARNING
 		       "NILFS warning: unable to read secondary superblock\n");
 
+	/*
+	 * Compare two super blocks and set 1 in swp if the secondary
+	 * super block is valid and newer.  Otherwise, set 0 in swp.
+	 */
 	valid[0] = nilfs_valid_sb(sbp[0]);
 	valid[1] = nilfs_valid_sb(sbp[1]);
-	swp = valid[1] &&
-		(!valid[0] ||
-		 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
+	swp = valid[1] && (!valid[0] ||
+			   le64_to_cpu(sbp[1]->s_last_cno) >
+			   le64_to_cpu(sbp[0]->s_last_cno));
 
 	if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
 		brelse(sbh[1]);
@@ -505,14 +609,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 		return -EINVAL;
 	}
 
-	if (swp) {
+	if (!valid[!swp])
 		printk(KERN_WARNING "NILFS warning: broken superblock. "
 		       "using spare superblock.\n");
+	if (swp)
 		nilfs_swap_super_block(nilfs);
-	}
 
-	nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
-	nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
+	nilfs->ns_sbwcount = 0;
+	nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
 	nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
 	*sbpp = sbp[0];
 	return 0;
@@ -553,6 +657,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	if (err)
 		goto out;
 
+	err = nilfs_check_feature_compatibility(sb, sbp);
+	if (err)
+		goto out;
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
 	if (sb->s_blocksize != blocksize &&
 	    !sb_set_blocksize(sb, blocksize)) {
@@ -564,7 +672,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 		goto out;
 	}
 
-	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
+	blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "NILFS: unable to set blocksize\n");
 		err = -EINVAL;
@@ -578,7 +686,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	if (err)
 		goto failed_sbh;
 
+	err = nilfs_check_feature_compatibility(sb, sbp);
+	if (err)
+		goto failed_sbh;
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
+	if (blocksize < NILFS_MIN_BLOCK_SIZE ||
+	    blocksize > NILFS_MAX_BLOCK_SIZE) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "filesystem blocksize %d\n", blocksize);
+		err = -EINVAL;
+		goto failed_sbh;
+	}
 	if (sb->s_blocksize != blocksize) {
 		int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
 
@@ -600,6 +719,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	   when reloading fails. */
 	}
 	nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
+	nilfs->ns_blocksize = blocksize;
 
 	err = nilfs_store_disk_layout(nilfs, sbp);
 	if (err)
@@ -612,23 +732,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
 	nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
 
615 /* Finding last segment */ 735 err = nilfs_store_log_cursor(nilfs, sbp);
616 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); 736 if (err)
617 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
618 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
619
620 nilfs->ns_seg_seq = nilfs->ns_last_seq;
621 nilfs->ns_segnum =
622 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
623 nilfs->ns_cno = nilfs->ns_last_cno + 1;
624 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
625 printk(KERN_ERR "NILFS invalid last segment number.\n");
626 err = -EINVAL;
627 goto failed_sbh; 737 goto failed_sbh;
628 }
629 /* Dummy values */
630 nilfs->ns_free_segments_count =
631 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
632 738
633 /* Initialize gcinode cache */ 739 /* Initialize gcinode cache */
634 err = nilfs_init_gccache(nilfs); 740 err = nilfs_init_gccache(nilfs);
@@ -670,7 +776,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
670 start * sects_per_block, 776 start * sects_per_block,
671 nblocks * sects_per_block, 777 nblocks * sects_per_block,
672 GFP_NOFS, 778 GFP_NOFS,
673 DISCARD_FL_BARRIER); 779 BLKDEV_IFL_WAIT |
780 BLKDEV_IFL_BARRIER);
674 if (ret < 0) 781 if (ret < 0)
675 return ret; 782 return ret;
676 nblocks = 0; 783 nblocks = 0;
@@ -680,7 +787,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
680 ret = blkdev_issue_discard(nilfs->ns_bdev, 787 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block, 788 start * sects_per_block,
682 nblocks * sects_per_block, 789 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER); 790 GFP_NOFS,
791 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
684 return ret; 792 return ret;
685} 793}
686 794
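
The two discard hunks above do the same conversion: a run of filesystem blocks is translated into 512-byte device sectors before blkdev_issue_discard() is called, and the 2.6.36 API takes BLKDEV_IFL_* flags where DISCARD_FL_BARRIER used to go. A minimal sketch of that call, assuming 512-byte sectors and using start_block/nblocks as hypothetical stand-in locals (ns_blocksize is the field this patch adds):

	/* Hedged sketch, not the patch itself: convert a block range into
	 * the sector range blkdev_issue_discard() expects. */
	sector_t sects_per_block = nilfs->ns_blocksize >> 9; /* assumes 512B sectors */
	int ret = blkdev_issue_discard(nilfs->ns_bdev,
				       start_block * sects_per_block,
				       nblocks * sects_per_block,
				       GFP_NOFS,
				       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);

With BLKDEV_IFL_WAIT the call does not return until the discard completes, which matches the synchronous error handling in the loop above.
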
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1ab974533697..f785a7b0ab99 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -57,7 +57,8 @@ enum {
57 * @ns_current: back pointer to current mount 57 * @ns_current: back pointer to current mount
58 * @ns_sbh: buffer heads of on-disk super blocks 58 * @ns_sbh: buffer heads of on-disk super blocks
59 * @ns_sbp: pointers to super block data 59 * @ns_sbp: pointers to super block data
60 * @ns_sbwtime: previous write time of super blocks 60 * @ns_sbwtime: previous write time of super block
61 * @ns_sbwcount: write count of super block
61 * @ns_sbsize: size of valid data in super block 62 * @ns_sbsize: size of valid data in super block
62 * @ns_supers: list of nilfs super block structs 63 * @ns_supers: list of nilfs super block structs
63 * @ns_seg_seq: segment sequence counter 64 * @ns_seg_seq: segment sequence counter
@@ -73,7 +74,7 @@ enum {
73 * @ns_last_seq: sequence value of the latest segment 74 * @ns_last_seq: sequence value of the latest segment
74 * @ns_last_cno: checkpoint number of the latest segment 75 * @ns_last_cno: checkpoint number of the latest segment
75 * @ns_prot_seq: least sequence number of segments which must not be reclaimed 76 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
76 * @ns_free_segments_count: counter of free segments 77 * @ns_prev_seq: base sequence number used to decide if advance log cursor
77 * @ns_segctor_sem: segment constructor semaphore 78 * @ns_segctor_sem: segment constructor semaphore
78 * @ns_dat: DAT file inode 79 * @ns_dat: DAT file inode
79 * @ns_cpfile: checkpoint file inode 80 * @ns_cpfile: checkpoint file inode
@@ -82,6 +83,7 @@ enum {
82 * @ns_gc_inodes: dummy inodes to keep live blocks 83 * @ns_gc_inodes: dummy inodes to keep live blocks
83 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks 84 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
84 * @ns_blocksize_bits: bit length of block size 85 * @ns_blocksize_bits: bit length of block size
86 * @ns_blocksize: block size
85 * @ns_nsegments: number of segments in filesystem 87 * @ns_nsegments: number of segments in filesystem
86 * @ns_blocks_per_segment: number of blocks per segment 88 * @ns_blocks_per_segment: number of blocks per segment
87 * @ns_r_segments_percentage: reserved segments percentage 89 * @ns_r_segments_percentage: reserved segments percentage
@@ -119,7 +121,8 @@ struct the_nilfs {
119 */ 121 */
120 struct buffer_head *ns_sbh[2]; 122 struct buffer_head *ns_sbh[2];
121 struct nilfs_super_block *ns_sbp[2]; 123 struct nilfs_super_block *ns_sbp[2];
122 time_t ns_sbwtime[2]; 124 time_t ns_sbwtime;
125 unsigned ns_sbwcount;
123 unsigned ns_sbsize; 126 unsigned ns_sbsize;
124 unsigned ns_mount_state; 127 unsigned ns_mount_state;
125 128
@@ -149,7 +152,7 @@ struct the_nilfs {
149 u64 ns_last_seq; 152 u64 ns_last_seq;
150 __u64 ns_last_cno; 153 __u64 ns_last_cno;
151 u64 ns_prot_seq; 154 u64 ns_prot_seq;
152 unsigned long ns_free_segments_count; 155 u64 ns_prev_seq;
153 156
154 struct rw_semaphore ns_segctor_sem; 157 struct rw_semaphore ns_segctor_sem;
155 158
@@ -168,6 +171,7 @@ struct the_nilfs {
168 171
169 /* Disk layout information (static) */ 172 /* Disk layout information (static) */
170 unsigned int ns_blocksize_bits; 173 unsigned int ns_blocksize_bits;
174 unsigned int ns_blocksize;
171 unsigned long ns_nsegments; 175 unsigned long ns_nsegments;
172 unsigned long ns_blocks_per_segment; 176 unsigned long ns_blocks_per_segment;
173 unsigned long ns_r_segments_percentage; 177 unsigned long ns_r_segments_percentage;
@@ -203,20 +207,17 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
203 207
204/* Minimum interval of periodical update of superblocks (in seconds) */ 208/* Minimum interval of periodical update of superblocks (in seconds) */
205#define NILFS_SB_FREQ 10 209#define NILFS_SB_FREQ 10
206#define NILFS_ALTSB_FREQ 60 /* spare superblock */
207 210
208static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) 211static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
209{ 212{
210 u64 t = get_seconds(); 213 u64 t = get_seconds();
211 return t < nilfs->ns_sbwtime[0] || 214 return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ;
212 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
213} 215}
214 216
215static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) 217static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
216{ 218{
217 u64 t = get_seconds(); 219 int flip_bits = nilfs->ns_sbwcount & 0x0FL;
218 struct nilfs_super_block **sbp = nilfs->ns_sbp; 220 return (flip_bits != 0x08 && flip_bits != 0x0F);
219 return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
220} 221}
221 222
222void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 223void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
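
nilfs_sb_will_flip() replaces the old time-based NILFS_ALTSB_FREQ policy with a counter-based one: the decision depends only on the low four bits of ns_sbwcount, and the flip is skipped exactly when those bits are 0x08 or 0x0F. A small standalone program (illustration only, not kernel code) makes the 16-write cycle visible:

	#include <stdio.h>

	/* Re-statement of nilfs_sb_will_flip() for illustration. */
	static int sb_will_flip(unsigned int sbwcount)
	{
		unsigned int flip_bits = sbwcount & 0x0F;
		return flip_bits != 0x08 && flip_bits != 0x0F;
	}

	int main(void)
	{
		unsigned int c;

		for (c = 0; c < 16; c++)	/* one full counter cycle */
			printf("write %2u: %s\n", c,
			       sb_will_flip(c) ? "flip" : "no flip");
		return 0;
	}

So 14 of every 16 periodic superblock writes alternate targets, keeping the spare copy fresh without tracking a second write time.
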
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index dffbb0911d02..b388443c3a09 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,3 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
6#source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 0922cc826c46..ae5f33a6d868 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,6 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o 1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
2 mark.o vfsmount_mark.o
2 3
3obj-y += dnotify/ 4obj-y += dnotify/
4obj-y += inotify/ 5obj-y += inotify/
6obj-y += fanotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7e54e52964dd..3344bdd5506e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -29,17 +29,17 @@
29int dir_notify_enable __read_mostly = 1; 29int dir_notify_enable __read_mostly = 1;
30 30
31static struct kmem_cache *dnotify_struct_cache __read_mostly; 31static struct kmem_cache *dnotify_struct_cache __read_mostly;
32static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; 32static struct kmem_cache *dnotify_mark_cache __read_mostly;
33static struct fsnotify_group *dnotify_group __read_mostly; 33static struct fsnotify_group *dnotify_group __read_mostly;
34static DEFINE_MUTEX(dnotify_mark_mutex); 34static DEFINE_MUTEX(dnotify_mark_mutex);
35 35
36/* 36/*
37 * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which 37 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
38 * is being watched by dnotify. If multiple userspace applications are watching 38 * is being watched by dnotify. If multiple userspace applications are watching
39 * the same directory with dnotify their information is chained in dn 39 * the same directory with dnotify their information is chained in dn
40 */ 40 */
41struct dnotify_mark_entry { 41struct dnotify_mark {
42 struct fsnotify_mark_entry fsn_entry; 42 struct fsnotify_mark fsn_mark;
43 struct dnotify_struct *dn; 43 struct dnotify_struct *dn;
44}; 44};
45 45
@@ -51,27 +51,27 @@ struct dnotify_mark_entry {
51 * it calls the fsnotify function so it can update the set of all events relevant 51 * it calls the fsnotify function so it can update the set of all events relevant
52 * to this inode. 52 * to this inode.
53 */ 53 */
54static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) 54static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
55{ 55{
56 __u32 new_mask, old_mask; 56 __u32 new_mask, old_mask;
57 struct dnotify_struct *dn; 57 struct dnotify_struct *dn;
58 struct dnotify_mark_entry *dnentry = container_of(entry, 58 struct dnotify_mark *dn_mark = container_of(fsn_mark,
59 struct dnotify_mark_entry, 59 struct dnotify_mark,
60 fsn_entry); 60 fsn_mark);
61 61
62 assert_spin_locked(&entry->lock); 62 assert_spin_locked(&fsn_mark->lock);
63 63
64 old_mask = entry->mask; 64 old_mask = fsn_mark->mask;
65 new_mask = 0; 65 new_mask = 0;
66 for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) 66 for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); 67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
68 entry->mask = new_mask; 68 fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
69 69
70 if (old_mask == new_mask) 70 if (old_mask == new_mask)
71 return; 71 return;
72 72
73 if (entry->inode) 73 if (fsn_mark->i.inode)
74 fsnotify_recalc_inode_mask(entry->inode); 74 fsnotify_recalc_inode_mask(fsn_mark->i.inode);
75} 75}
76 76
77/* 77/*
@@ -83,29 +83,25 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
83 * events. 83 * events.
84 */ 84 */
85static int dnotify_handle_event(struct fsnotify_group *group, 85static int dnotify_handle_event(struct fsnotify_group *group,
86 struct fsnotify_mark *inode_mark,
87 struct fsnotify_mark *vfsmount_mark,
86 struct fsnotify_event *event) 88 struct fsnotify_event *event)
87{ 89{
88 struct fsnotify_mark_entry *entry = NULL; 90 struct dnotify_mark *dn_mark;
89 struct dnotify_mark_entry *dnentry;
90 struct inode *to_tell; 91 struct inode *to_tell;
91 struct dnotify_struct *dn; 92 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 93 struct dnotify_struct **prev;
93 struct fown_struct *fown; 94 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
95 96
96 to_tell = event->to_tell; 97 BUG_ON(vfsmount_mark);
97 98
98 spin_lock(&to_tell->i_lock); 99 to_tell = event->to_tell;
99 entry = fsnotify_find_mark_entry(group, to_tell);
100 spin_unlock(&to_tell->i_lock);
101 100
102 /* unlikely since we already passed dnotify_should_send_event() */ 101 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
103 if (unlikely(!entry))
104 return 0;
105 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
106 102
107 spin_lock(&entry->lock); 103 spin_lock(&inode_mark->lock);
108 prev = &dnentry->dn; 104 prev = &dn_mark->dn;
109 while ((dn = *prev) != NULL) { 105 while ((dn = *prev) != NULL) {
110 if ((dn->dn_mask & test_mask) == 0) { 106 if ((dn->dn_mask & test_mask) == 0) {
111 prev = &dn->dn_next; 107 prev = &dn->dn_next;
@@ -118,12 +114,11 @@ static int dnotify_handle_event(struct fsnotify_group *group,
118 else { 114 else {
119 *prev = dn->dn_next; 115 *prev = dn->dn_next;
120 kmem_cache_free(dnotify_struct_cache, dn); 116 kmem_cache_free(dnotify_struct_cache, dn);
121 dnotify_recalc_inode_mask(entry); 117 dnotify_recalc_inode_mask(inode_mark);
122 } 118 }
123 } 119 }
124 120
125 spin_unlock(&entry->lock); 121 spin_unlock(&inode_mark->lock);
126 fsnotify_put_mark(entry);
127 122
128 return 0; 123 return 0;
129} 124}
@@ -133,44 +128,27 @@ static int dnotify_handle_event(struct fsnotify_group *group,
133 * userspace notification for that pair. 128 * userspace notification for that pair.
134 */ 129 */
135static bool dnotify_should_send_event(struct fsnotify_group *group, 130static bool dnotify_should_send_event(struct fsnotify_group *group,
136 struct inode *inode, __u32 mask) 131 struct inode *inode,
132 struct fsnotify_mark *inode_mark,
133 struct fsnotify_mark *vfsmount_mark,
134 __u32 mask, void *data, int data_type)
137{ 135{
138 struct fsnotify_mark_entry *entry;
139 bool send;
140
141 /* !dir_notify_enable should never get here, don't waste time checking
142 if (!dir_notify_enable)
143 return 0; */
144
145 /* not a dir, dnotify doesn't care */ 136 /* not a dir, dnotify doesn't care */
146 if (!S_ISDIR(inode->i_mode)) 137 if (!S_ISDIR(inode->i_mode))
147 return false; 138 return false;
148 139
149 spin_lock(&inode->i_lock); 140 return true;
150 entry = fsnotify_find_mark_entry(group, inode);
151 spin_unlock(&inode->i_lock);
152
153 /* no mark means no dnotify watch */
154 if (!entry)
155 return false;
156
157 mask = (mask & ~FS_EVENT_ON_CHILD);
158 send = (mask & entry->mask);
159
160 fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
161
162 return send;
163} 141}
164 142
165static void dnotify_free_mark(struct fsnotify_mark_entry *entry) 143static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
166{ 144{
167 struct dnotify_mark_entry *dnentry = container_of(entry, 145 struct dnotify_mark *dn_mark = container_of(fsn_mark,
168 struct dnotify_mark_entry, 146 struct dnotify_mark,
169 fsn_entry); 147 fsn_mark);
170 148
171 BUG_ON(dnentry->dn); 149 BUG_ON(dn_mark->dn);
172 150
173 kmem_cache_free(dnotify_mark_entry_cache, dnentry); 151 kmem_cache_free(dnotify_mark_cache, dn_mark);
174} 152}
175 153
176static struct fsnotify_ops dnotify_fsnotify_ops = { 154static struct fsnotify_ops dnotify_fsnotify_ops = {
@@ -183,15 +161,15 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
183 161
184/* 162/*
185 * Called every time a file is closed. Looks first for a dnotify mark on the 163 * Called every time a file is closed. Looks first for a dnotify mark on the
186 * inode. If one is found run all of the ->dn entries attached to that 164 * inode. If one is found run all of the ->dn structures attached to that
187 * mark for one relevant to this process closing the file and remove that 165 * mark for one relevant to this process closing the file and remove that
188 * dnotify_struct. If that was the last dnotify_struct also remove the 166 * dnotify_struct. If that was the last dnotify_struct also remove the
189 * fsnotify_mark_entry. 167 * fsnotify_mark.
190 */ 168 */
191void dnotify_flush(struct file *filp, fl_owner_t id) 169void dnotify_flush(struct file *filp, fl_owner_t id)
192{ 170{
193 struct fsnotify_mark_entry *entry; 171 struct fsnotify_mark *fsn_mark;
194 struct dnotify_mark_entry *dnentry; 172 struct dnotify_mark *dn_mark;
195 struct dnotify_struct *dn; 173 struct dnotify_struct *dn;
196 struct dnotify_struct **prev; 174 struct dnotify_struct **prev;
197 struct inode *inode; 175 struct inode *inode;
@@ -200,38 +178,34 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
200 if (!S_ISDIR(inode->i_mode)) 178 if (!S_ISDIR(inode->i_mode))
201 return; 179 return;
202 180
203 spin_lock(&inode->i_lock); 181 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
204 entry = fsnotify_find_mark_entry(dnotify_group, inode); 182 if (!fsn_mark)
205 spin_unlock(&inode->i_lock);
206 if (!entry)
207 return; 183 return;
208 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); 184 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
209 185
210 mutex_lock(&dnotify_mark_mutex); 186 mutex_lock(&dnotify_mark_mutex);
211 187
212 spin_lock(&entry->lock); 188 spin_lock(&fsn_mark->lock);
213 prev = &dnentry->dn; 189 prev = &dn_mark->dn;
214 while ((dn = *prev) != NULL) { 190 while ((dn = *prev) != NULL) {
215 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { 191 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
216 *prev = dn->dn_next; 192 *prev = dn->dn_next;
217 kmem_cache_free(dnotify_struct_cache, dn); 193 kmem_cache_free(dnotify_struct_cache, dn);
218 dnotify_recalc_inode_mask(entry); 194 dnotify_recalc_inode_mask(fsn_mark);
219 break; 195 break;
220 } 196 }
221 prev = &dn->dn_next; 197 prev = &dn->dn_next;
222 } 198 }
223 199
224 spin_unlock(&entry->lock); 200 spin_unlock(&fsn_mark->lock);
225 201
226 /* nothing else could have found us thanks to the dnotify_mark_mutex */ 202 /* nothing else could have found us thanks to the dnotify_mark_mutex */
227 if (dnentry->dn == NULL) 203 if (dn_mark->dn == NULL)
228 fsnotify_destroy_mark_by_entry(entry); 204 fsnotify_destroy_mark(fsn_mark);
229
230 fsnotify_recalc_group_mask(dnotify_group);
231 205
232 mutex_unlock(&dnotify_mark_mutex); 206 mutex_unlock(&dnotify_mark_mutex);
233 207
234 fsnotify_put_mark(entry); 208 fsnotify_put_mark(fsn_mark);
235} 209}
236 210
237/* this conversion is done only at watch creation */ 211/* this conversion is done only at watch creation */
@@ -259,16 +233,16 @@ static __u32 convert_arg(unsigned long arg)
259 233
260/* 234/*
261 * If multiple processes watch the same inode with dnotify there is only one 235 * If multiple processes watch the same inode with dnotify there is only one
262 * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct 236 * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
263 * onto that mark. This function either attaches the new dnotify_struct onto 237 * onto that mark. This function either attaches the new dnotify_struct onto
264 * that list, or it |= the mask onto an existing dnotify_struct. 238 * that list, or it |= the mask onto an existing dnotify_struct.
265 */ 239 */
266static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, 240static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
267 fl_owner_t id, int fd, struct file *filp, __u32 mask) 241 fl_owner_t id, int fd, struct file *filp, __u32 mask)
268{ 242{
269 struct dnotify_struct *odn; 243 struct dnotify_struct *odn;
270 244
271 odn = dnentry->dn; 245 odn = dn_mark->dn;
272 while (odn != NULL) { 246 while (odn != NULL) {
273 /* adding more events to existing dnotify_struct? */ 247 /* adding more events to existing dnotify_struct? */
274 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { 248 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
@@ -283,8 +257,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
283 dn->dn_fd = fd; 257 dn->dn_fd = fd;
284 dn->dn_filp = filp; 258 dn->dn_filp = filp;
285 dn->dn_owner = id; 259 dn->dn_owner = id;
286 dn->dn_next = dnentry->dn; 260 dn->dn_next = dn_mark->dn;
287 dnentry->dn = dn; 261 dn_mark->dn = dn;
288 262
289 return 0; 263 return 0;
290} 264}
@@ -296,8 +270,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
296 */ 270 */
297int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) 271int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
298{ 272{
299 struct dnotify_mark_entry *new_dnentry, *dnentry; 273 struct dnotify_mark *new_dn_mark, *dn_mark;
300 struct fsnotify_mark_entry *new_entry, *entry; 274 struct fsnotify_mark *new_fsn_mark, *fsn_mark;
301 struct dnotify_struct *dn; 275 struct dnotify_struct *dn;
302 struct inode *inode; 276 struct inode *inode;
303 fl_owner_t id = current->files; 277 fl_owner_t id = current->files;
@@ -306,7 +280,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
306 __u32 mask; 280 __u32 mask;
307 281
308 /* we use these to tell if we need to kfree */ 282 /* we use these to tell if we need to kfree */
309 new_entry = NULL; 283 new_fsn_mark = NULL;
310 dn = NULL; 284 dn = NULL;
311 285
312 if (!dir_notify_enable) { 286 if (!dir_notify_enable) {
@@ -336,8 +310,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
336 } 310 }
337 311
338 /* new fsnotify mark, we expect most fcntl calls to add a new mark */ 312 /* new fsnotify mark, we expect most fcntl calls to add a new mark */
339 new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); 313 new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
340 if (!new_dnentry) { 314 if (!new_dn_mark) {
341 error = -ENOMEM; 315 error = -ENOMEM;
342 goto out_err; 316 goto out_err;
343 } 317 }
@@ -345,29 +319,27 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
345 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ 319 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
346 mask = convert_arg(arg); 320 mask = convert_arg(arg);
347 321
348 /* set up the new_entry and new_dnentry */ 322 /* set up the new_fsn_mark and new_dn_mark */
349 new_entry = &new_dnentry->fsn_entry; 323 new_fsn_mark = &new_dn_mark->fsn_mark;
350 fsnotify_init_mark(new_entry, dnotify_free_mark); 324 fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
351 new_entry->mask = mask; 325 new_fsn_mark->mask = mask;
352 new_dnentry->dn = NULL; 326 new_dn_mark->dn = NULL;
353 327
354 /* this is needed to prevent the fcntl/close race described below */ 328 /* this is needed to prevent the fcntl/close race described below */
355 mutex_lock(&dnotify_mark_mutex); 329 mutex_lock(&dnotify_mark_mutex);
356 330
357 /* add the new_entry or find an old one. */ 331 /* add the new_fsn_mark or find an old one. */
358 spin_lock(&inode->i_lock); 332 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
359 entry = fsnotify_find_mark_entry(dnotify_group, inode); 333 if (fsn_mark) {
360 spin_unlock(&inode->i_lock); 334 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
361 if (entry) { 335 spin_lock(&fsn_mark->lock);
362 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
363 spin_lock(&entry->lock);
364 } else { 336 } else {
365 fsnotify_add_mark(new_entry, dnotify_group, inode); 337 fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
366 spin_lock(&new_entry->lock); 338 spin_lock(&new_fsn_mark->lock);
367 entry = new_entry; 339 fsn_mark = new_fsn_mark;
368 dnentry = new_dnentry; 340 dn_mark = new_dn_mark;
369 /* we used new_entry, so don't free it */ 341 /* we used new_fsn_mark, so don't free it */
370 new_entry = NULL; 342 new_fsn_mark = NULL;
371 } 343 }
372 344
373 rcu_read_lock(); 345 rcu_read_lock();
@@ -376,17 +348,17 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
376 348
377 /* if (f != filp) means that we lost a race and another task/thread 349 /* if (f != filp) means that we lost a race and another task/thread
378 * actually closed the fd we are still playing with before we grabbed 350 * actually closed the fd we are still playing with before we grabbed
379 * the dnotify_mark_mutex and entry->lock. Since closing the fd is the 351 * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the
380 * only time we clean up the mark entries we need to get our mark off 352 * only time we clean up the marks we need to get our mark off
381 * the list. */ 353 * the list. */
382 if (f != filp) { 354 if (f != filp) {
383 /* if we added ourselves, shoot ourselves, it's possible that 355 /* if we added ourselves, shoot ourselves, it's possible that
384 * the flush actually did shoot this entry. That's fine too 356 * the flush actually did shoot this fsn_mark. That's fine too
385 * since multiple calls to destroy_mark are perfectly safe, if 357 * since multiple calls to destroy_mark are perfectly safe, if
386 * we found a dnentry already attached to the inode, just sod 358 * we found a dn_mark already attached to the inode, just sod
387 * off silently as the flush at close time dealt with it. 359 * off silently as the flush at close time dealt with it.
388 */ 360 */
389 if (dnentry == new_dnentry) 361 if (dn_mark == new_dn_mark)
390 destroy = 1; 362 destroy = 1;
391 goto out; 363 goto out;
392 } 364 }
@@ -394,13 +366,13 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
394 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 366 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
395 if (error) { 367 if (error) {
396 /* if we added, we must shoot */ 368 /* if we added, we must shoot */
397 if (dnentry == new_dnentry) 369 if (dn_mark == new_dn_mark)
398 destroy = 1; 370 destroy = 1;
399 goto out; 371 goto out;
400 } 372 }
401 373
402 error = attach_dn(dn, dnentry, id, fd, filp, mask); 374 error = attach_dn(dn, dn_mark, id, fd, filp, mask);
403 /* !error means that we attached the dn to the dnentry, so don't free it */ 375 /* !error means that we attached the dn to the dn_mark, so don't free it */
404 if (!error) 376 if (!error)
405 dn = NULL; 377 dn = NULL;
406 /* -EEXIST means that we didn't add this new dn and used an old one. 378 /* -EEXIST means that we didn't add this new dn and used an old one.
@@ -408,20 +380,18 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
408 else if (error == -EEXIST) 380 else if (error == -EEXIST)
409 error = 0; 381 error = 0;
410 382
411 dnotify_recalc_inode_mask(entry); 383 dnotify_recalc_inode_mask(fsn_mark);
412out: 384out:
413 spin_unlock(&entry->lock); 385 spin_unlock(&fsn_mark->lock);
414 386
415 if (destroy) 387 if (destroy)
416 fsnotify_destroy_mark_by_entry(entry); 388 fsnotify_destroy_mark(fsn_mark);
417
418 fsnotify_recalc_group_mask(dnotify_group);
419 389
420 mutex_unlock(&dnotify_mark_mutex); 390 mutex_unlock(&dnotify_mark_mutex);
421 fsnotify_put_mark(entry); 391 fsnotify_put_mark(fsn_mark);
422out_err: 392out_err:
423 if (new_entry) 393 if (new_fsn_mark)
424 fsnotify_put_mark(new_entry); 394 fsnotify_put_mark(new_fsn_mark);
425 if (dn) 395 if (dn)
426 kmem_cache_free(dnotify_struct_cache, dn); 396 kmem_cache_free(dnotify_struct_cache, dn);
427 return error; 397 return error;
@@ -430,10 +400,9 @@ out_err:
430static int __init dnotify_init(void) 400static int __init dnotify_init(void)
431{ 401{
432 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); 402 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
433 dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); 403 dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
434 404
435 dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, 405 dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
436 0, &dnotify_fsnotify_ops);
437 if (IS_ERR(dnotify_group)) 406 if (IS_ERR(dnotify_group))
438 panic("unable to allocate fsnotify group for dnotify\n"); 407 panic("unable to allocate fsnotify group for dnotify\n");
439 return 0; 408 return 0;
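
The dnotify changes are mostly a mechanical rename (fsnotify_mark_entry to fsnotify_mark), but they keep the embed-and-recover idiom the whole file relies on: the generic mark lives inside the dnotify-specific structure and is converted back with container_of(). A self-contained sketch of the idiom with simplified stand-in types (these are not the kernel structures):

	#include <stddef.h>

	struct fsnotify_mark { unsigned int mask; };	/* generic part */
	struct dnotify_mark {
		struct fsnotify_mark fsn_mark;	/* embedded, recoverable */
		void *dn;			/* dnotify-private chain */
	};

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	/* Given the generic mark fsnotify hands back, recover the wrapper. */
	static struct dnotify_mark *to_dn_mark(struct fsnotify_mark *fsn_mark)
	{
		return container_of(fsn_mark, struct dnotify_mark, fsn_mark);
	}

Because the callbacks now receive the mark directly (inode_mark in dnotify_handle_event), the old find-then-put lookup under inode->i_lock disappears along with its extra reference counting.
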
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
new file mode 100644
index 000000000000..3ac36b7bf6b9
--- /dev/null
+++ b/fs/notify/fanotify/Kconfig
@@ -0,0 +1,26 @@
1config FANOTIFY
2 bool "Filesystem wide access notification"
3 select FSNOTIFY
4 select ANON_INODES
5 default n
6 ---help---
7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends
9 an open file descriptor to the userspace listener along with
10 the event.
11
12 If unsure, say Y.
13
14config FANOTIFY_ACCESS_PERMISSIONS
15 bool "fanotify permissions checking"
16 depends on FANOTIFY
17 depends on SECURITY
18 default n
19 ---help---
20 Say Y here if you want fanotify listeners to be able to make permission
21 decisions concerning filesystem events. This is used by some fanotify
22 listeners which need to scan files before allowing the system access to
23 use those files. This is used by some anti-malware vendors and by some
24 hierarchical storage management systems.
25
26 If unsure, say N.
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
new file mode 100644
index 000000000000..0999213e7e6e
--- /dev/null
+++ b/fs/notify/fanotify/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_FANOTIFY) += fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
new file mode 100644
index 000000000000..85366c78cc37
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.c
@@ -0,0 +1,209 @@
1#include <linux/fanotify.h>
2#include <linux/fdtable.h>
3#include <linux/fsnotify_backend.h>
4#include <linux/init.h>
5#include <linux/jiffies.h>
6#include <linux/kernel.h> /* UINT_MAX */
7#include <linux/mount.h>
8#include <linux/sched.h>
9#include <linux/types.h>
10#include <linux/wait.h>
11
12static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
13{
14 pr_debug("%s: old=%p new=%p\n", __func__, old, new);
15
16 if (old->to_tell == new->to_tell &&
17 old->data_type == new->data_type &&
18 old->tgid == new->tgid) {
19 switch (old->data_type) {
20 case (FSNOTIFY_EVENT_PATH):
21 if ((old->path.mnt == new->path.mnt) &&
22 (old->path.dentry == new->path.dentry))
23 return true;
24 case (FSNOTIFY_EVENT_NONE):
25 return true;
26 default:
27 BUG();
28 };
29 }
30 return false;
31}
32
33/* and the list better be locked by something too! */
34static struct fsnotify_event *fanotify_merge(struct list_head *list,
35 struct fsnotify_event *event)
36{
37 struct fsnotify_event_holder *test_holder;
38 struct fsnotify_event *test_event = NULL;
39 struct fsnotify_event *new_event;
40
41 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
42
43
44 list_for_each_entry_reverse(test_holder, list, event_list) {
45 if (should_merge(test_holder->event, event)) {
46 test_event = test_holder->event;
47 break;
48 }
49 }
50
51 if (!test_event)
52 return NULL;
53
54 fsnotify_get_event(test_event);
55
56 /* if they are exactly the same we are done */
57 if (test_event->mask == event->mask)
58 return test_event;
59
60 /*
61 * if the refcnt == 2 this is the only queue
62 * for this event and so we can update the mask
63 * in place.
64 */
65 if (atomic_read(&test_event->refcnt) == 2) {
66 test_event->mask |= event->mask;
67 return test_event;
68 }
69
70 new_event = fsnotify_clone_event(test_event);
71
72 /* done with test_event */
73 fsnotify_put_event(test_event);
74
75 /* couldn't allocate memory, merge was not possible */
76 if (unlikely(!new_event))
77 return ERR_PTR(-ENOMEM);
78
79 /* build new event and replace it on the list */
80 new_event->mask = (test_event->mask | event->mask);
81 fsnotify_replace_event(test_holder, new_event);
82
83 /* we hold a reference on new_event from clone_event */
84 return new_event;
85}
86
87#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
88static int fanotify_get_response_from_access(struct fsnotify_group *group,
89 struct fsnotify_event *event)
90{
91 int ret;
92
93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
94
95 wait_event(group->fanotify_data.access_waitq, event->response);
96
97 /* userspace responded, convert to something usable */
98 spin_lock(&event->lock);
99 switch (event->response) {
100 case FAN_ALLOW:
101 ret = 0;
102 break;
103 case FAN_DENY:
104 default:
105 ret = -EPERM;
106 }
107 event->response = 0;
108 spin_unlock(&event->lock);
109
110 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
111 group, event, ret);
112
113 return ret;
114}
115#endif
116
117static int fanotify_handle_event(struct fsnotify_group *group,
118 struct fsnotify_mark *inode_mark,
119 struct fsnotify_mark *fanotify_mark,
120 struct fsnotify_event *event)
121{
122 int ret = 0;
123 struct fsnotify_event *notify_event = NULL;
124
125 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
126 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
127 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
128 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
129 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
130 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
131 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
132 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
133 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
134
135 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
136
137 notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
138 if (IS_ERR(notify_event))
139 return PTR_ERR(notify_event);
140
141#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
142 if (event->mask & FAN_ALL_PERM_EVENTS) {
143 /* if we merged we need to wait on the new event */
144 if (notify_event)
145 event = notify_event;
146 ret = fanotify_get_response_from_access(group, event);
147 }
148#endif
149
150 if (notify_event)
151 fsnotify_put_event(notify_event);
152
153 return ret;
154}
155
156static bool fanotify_should_send_event(struct fsnotify_group *group,
157 struct inode *to_tell,
158 struct fsnotify_mark *inode_mark,
159 struct fsnotify_mark *vfsmnt_mark,
160 __u32 event_mask, void *data, int data_type)
161{
162 __u32 marks_mask, marks_ignored_mask;
163
164 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
166 inode_mark, vfsmnt_mark, event_mask, data, data_type);
167
168 /* sorry, fanotify only gives a damn about files and dirs */
169 if (!S_ISREG(to_tell->i_mode) &&
170 !S_ISDIR(to_tell->i_mode))
171 return false;
172
173 /* if we don't have enough info to send an event to userspace say no */
174 if (data_type != FSNOTIFY_EVENT_PATH)
175 return false;
176
177 if (inode_mark && vfsmnt_mark) {
178 marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
179 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
180 } else if (inode_mark) {
181 /*
182 * if the event is for a child and this inode doesn't care about
183 * events on the child, don't send it!
184 */
185 if ((event_mask & FS_EVENT_ON_CHILD) &&
186 !(inode_mark->mask & FS_EVENT_ON_CHILD))
187 return false;
188 marks_mask = inode_mark->mask;
189 marks_ignored_mask = inode_mark->ignored_mask;
190 } else if (vfsmnt_mark) {
191 marks_mask = vfsmnt_mark->mask;
192 marks_ignored_mask = vfsmnt_mark->ignored_mask;
193 } else {
194 BUG();
195 }
196
197 if (event_mask & marks_mask & ~marks_ignored_mask)
198 return true;
199
200 return false;
201}
202
203const struct fsnotify_ops fanotify_fsnotify_ops = {
204 .handle_event = fanotify_handle_event,
205 .should_send_event = fanotify_should_send_event,
206 .free_group_priv = NULL,
207 .free_event_priv = NULL,
208 .freeing_mark = NULL,
209};
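
fanotify_merge() walks the notification list backwards and, when the incoming event matches a queued one on target, data type, and tgid, widens the queued event's mask instead of enqueueing a duplicate; it only clones first when other holders still reference the old event (refcnt != 2). The coalescing rule in miniature, with stand-in types rather than the kernel structures:

	/* Illustration only: same-target events OR their masks. */
	struct ev {
		const void *object;	/* stands in for to_tell/path identity */
		unsigned int mask;
	};

	static int try_merge(struct ev *queued, const struct ev *incoming)
	{
		if (queued->object != incoming->object)
			return 0;		/* different target: keep both */
		queued->mask |= incoming->mask;	/* same target: widen the mask */
		return 1;			/* incoming can be dropped */
	}

This is what keeps a burst of accesses to one file from flooding the queue with near-identical events.
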
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 000000000000..5ed8e58d7bfc
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,787 @@
1#include <linux/fanotify.h>
2#include <linux/fcntl.h>
3#include <linux/file.h>
4#include <linux/fs.h>
5#include <linux/anon_inodes.h>
6#include <linux/fsnotify_backend.h>
7#include <linux/init.h>
8#include <linux/mount.h>
9#include <linux/namei.h>
10#include <linux/poll.h>
11#include <linux/security.h>
12#include <linux/syscalls.h>
13#include <linux/slab.h>
14#include <linux/types.h>
15#include <linux/uaccess.h>
16
17#include <asm/ioctls.h>
18
19extern const struct fsnotify_ops fanotify_fsnotify_ops;
20
21static struct kmem_cache *fanotify_mark_cache __read_mostly;
22static struct kmem_cache *fanotify_response_event_cache __read_mostly;
23
24struct fanotify_response_event {
25 struct list_head list;
26 __s32 fd;
27 struct fsnotify_event *event;
28};
29
30/*
31 * Get an fsnotify notification event if one exists and is small
32 * enough to fit in "count". Return an error pointer if the count
33 * is not large enough.
34 *
35 * Called with the group->notification_mutex held.
36 */
37static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
38 size_t count)
39{
40 BUG_ON(!mutex_is_locked(&group->notification_mutex));
41
42 pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
43
44 if (fsnotify_notify_queue_is_empty(group))
45 return NULL;
46
47 if (FAN_EVENT_METADATA_LEN > count)
48 return ERR_PTR(-EINVAL);
49
50 /* held the notification_mutex the whole time, so this is the
51 * same event we peeked above */
52 return fsnotify_remove_notify_event(group);
53}
54
55static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
56{
57 int client_fd;
58 struct dentry *dentry;
59 struct vfsmount *mnt;
60 struct file *new_file;
61
62 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
63
64 client_fd = get_unused_fd();
65 if (client_fd < 0)
66 return client_fd;
67
68 if (event->data_type != FSNOTIFY_EVENT_PATH) {
69 WARN_ON(1);
70 put_unused_fd(client_fd);
71 return -EINVAL;
72 }
73
74 /*
75 * we need a new file handle for the userspace program so it can read even if it was
76 * originally opened O_WRONLY.
77 */
78 dentry = dget(event->path.dentry);
79 mnt = mntget(event->path.mnt);
80 /* it's possible this event was an overflow event. In that case dentry and mnt
81 * are NULL; that's fine, just don't call dentry_open */
82 if (dentry && mnt)
83 new_file = dentry_open(dentry, mnt,
84 group->fanotify_data.f_flags | FMODE_NONOTIFY,
85 current_cred());
86 else
87 new_file = ERR_PTR(-EOVERFLOW);
88 if (IS_ERR(new_file)) {
89 /*
90 * we still send an event even if we can't open the file. this
91 * can happen when say tasks are gone and we try to open their
92 * /proc files or we try to open a WRONLY file like in sysfs
93 * we just send the errno to userspace since there isn't much
94 * else we can do.
95 */
96 put_unused_fd(client_fd);
97 client_fd = PTR_ERR(new_file);
98 } else {
99 fd_install(client_fd, new_file);
100 }
101
102 return client_fd;
103}
104
105static ssize_t fill_event_metadata(struct fsnotify_group *group,
106 struct fanotify_event_metadata *metadata,
107 struct fsnotify_event *event)
108{
109 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
110 group, metadata, event);
111
112 metadata->event_len = FAN_EVENT_METADATA_LEN;
113 metadata->vers = FANOTIFY_METADATA_VERSION;
114 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
115 metadata->pid = pid_vnr(event->tgid);
116 metadata->fd = create_fd(group, event);
117
118 return metadata->fd;
119}
120
121#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
122static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group,
123 __s32 fd)
124{
125 struct fanotify_response_event *re, *return_re = NULL;
126
127 mutex_lock(&group->fanotify_data.access_mutex);
128 list_for_each_entry(re, &group->fanotify_data.access_list, list) {
129 if (re->fd != fd)
130 continue;
131
132 list_del_init(&re->list);
133 return_re = re;
134 break;
135 }
136 mutex_unlock(&group->fanotify_data.access_mutex);
137
138 pr_debug("%s: found return_re=%p\n", __func__, return_re);
139
140 return return_re;
141}
142
143static int process_access_response(struct fsnotify_group *group,
144 struct fanotify_response *response_struct)
145{
146 struct fanotify_response_event *re;
147 __s32 fd = response_struct->fd;
148 __u32 response = response_struct->response;
149
150 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
151 fd, response);
152 /*
153 * make sure the response is valid, if invalid we do nothing and either
154 * userspace can send a valid response or we will clean it up after the
155 * timeout
156 */
157 switch (response) {
158 case FAN_ALLOW:
159 case FAN_DENY:
160 break;
161 default:
162 return -EINVAL;
163 }
164
165 if (fd < 0)
166 return -EINVAL;
167
168 re = dequeue_re(group, fd);
169 if (!re)
170 return -ENOENT;
171
172 re->event->response = response;
173
174 wake_up(&group->fanotify_data.access_waitq);
175
176 kmem_cache_free(fanotify_response_event_cache, re);
177
178 return 0;
179}
180
181static int prepare_for_access_response(struct fsnotify_group *group,
182 struct fsnotify_event *event,
183 __s32 fd)
184{
185 struct fanotify_response_event *re;
186
187 if (!(event->mask & FAN_ALL_PERM_EVENTS))
188 return 0;
189
190 re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
191 if (!re)
192 return -ENOMEM;
193
194 re->event = event;
195 re->fd = fd;
196
197 mutex_lock(&group->fanotify_data.access_mutex);
198
199 if (group->fanotify_data.bypass_perm) {
200 mutex_unlock(&group->fanotify_data.access_mutex);
201 kmem_cache_free(fanotify_response_event_cache, re);
202 event->response = FAN_ALLOW;
203 return 0;
204 }
205
206 list_add_tail(&re->list, &group->fanotify_data.access_list);
207 mutex_unlock(&group->fanotify_data.access_mutex);
208
209 return 0;
210}
211
212static void remove_access_response(struct fsnotify_group *group,
213 struct fsnotify_event *event,
214 __s32 fd)
215{
216 struct fanotify_response_event *re;
217
218 if (!(event->mask & FAN_ALL_PERM_EVENTS))
219 return;
220
221 re = dequeue_re(group, fd);
222 if (!re)
223 return;
224
225 BUG_ON(re->event != event);
226
227 kmem_cache_free(fanotify_response_event_cache, re);
228
229 return;
230}
231#else
232static int prepare_for_access_response(struct fsnotify_group *group,
233 struct fsnotify_event *event,
234 __s32 fd)
235{
236 return 0;
237}
238
239static void remove_access_response(struct fsnotify_group *group,
240 struct fsnotify_event *event,
241 __s32 fd)
242{
243 return;
244}
245#endif
246
247static ssize_t copy_event_to_user(struct fsnotify_group *group,
248 struct fsnotify_event *event,
249 char __user *buf)
250{
251 struct fanotify_event_metadata fanotify_event_metadata;
252 int fd, ret;
253
254 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
255
256 fd = fill_event_metadata(group, &fanotify_event_metadata, event);
257 if (fd < 0)
258 return fd;
259
260 ret = prepare_for_access_response(group, event, fd);
261 if (ret)
262 goto out_close_fd;
263
264 ret = -EFAULT;
265 if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
266 goto out_kill_access_response;
267
268 return FAN_EVENT_METADATA_LEN;
269
270out_kill_access_response:
271 remove_access_response(group, event, fd);
272out_close_fd:
273 sys_close(fd);
274 return ret;
275}
276
277/* fanotify userspace file descriptor functions */
278static unsigned int fanotify_poll(struct file *file, poll_table *wait)
279{
280 struct fsnotify_group *group = file->private_data;
281 int ret = 0;
282
283 poll_wait(file, &group->notification_waitq, wait);
284 mutex_lock(&group->notification_mutex);
285 if (!fsnotify_notify_queue_is_empty(group))
286 ret = POLLIN | POLLRDNORM;
287 mutex_unlock(&group->notification_mutex);
288
289 return ret;
290}
291
292static ssize_t fanotify_read(struct file *file, char __user *buf,
293 size_t count, loff_t *pos)
294{
295 struct fsnotify_group *group;
296 struct fsnotify_event *kevent;
297 char __user *start;
298 int ret;
299 DEFINE_WAIT(wait);
300
301 start = buf;
302 group = file->private_data;
303
304 pr_debug("%s: group=%p\n", __func__, group);
305
306 while (1) {
307 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
308
309 mutex_lock(&group->notification_mutex);
310 kevent = get_one_event(group, count);
311 mutex_unlock(&group->notification_mutex);
312
313 if (kevent) {
314 ret = PTR_ERR(kevent);
315 if (IS_ERR(kevent))
316 break;
317 ret = copy_event_to_user(group, kevent, buf);
318 fsnotify_put_event(kevent);
319 if (ret < 0)
320 break;
321 buf += ret;
322 count -= ret;
323 continue;
324 }
325
326 ret = -EAGAIN;
327 if (file->f_flags & O_NONBLOCK)
328 break;
329 ret = -EINTR;
330 if (signal_pending(current))
331 break;
332
333 if (start != buf)
334 break;
335
336 schedule();
337 }
338
339 finish_wait(&group->notification_waitq, &wait);
340 if (start != buf && ret != -EFAULT)
341 ret = buf - start;
342 return ret;
343}
344
345static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
346{
347#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
348 struct fanotify_response response = { .fd = -1, .response = -1 };
349 struct fsnotify_group *group;
350 int ret;
351
352 group = file->private_data;
353
354 if (count > sizeof(response))
355 count = sizeof(response);
356
357 pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
358
359 if (copy_from_user(&response, buf, count))
360 return -EFAULT;
361
362 ret = process_access_response(group, &response);
363 if (ret < 0)
364 count = ret;
365
366 return count;
367#else
368 return -EINVAL;
369#endif
370}
371
372static int fanotify_release(struct inode *ignored, struct file *file)
373{
374 struct fsnotify_group *group = file->private_data;
375 struct fanotify_response_event *re, *lre;
376
377 pr_debug("%s: file=%p group=%p\n", __func__, file, group);
378
379#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
380 mutex_lock(&group->fanotify_data.access_mutex);
381
382 group->fanotify_data.bypass_perm = true;
383
384 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
385 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
386 re, re->event);
387
388 list_del_init(&re->list);
389 re->event->response = FAN_ALLOW;
390
391 kmem_cache_free(fanotify_response_event_cache, re);
392 }
393 mutex_unlock(&group->fanotify_data.access_mutex);
394
395 wake_up(&group->fanotify_data.access_waitq);
396#endif
397 /* matches the fanotify_init->fsnotify_alloc_group */
398 fsnotify_put_group(group);
399
400 return 0;
401}
402
403static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
404{
405 struct fsnotify_group *group;
406 struct fsnotify_event_holder *holder;
407 void __user *p;
408 int ret = -ENOTTY;
409 size_t send_len = 0;
410
411 group = file->private_data;
412
413 p = (void __user *) arg;
414
415 switch (cmd) {
416 case FIONREAD:
417 mutex_lock(&group->notification_mutex);
418 list_for_each_entry(holder, &group->notification_list, event_list)
419 send_len += FAN_EVENT_METADATA_LEN;
420 mutex_unlock(&group->notification_mutex);
421 ret = put_user(send_len, (int __user *) p);
422 break;
423 }
424
425 return ret;
426}
427
428static const struct file_operations fanotify_fops = {
429 .poll = fanotify_poll,
430 .read = fanotify_read,
431 .write = fanotify_write,
432 .fasync = NULL,
433 .release = fanotify_release,
434 .unlocked_ioctl = fanotify_ioctl,
435 .compat_ioctl = fanotify_ioctl,
436};
437
438static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
439{
440 kmem_cache_free(fanotify_mark_cache, fsn_mark);
441}
442
443static int fanotify_find_path(int dfd, const char __user *filename,
444 struct path *path, unsigned int flags)
445{
446 int ret;
447
448 pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
449 dfd, filename, flags);
450
451 if (filename == NULL) {
452 struct file *file;
453 int fput_needed;
454
455 ret = -EBADF;
456 file = fget_light(dfd, &fput_needed);
457 if (!file)
458 goto out;
459
460 ret = -ENOTDIR;
461 if ((flags & FAN_MARK_ONLYDIR) &&
462 !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) {
463 fput_light(file, fput_needed);
464 goto out;
465 }
466
467 *path = file->f_path;
468 path_get(path);
469 fput_light(file, fput_needed);
470 } else {
471 unsigned int lookup_flags = 0;
472
473 if (!(flags & FAN_MARK_DONT_FOLLOW))
474 lookup_flags |= LOOKUP_FOLLOW;
475 if (flags & FAN_MARK_ONLYDIR)
476 lookup_flags |= LOOKUP_DIRECTORY;
477
478 ret = user_path_at(dfd, filename, lookup_flags, path);
479 if (ret)
480 goto out;
481 }
482
483 /* you can only watch an inode if you have read permissions on it */
484 ret = inode_permission(path->dentry->d_inode, MAY_READ);
485 if (ret)
486 path_put(path);
487out:
488 return ret;
489}
490
491static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
492 __u32 mask,
493 unsigned int flags)
494{
495 __u32 oldmask;
496
497 spin_lock(&fsn_mark->lock);
498 if (!(flags & FAN_MARK_IGNORED_MASK)) {
499 oldmask = fsn_mark->mask;
500 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
501 } else {
502 oldmask = fsn_mark->ignored_mask;
503 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
504 }
505 spin_unlock(&fsn_mark->lock);
506
507 if (!(oldmask & ~mask))
508 fsnotify_destroy_mark(fsn_mark);
509
510 return mask & oldmask;
511}
512
513static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
514 struct vfsmount *mnt, __u32 mask,
515 unsigned int flags)
516{
517 struct fsnotify_mark *fsn_mark = NULL;
518 __u32 removed;
519
520 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
521 if (!fsn_mark)
522 return -ENOENT;
523
524 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
525 fsnotify_put_mark(fsn_mark);
526 if (removed & mnt->mnt_fsnotify_mask)
527 fsnotify_recalc_vfsmount_mask(mnt);
528
529 return 0;
530}
531
532static int fanotify_remove_inode_mark(struct fsnotify_group *group,
533 struct inode *inode, __u32 mask,
534 unsigned int flags)
535{
536 struct fsnotify_mark *fsn_mark = NULL;
537 __u32 removed;
538
539 fsn_mark = fsnotify_find_inode_mark(group, inode);
540 if (!fsn_mark)
541 return -ENOENT;
542
543 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
544 /* matches the fsnotify_find_inode_mark() */
545 fsnotify_put_mark(fsn_mark);
546 if (removed & inode->i_fsnotify_mask)
547 fsnotify_recalc_inode_mask(inode);
548
549 return 0;
550}
551
552static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
553 __u32 mask,
554 unsigned int flags)
555{
556 __u32 oldmask;
557
558 spin_lock(&fsn_mark->lock);
559 if (!(flags & FAN_MARK_IGNORED_MASK)) {
560 oldmask = fsn_mark->mask;
561 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
562 } else {
563 oldmask = fsn_mark->ignored_mask;
564 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask));
565 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
566 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
567 }
568 spin_unlock(&fsn_mark->lock);
569
570 return mask & ~oldmask;
571}
572
573static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
574 struct vfsmount *mnt, __u32 mask,
575 unsigned int flags)
576{
577 struct fsnotify_mark *fsn_mark;
578 __u32 added;
579
580 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
581 if (!fsn_mark) {
582 int ret;
583
584 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
585 if (!fsn_mark)
586 return -ENOMEM;
587
588 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
589 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
590 if (ret) {
591 fanotify_free_mark(fsn_mark);
592 return ret;
593 }
594 }
595 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
596 fsnotify_put_mark(fsn_mark);
597 if (added & ~mnt->mnt_fsnotify_mask)
598 fsnotify_recalc_vfsmount_mask(mnt);
599
600 return 0;
601}
602
603static int fanotify_add_inode_mark(struct fsnotify_group *group,
604 struct inode *inode, __u32 mask,
605 unsigned int flags)
606{
607 struct fsnotify_mark *fsn_mark;
608 __u32 added;
609
610 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
611
612 fsn_mark = fsnotify_find_inode_mark(group, inode);
613 if (!fsn_mark) {
614 int ret;
615
616 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
617 if (!fsn_mark)
618 return -ENOMEM;
619
620 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
621 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
622 if (ret) {
623 fanotify_free_mark(fsn_mark);
624 return ret;
625 }
626 }
627 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
628 fsnotify_put_mark(fsn_mark);
629 if (added & ~inode->i_fsnotify_mask)
630 fsnotify_recalc_inode_mask(inode);
631 return 0;
632}
633
634/* fanotify syscalls */
635SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
636{
637 struct fsnotify_group *group;
638 int f_flags, fd;
639
640 pr_debug("%s: flags=%d event_f_flags=%d\n",
641 __func__, flags, event_f_flags);
642
643 if (!capable(CAP_SYS_ADMIN))
644 return -EPERM;
645
646 if (flags & ~FAN_ALL_INIT_FLAGS)
647 return -EINVAL;
648
649 f_flags = O_RDWR | FMODE_NONOTIFY;
650 if (flags & FAN_CLOEXEC)
651 f_flags |= O_CLOEXEC;
652 if (flags & FAN_NONBLOCK)
653 f_flags |= O_NONBLOCK;
654
655 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
656 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
657 if (IS_ERR(group))
658 return PTR_ERR(group);
659
660 group->fanotify_data.f_flags = event_f_flags;
661#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
662 mutex_init(&group->fanotify_data.access_mutex);
663 init_waitqueue_head(&group->fanotify_data.access_waitq);
664 INIT_LIST_HEAD(&group->fanotify_data.access_list);
665#endif
666
667 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
668 if (fd < 0)
669 goto out_put_group;
670
671 return fd;
672
673out_put_group:
674 fsnotify_put_group(group);
675 return fd;
676}
677
678SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
679 __u64 mask, int dfd,
680 const char __user * pathname)
681{
682 struct inode *inode = NULL;
683 struct vfsmount *mnt = NULL;
684 struct fsnotify_group *group;
685 struct file *filp;
686 struct path path;
687 int ret, fput_needed;
688
689 pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
690 __func__, fanotify_fd, flags, dfd, pathname, mask);
691
692 /* we only use the lower 32 bits as of right now. */
693 if (mask & ((__u64)0xffffffff << 32))
694 return -EINVAL;
695
696 if (flags & ~FAN_ALL_MARK_FLAGS)
697 return -EINVAL;
698 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
699 case FAN_MARK_ADD:
700 case FAN_MARK_REMOVE:
701 case FAN_MARK_FLUSH:
702 break;
703 default:
704 return -EINVAL;
705 }
706#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
707 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
708#else
709 if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
710#endif
711 return -EINVAL;
712
713 filp = fget_light(fanotify_fd, &fput_needed);
714 if (unlikely(!filp))
715 return -EBADF;
716
 717	/* verify that this is indeed a fanotify instance */
718 ret = -EINVAL;
719 if (unlikely(filp->f_op != &fanotify_fops))
720 goto fput_and_out;
721
722 ret = fanotify_find_path(dfd, pathname, &path, flags);
723 if (ret)
724 goto fput_and_out;
725
726 /* inode held in place by reference to path; group by fget on fd */
727 if (!(flags & FAN_MARK_MOUNT))
728 inode = path.dentry->d_inode;
729 else
730 mnt = path.mnt;
731 group = filp->private_data;
732
733 /* create/update an inode mark */
734 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
735 case FAN_MARK_ADD:
736 if (flags & FAN_MARK_MOUNT)
737 ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
738 else
739 ret = fanotify_add_inode_mark(group, inode, mask, flags);
740 break;
741 case FAN_MARK_REMOVE:
742 if (flags & FAN_MARK_MOUNT)
743 ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
744 else
745 ret = fanotify_remove_inode_mark(group, inode, mask, flags);
746 break;
747 case FAN_MARK_FLUSH:
748 if (flags & FAN_MARK_MOUNT)
749 fsnotify_clear_vfsmount_marks_by_group(group);
750 else
751 fsnotify_clear_inode_marks_by_group(group);
752 break;
753 default:
754 ret = -EINVAL;
755 }
756
757 path_put(&path);
758fput_and_out:
759 fput_light(filp, fput_needed);
760 return ret;
761}
762
763#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
764asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask,
765 long dfd, long pathname)
766{
767 return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags,
768 mask, (int) dfd,
769 (const char __user *) pathname);
770}
771SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
772#endif
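
And the matching user-space side of the mark call, again only a sketch over raw
syscall(2). Note the 64-bit mask argument: whether a plain vararg syscall(2)
marshals a __u64 correctly on 32-bit ABIs is libc-dependent, so treat this as
illustrative rather than portable.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/fanotify.h>

    int main(void)
    {
            struct fanotify_event_metadata ev;
            ssize_t len;
            int fd = syscall(__NR_fanotify_init, 0, O_RDONLY);

            if (fd < 0)
                    return 1;

            /* no FAN_MARK_MOUNT, so this is an inode mark on /tmp itself;
             * with FAN_MARK_MOUNT the mark would go on path.mnt instead */
            if (syscall(__NR_fanotify_mark, fd, FAN_MARK_ADD,
                        (__u64)(FAN_OPEN | FAN_CLOSE_WRITE), AT_FDCWD, "/tmp"))
                    return 1;

            /* each event carries an open fd on the object and the acting pid */
            len = read(fd, &ev, sizeof(ev));
            if (len >= (ssize_t)FAN_EVENT_METADATA_LEN) {
                    printf("mask=0x%llx pid=%d\n",
                           (unsigned long long)ev.mask, (int)ev.pid);
                    close(ev.fd);
            }
            return 0;
    }
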
773
774/*
 775 * fanotify_user_setup - Our initialization function. Note that we cannot return
776 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
777 * must result in panic().
778 */
779static int __init fanotify_user_setup(void)
780{
781 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
782 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
783 SLAB_PANIC);
784
785 return 0;
786}
787device_initcall(fanotify_user_setup);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fcc2f064af83..36802420d69a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -21,6 +21,7 @@
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/mount.h>
24#include <linux/srcu.h> 25#include <linux/srcu.h>
25 26
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
@@ -35,6 +36,11 @@ void __fsnotify_inode_delete(struct inode *inode)
35} 36}
36EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); 37EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
37 38
39void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
40{
41 fsnotify_clear_marks_by_mount(mnt);
42}
43
38/* 44/*
39 * Given an inode, first check if we care what happens to our children. Inotify 45 * Given an inode, first check if we care what happens to our children. Inotify
40 * and dnotify both tell their parents about events. If we care about any event 46 * and dnotify both tell their parents about events. If we care about any event
@@ -78,13 +84,16 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
78} 84}
79 85
80/* Notify this dentry's parent about a child's events. */ 86/* Notify this dentry's parent about a child's events. */
81void __fsnotify_parent(struct dentry *dentry, __u32 mask) 87void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
82{ 88{
83 struct dentry *parent; 89 struct dentry *parent;
84 struct inode *p_inode; 90 struct inode *p_inode;
85 bool send = false; 91 bool send = false;
86 bool should_update_children = false; 92 bool should_update_children = false;
87 93
94 if (!dentry)
95 dentry = path->dentry;
96
88 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
89 return; 98 return;
90 99
@@ -115,8 +124,12 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
115 * specifies these are events which came from a child. */ 124 * specifies these are events which came from a child. */
116 mask |= FS_EVENT_ON_CHILD; 125 mask |= FS_EVENT_ON_CHILD;
117 126
118 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 127 if (path)
119 dentry->d_name.name, 0); 128 fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
129 dentry->d_name.name, 0);
130 else
131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
132 dentry->d_name.name, 0);
120 dput(parent); 133 dput(parent);
121 } 134 }
122 135
@@ -127,63 +140,185 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
127} 140}
128EXPORT_SYMBOL_GPL(__fsnotify_parent); 141EXPORT_SYMBOL_GPL(__fsnotify_parent);
129 142
143static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
144 struct fsnotify_mark *inode_mark,
145 struct fsnotify_mark *vfsmount_mark,
146 __u32 mask, void *data,
147 int data_is, u32 cookie,
148 const unsigned char *file_name,
149 struct fsnotify_event **event)
150{
151 struct fsnotify_group *group = NULL;
152 __u32 inode_test_mask = 0;
153 __u32 vfsmount_test_mask = 0;
154
155 if (unlikely(!inode_mark && !vfsmount_mark)) {
156 BUG();
157 return 0;
158 }
159
160 /* clear ignored on inode modification */
161 if (mask & FS_MODIFY) {
162 if (inode_mark &&
163 !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
164 inode_mark->ignored_mask = 0;
165 if (vfsmount_mark &&
166 !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
167 vfsmount_mark->ignored_mask = 0;
168 }
169
170 /* does the inode mark tell us to do something? */
171 if (inode_mark) {
172 group = inode_mark->group;
173 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
174 inode_test_mask &= inode_mark->mask;
175 inode_test_mask &= ~inode_mark->ignored_mask;
176 }
177
178 /* does the vfsmount_mark tell us to do something? */
179 if (vfsmount_mark) {
180 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
181 group = vfsmount_mark->group;
182 vfsmount_test_mask &= vfsmount_mark->mask;
183 vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
184 if (inode_mark)
185 vfsmount_test_mask &= ~inode_mark->ignored_mask;
186 }
187
188 pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
189 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
190 " data=%p data_is=%d cookie=%d event=%p\n",
191 __func__, group, to_tell, mnt, mask, inode_mark,
192 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
193 data_is, cookie, *event);
194
195 if (!inode_test_mask && !vfsmount_test_mask)
196 return 0;
197
198 if (group->ops->should_send_event(group, to_tell, inode_mark,
199 vfsmount_mark, mask, data,
200 data_is) == false)
201 return 0;
202
203 if (!*event) {
204 *event = fsnotify_create_event(to_tell, mask, data,
205 data_is, file_name,
206 cookie, GFP_KERNEL);
207 if (!*event)
208 return -ENOMEM;
209 }
210 return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
211}
212
130/* 213/*
131 * This is the main call to fsnotify. The VFS calls into hook specific functions 214 * This is the main call to fsnotify. The VFS calls into hook specific functions
132 * in linux/fsnotify.h. Those functions then in turn call here. Here will call 215 * in linux/fsnotify.h. Those functions then in turn call here. Here will call
133 * out to all of the registered fsnotify_group. Those groups can then use the 216 * out to all of the registered fsnotify_group. Those groups can then use the
134 * notification event in whatever means they feel necessary. 217 * notification event in whatever means they feel necessary.
135 */ 218 */
136void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) 219int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
220 const unsigned char *file_name, u32 cookie)
137{ 221{
138 struct fsnotify_group *group; 222 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
223 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
224 struct fsnotify_group *inode_group, *vfsmount_group;
139 struct fsnotify_event *event = NULL; 225 struct fsnotify_event *event = NULL;
140 int idx; 226 struct vfsmount *mnt;
227 int idx, ret = 0;
141 /* global tests shouldn't care about events on child only the specific event */ 228 /* global tests shouldn't care about events on child only the specific event */
142 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); 229 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
143 230
144 if (list_empty(&fsnotify_groups)) 231 if (data_is == FSNOTIFY_EVENT_PATH)
145 return; 232 mnt = ((struct path *)data)->mnt;
233 else
234 mnt = NULL;
146 235
147 if (!(test_mask & fsnotify_mask))
148 return;
149
150 if (!(test_mask & to_tell->i_fsnotify_mask))
151 return;
152 /* 236 /*
153 * SRCU!! the groups list is very very much read only and the path is 237 * if this is a modify event we may need to clear the ignored masks
154 * very hot. The VAST majority of events are not going to need to do 238 * otherwise return if neither the inode nor the vfsmount care about
155 * anything other than walk the list so it's crazy to pre-allocate. 239 * this type of event.
156 */ 240 */
157 idx = srcu_read_lock(&fsnotify_grp_srcu); 241 if (!(mask & FS_MODIFY) &&
158 list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { 242 !(test_mask & to_tell->i_fsnotify_mask) &&
159 if (test_mask & group->mask) { 243 !(mnt && test_mask & mnt->mnt_fsnotify_mask))
160 if (!group->ops->should_send_event(group, to_tell, mask)) 244 return 0;
161 continue; 245
162 if (!event) { 246 idx = srcu_read_lock(&fsnotify_mark_srcu);
163 event = fsnotify_create_event(to_tell, mask, data, 247
164 data_is, file_name, cookie, 248 if ((mask & FS_MODIFY) ||
165 GFP_KERNEL); 249 (test_mask & to_tell->i_fsnotify_mask))
166 /* shit, we OOM'd and now we can't tell, maybe 250 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
167 * someday someone else will want to do something 251 &fsnotify_mark_srcu);
168 * here */ 252
169 if (!event) 253 if (mnt && ((mask & FS_MODIFY) ||
170 break; 254 (test_mask & mnt->mnt_fsnotify_mask))) {
171 } 255 vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
172 group->ops->handle_event(group, event); 256 &fsnotify_mark_srcu);
257 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
258 &fsnotify_mark_srcu);
259 }
260
261 while (inode_node || vfsmount_node) {
262 inode_group = vfsmount_group = NULL;
263
264 if (inode_node) {
265 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
266 struct fsnotify_mark, i.i_list);
267 inode_group = inode_mark->group;
173 } 268 }
269
270 if (vfsmount_node) {
271 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
272 struct fsnotify_mark, m.m_list);
273 vfsmount_group = vfsmount_mark->group;
274 }
275
276 if (inode_group > vfsmount_group) {
277 /* handle inode */
278 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
279 data_is, cookie, file_name, &event);
280 /* we didn't use the vfsmount_mark */
281 vfsmount_group = NULL;
282 } else if (vfsmount_group > inode_group) {
283 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
284 data_is, cookie, file_name, &event);
285 inode_group = NULL;
286 } else {
287 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
288 mask, data, data_is, cookie, file_name,
289 &event);
290 }
291
292 if (inode_group)
293 inode_node = srcu_dereference(inode_node->next,
294 &fsnotify_mark_srcu);
295 if (vfsmount_group)
296 vfsmount_node = srcu_dereference(vfsmount_node->next,
297 &fsnotify_mark_srcu);
174 } 298 }
175 srcu_read_unlock(&fsnotify_grp_srcu, idx); 299
300 srcu_read_unlock(&fsnotify_mark_srcu, idx);
176 /* 301 /*
177 * fsnotify_create_event() took a reference so the event can't be cleaned 302 * fsnotify_create_event() took a reference so the event can't be cleaned
178 * up while we are still trying to add it to lists, drop that one. 303 * up while we are still trying to add it to lists, drop that one.
179 */ 304 */
180 if (event) 305 if (event)
181 fsnotify_put_event(event); 306 fsnotify_put_event(event);
307
308 return ret;
182} 309}
183EXPORT_SYMBOL_GPL(fsnotify); 310EXPORT_SYMBOL_GPL(fsnotify);
184 311
185static __init int fsnotify_init(void) 312static __init int fsnotify_init(void)
186{ 313{
187 return init_srcu_struct(&fsnotify_grp_srcu); 314 int ret;
315
316 BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
317
318 ret = init_srcu_struct(&fsnotify_mark_srcu);
319 if (ret)
320 panic("initializing fsnotify_mark_srcu");
321
322 return 0;
188} 323}
189subsys_initcall(fsnotify_init); 324core_initcall(fsnotify_init);
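
For in-kernel users, the reworked API reduces to the pattern below. This is a
hedged sketch built only from the calls visible in this series
(fsnotify_alloc_group(), fsnotify_init_mark(), the five-argument
fsnotify_add_mark()); my_ops and its callbacks are illustrative placeholders,
not something this patch defines.

    #include <linux/fs.h>
    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/fsnotify_backend.h>

    static struct fsnotify_ops my_ops;  /* should_send_event/handle_event elided */

    static void my_free_mark(struct fsnotify_mark *mark)
    {
            kfree(mark);
    }

    static int my_watch_inode(struct inode *inode, __u32 mask)
    {
            struct fsnotify_group *group;
            struct fsnotify_mark *mark;
            int ret;

            group = fsnotify_alloc_group(&my_ops); /* holds the initial group ref */
            if (IS_ERR(group))
                    return PTR_ERR(group);

            mark = kzalloc(sizeof(*mark), GFP_KERNEL);
            if (!mark) {
                    fsnotify_put_group(group);
                    return -ENOMEM;
            }
            fsnotify_init_mark(mark, my_free_mark); /* refcnt starts at 1 */
            mark->mask = mask;  /* sketch only; fanotify uses the _locked setters */

            /* inode mark, so mnt is NULL; a vfsmount mark passes mnt instead */
            ret = fsnotify_add_mark(mark, group, inode, NULL, 0);
            if (ret) {
                    fsnotify_put_mark(mark);  /* drops to 0, calls my_free_mark */
                    fsnotify_put_group(group);
                    return ret;
            }
            fsnotify_put_mark(mark);  /* the inode's mark list keeps its own ref */
            return 0;  /* the group ref is kept for later teardown via put_group */
    }
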
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 4dc240824b2d..85e7d2b431d9 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,21 +6,34 @@
6#include <linux/srcu.h> 6#include <linux/srcu.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9/* protects reads of fsnotify_groups */
10extern struct srcu_struct fsnotify_grp_srcu;
11/* all groups which receive fsnotify events */
12extern struct list_head fsnotify_groups;
13/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
14extern __u32 fsnotify_mask;
15
16/* destroy all events sitting in this groups notification queue */ 9/* destroy all events sitting in this groups notification queue */
17extern void fsnotify_flush_notify(struct fsnotify_group *group); 10extern void fsnotify_flush_notify(struct fsnotify_group *group);
18 11
12/* protects reads of inode and vfsmount marks list */
13extern struct srcu_struct fsnotify_mark_srcu;
14
15extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
16 __u32 mask);
17/* add a mark to an inode */
18extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
19 struct fsnotify_group *group, struct inode *inode,
20 int allow_dups);
21/* add a mark to a vfsmount */
22extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
23 struct fsnotify_group *group, struct vfsmount *mnt,
24 int allow_dups);
25
19/* final kfree of a group */ 26/* final kfree of a group */
20extern void fsnotify_final_destroy_group(struct fsnotify_group *group); 27extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
21 28
29/* vfsmount specific destruction of a mark */
30extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
31/* inode specific destruction of a mark */
32extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
22/* run the list of all marks associated with inode and flag them to be freed */ 33/* run the list of all marks associated with inode and flag them to be freed */
23extern void fsnotify_clear_marks_by_inode(struct inode *inode); 34extern void fsnotify_clear_marks_by_inode(struct inode *inode);
35/* run the list of all marks associated with vfsmount and flag them to be freed */
36extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
24/* 37/*
25 * update the dentry->d_flags of all of inode's children to indicate if inode cares 38 * update the dentry->d_flags of all of inode's children to indicate if inode cares
26 * about events that happen to its children. 39 * about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0e1677144bc5..d309f38449cb 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -28,64 +28,6 @@
28 28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30 30
31/* protects writes to fsnotify_groups and fsnotify_mask */
32static DEFINE_MUTEX(fsnotify_grp_mutex);
33/* protects reads while running the fsnotify_groups list */
34struct srcu_struct fsnotify_grp_srcu;
35/* all groups registered to receive filesystem notifications */
36LIST_HEAD(fsnotify_groups);
37/* bitwise OR of all events (FS_*) interesting to some group on this system */
38__u32 fsnotify_mask;
39
40/*
41 * When a new group registers or changes it's set of interesting events
42 * this function updates the fsnotify_mask to contain all interesting events
43 */
44void fsnotify_recalc_global_mask(void)
45{
46 struct fsnotify_group *group;
47 __u32 mask = 0;
48 int idx;
49
50 idx = srcu_read_lock(&fsnotify_grp_srcu);
51 list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
52 mask |= group->mask;
53 srcu_read_unlock(&fsnotify_grp_srcu, idx);
54 fsnotify_mask = mask;
55}
56
57/*
58 * Update the group->mask by running all of the marks associated with this
59 * group and finding the bitwise | of all of the mark->mask. If we change
60 * the group->mask we need to update the global mask of events interesting
61 * to the system.
62 */
63void fsnotify_recalc_group_mask(struct fsnotify_group *group)
64{
65 __u32 mask = 0;
66 __u32 old_mask = group->mask;
67 struct fsnotify_mark_entry *entry;
68
69 spin_lock(&group->mark_lock);
70 list_for_each_entry(entry, &group->mark_entries, g_list)
71 mask |= entry->mask;
72 spin_unlock(&group->mark_lock);
73
74 group->mask = mask;
75
76 if (old_mask != mask)
77 fsnotify_recalc_global_mask();
78}
79
80/*
81 * Take a reference to a group so things found under the fsnotify_grp_mutex
82 * can't get freed under us
83 */
84static void fsnotify_get_group(struct fsnotify_group *group)
85{
86 atomic_inc(&group->refcnt);
87}
88
89/* 31/*
90 * Final freeing of a group 32 * Final freeing of a group
91 */ 33 */
@@ -110,145 +52,53 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
110 */ 52 */
111static void fsnotify_destroy_group(struct fsnotify_group *group) 53static void fsnotify_destroy_group(struct fsnotify_group *group)
112{ 54{
113 /* clear all inode mark entries for this group */ 55 /* clear all inode marks for this group */
114 fsnotify_clear_marks_by_group(group); 56 fsnotify_clear_marks_by_group(group);
115 57
58 synchronize_srcu(&fsnotify_mark_srcu);
59
116 /* past the point of no return, matches the initial value of 1 */ 60 /* past the point of no return, matches the initial value of 1 */
117 if (atomic_dec_and_test(&group->num_marks)) 61 if (atomic_dec_and_test(&group->num_marks))
118 fsnotify_final_destroy_group(group); 62 fsnotify_final_destroy_group(group);
119} 63}
120 64
121/* 65/*
122 * Remove this group from the global list of groups that will get events
123 * this can be done even if there are still references and things still using
124 * this group. This just stops the group from getting new events.
125 */
126static void __fsnotify_evict_group(struct fsnotify_group *group)
127{
128 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
129
130 if (group->on_group_list)
131 list_del_rcu(&group->group_list);
132 group->on_group_list = 0;
133}
134
135/*
136 * Called when a group is no longer interested in getting events. This can be
137 * used if a group is misbehaving or if for some reason a group should no longer
138 * get any filesystem events.
139 */
140void fsnotify_evict_group(struct fsnotify_group *group)
141{
142 mutex_lock(&fsnotify_grp_mutex);
143 __fsnotify_evict_group(group);
144 mutex_unlock(&fsnotify_grp_mutex);
145}
146
147/*
148 * Drop a reference to a group. Free it if it's through. 66 * Drop a reference to a group. Free it if it's through.
149 */ 67 */
150void fsnotify_put_group(struct fsnotify_group *group) 68void fsnotify_put_group(struct fsnotify_group *group)
151{ 69{
152 if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) 70 if (atomic_dec_and_test(&group->refcnt))
153 return; 71 fsnotify_destroy_group(group);
154
155 /*
156 * OK, now we know that there's no other users *and* we hold mutex,
157 * so no new references will appear
158 */
159 __fsnotify_evict_group(group);
160
161 /*
162 * now it's off the list, so the only thing we might care about is
163 * srcu access....
164 */
165 mutex_unlock(&fsnotify_grp_mutex);
166 synchronize_srcu(&fsnotify_grp_srcu);
167
168 /* and now it is really dead. _Nothing_ could be seeing it */
169 fsnotify_recalc_global_mask();
170 fsnotify_destroy_group(group);
171}
172
173/*
174 * Simply run the fsnotify_groups list and find a group which matches
175 * the given parameters. If a group is found we take a reference to that
176 * group.
177 */
178static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
179 const struct fsnotify_ops *ops)
180{
181 struct fsnotify_group *group_iter;
182 struct fsnotify_group *group = NULL;
183
184 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
185
186 list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
187 if (group_iter->group_num == group_num) {
188 if ((group_iter->mask == mask) &&
189 (group_iter->ops == ops)) {
190 fsnotify_get_group(group_iter);
191 group = group_iter;
192 } else
193 group = ERR_PTR(-EEXIST);
194 }
195 }
196 return group;
197} 72}
198 73
199/* 74/*
200 * Either finds an existing group which matches the group_num, mask, and ops or 75 * Create a new fsnotify_group and hold a reference for the group returned.
201 * creates a new group and adds it to the global group list. In either case we
202 * take a reference for the group returned.
203 */ 76 */
204struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, 77struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
205 const struct fsnotify_ops *ops)
206{ 78{
207 struct fsnotify_group *group, *tgroup; 79 struct fsnotify_group *group;
208 80
209 /* very low use, simpler locking if we just always alloc */ 81 group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
210 group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
211 if (!group) 82 if (!group)
212 return ERR_PTR(-ENOMEM); 83 return ERR_PTR(-ENOMEM);
213 84
 85	/* set to 0 when there are no external references to this group */
214 atomic_set(&group->refcnt, 1); 86 atomic_set(&group->refcnt, 1);
215 87 /*
216 group->on_group_list = 0; 88 * hits 0 when there are no external references AND no marks for
217 group->group_num = group_num; 89 * this group
218 group->mask = mask; 90 */
91 atomic_set(&group->num_marks, 1);
219 92
220 mutex_init(&group->notification_mutex); 93 mutex_init(&group->notification_mutex);
221 INIT_LIST_HEAD(&group->notification_list); 94 INIT_LIST_HEAD(&group->notification_list);
222 init_waitqueue_head(&group->notification_waitq); 95 init_waitqueue_head(&group->notification_waitq);
223 group->q_len = 0;
224 group->max_events = UINT_MAX; 96 group->max_events = UINT_MAX;
225 97
226 spin_lock_init(&group->mark_lock); 98 spin_lock_init(&group->mark_lock);
227 atomic_set(&group->num_marks, 0); 99 INIT_LIST_HEAD(&group->marks_list);
228 INIT_LIST_HEAD(&group->mark_entries);
229 100
230 group->ops = ops; 101 group->ops = ops;
231 102
232 mutex_lock(&fsnotify_grp_mutex);
233 tgroup = fsnotify_find_group(group_num, mask, ops);
234 if (tgroup) {
235 /* group already exists */
236 mutex_unlock(&fsnotify_grp_mutex);
237 /* destroy the new one we made */
238 fsnotify_put_group(group);
239 return tgroup;
240 }
241
242 /* group not found, add a new one */
243 list_add_rcu(&group->group_list, &fsnotify_groups);
244 group->on_group_list = 1;
245 /* being on the fsnotify_groups list holds one num_marks */
246 atomic_inc(&group->num_marks);
247
248 mutex_unlock(&fsnotify_grp_mutex);
249
250 if (mask)
251 fsnotify_recalc_global_mask();
252
253 return group; 103 return group;
254} 104}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0399bcbe09c8..33297c005060 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -16,72 +16,6 @@
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */ 17 */
18 18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
 26 * which can find this object holding the appropriate locks can take a reference
 27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * entry->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * entry->lock protects 2 things, entry->group and entry->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the mark_entries list anchored inside a given group
42 * and each entry is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
47 * given inode and each entry is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
 66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
 72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
 76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h> 19#include <linux/fs.h>
86#include <linux/init.h> 20#include <linux/init.h>
87#include <linux/kernel.h> 21#include <linux/kernel.h>
@@ -95,30 +29,19 @@
95#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
96#include "fsnotify.h" 30#include "fsnotify.h"
97 31
98void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
99{
100 atomic_inc(&entry->refcnt);
101}
102
103void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
104{
105 if (atomic_dec_and_test(&entry->refcnt))
106 entry->free_mark(entry);
107}
108
109/* 32/*
110 * Recalculate the mask of events relevant to a given inode locked. 33 * Recalculate the mask of events relevant to a given inode locked.
111 */ 34 */
112static void fsnotify_recalc_inode_mask_locked(struct inode *inode) 35static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
113{ 36{
114 struct fsnotify_mark_entry *entry; 37 struct fsnotify_mark *mark;
115 struct hlist_node *pos; 38 struct hlist_node *pos;
116 __u32 new_mask = 0; 39 __u32 new_mask = 0;
117 40
118 assert_spin_locked(&inode->i_lock); 41 assert_spin_locked(&inode->i_lock);
119 42
120 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) 43 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
121 new_mask |= entry->mask; 44 new_mask |= mark->mask;
122 inode->i_fsnotify_mask = new_mask; 45 inode->i_fsnotify_mask = new_mask;
123} 46}
124 47
@@ -135,107 +58,26 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
135 __fsnotify_update_child_dentry_flags(inode); 58 __fsnotify_update_child_dentry_flags(inode);
136} 59}
137 60
138/* 61void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
139 * Any time a mark is getting freed we end up here.
140 * The caller had better be holding a reference to this mark so we don't actually
141 * do the final put under the entry->lock
142 */
143void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
144{ 62{
145 struct fsnotify_group *group; 63 struct inode *inode = mark->i.inode;
146 struct inode *inode;
147 64
148 spin_lock(&entry->lock); 65 assert_spin_locked(&mark->lock);
66 assert_spin_locked(&mark->group->mark_lock);
149 67
150 group = entry->group;
151 inode = entry->inode;
152
153 BUG_ON(group && !inode);
154 BUG_ON(!group && inode);
155
156 /* if !group something else already marked this to die */
157 if (!group) {
158 spin_unlock(&entry->lock);
159 return;
160 }
161
162 /* 1 from caller and 1 for being on i_list/g_list */
163 BUG_ON(atomic_read(&entry->refcnt) < 2);
164
165 spin_lock(&group->mark_lock);
166 spin_lock(&inode->i_lock); 68 spin_lock(&inode->i_lock);
167 69
168 hlist_del_init(&entry->i_list); 70 hlist_del_init_rcu(&mark->i.i_list);
169 entry->inode = NULL; 71 mark->i.inode = NULL;
170
171 list_del_init(&entry->g_list);
172 entry->group = NULL;
173
174 fsnotify_put_mark(entry); /* for i_list and g_list */
175 72
176 /* 73 /*
177 * this mark is now off the inode->i_fsnotify_mark_entries list and we 74 * this mark is now off the inode->i_fsnotify_marks list and we
178 * hold the inode->i_lock, so this is the perfect time to update the 75 * hold the inode->i_lock, so this is the perfect time to update the
179 * inode->i_fsnotify_mask 76 * inode->i_fsnotify_mask
180 */ 77 */
181 fsnotify_recalc_inode_mask_locked(inode); 78 fsnotify_recalc_inode_mask_locked(inode);
182 79
183 spin_unlock(&inode->i_lock); 80 spin_unlock(&inode->i_lock);
184 spin_unlock(&group->mark_lock);
185 spin_unlock(&entry->lock);
186
187 /*
188 * Some groups like to know that marks are being freed. This is a
189 * callback to the group function to let it know that this entry
190 * is being freed.
191 */
192 if (group->ops->freeing_mark)
193 group->ops->freeing_mark(entry, group);
194
195 /*
196 * __fsnotify_update_child_dentry_flags(inode);
197 *
198 * I really want to call that, but we can't, we have no idea if the inode
199 * still exists the second we drop the entry->lock.
200 *
 201	 * The next time an event arrives at this inode from one of its children
 202	 * __fsnotify_parent will see that the inode doesn't care about its
203 * children and will update all of these flags then. So really this
204 * is just a lazy update (and could be a perf win...)
205 */
206
207
208 iput(inode);
209
210 /*
211 * it's possible that this group tried to destroy itself, but this
 212	 * mark was simultaneously being freed by the inode. If that's the
213 * case, we finish freeing the group here.
214 */
215 if (unlikely(atomic_dec_and_test(&group->num_marks)))
216 fsnotify_final_destroy_group(group);
217}
218
219/*
220 * Given a group, destroy all of the marks associated with that group.
221 */
222void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
223{
224 struct fsnotify_mark_entry *lentry, *entry;
225 LIST_HEAD(free_list);
226
227 spin_lock(&group->mark_lock);
228 list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
229 list_add(&entry->free_g_list, &free_list);
230 list_del_init(&entry->g_list);
231 fsnotify_get_mark(entry);
232 }
233 spin_unlock(&group->mark_lock);
234
235 list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
236 fsnotify_destroy_mark_by_entry(entry);
237 fsnotify_put_mark(entry);
238 }
239} 81}
240 82
241/* 83/*
@@ -243,112 +85,145 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
243 */ 85 */
244void fsnotify_clear_marks_by_inode(struct inode *inode) 86void fsnotify_clear_marks_by_inode(struct inode *inode)
245{ 87{
246 struct fsnotify_mark_entry *entry, *lentry; 88 struct fsnotify_mark *mark, *lmark;
247 struct hlist_node *pos, *n; 89 struct hlist_node *pos, *n;
248 LIST_HEAD(free_list); 90 LIST_HEAD(free_list);
249 91
250 spin_lock(&inode->i_lock); 92 spin_lock(&inode->i_lock);
251 hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { 93 hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
252 list_add(&entry->free_i_list, &free_list); 94 list_add(&mark->i.free_i_list, &free_list);
253 hlist_del_init(&entry->i_list); 95 hlist_del_init_rcu(&mark->i.i_list);
254 fsnotify_get_mark(entry); 96 fsnotify_get_mark(mark);
255 } 97 }
256 spin_unlock(&inode->i_lock); 98 spin_unlock(&inode->i_lock);
257 99
258 list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { 100 list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
259 fsnotify_destroy_mark_by_entry(entry); 101 fsnotify_destroy_mark(mark);
260 fsnotify_put_mark(entry); 102 fsnotify_put_mark(mark);
261 } 103 }
262} 104}
263 105
264/* 106/*
107 * Given a group clear all of the inode marks associated with that group.
108 */
109void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
110{
111 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
112}
113
114/*
265 * given a group and inode, find the mark associated with that combination. 115 * given a group and inode, find the mark associated with that combination.
266 * if found take a reference to that mark and return it, else return NULL 116 * if found take a reference to that mark and return it, else return NULL
267 */ 117 */
268struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, 118struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group,
269 struct inode *inode) 119 struct inode *inode)
270{ 120{
271 struct fsnotify_mark_entry *entry; 121 struct fsnotify_mark *mark;
272 struct hlist_node *pos; 122 struct hlist_node *pos;
273 123
274 assert_spin_locked(&inode->i_lock); 124 assert_spin_locked(&inode->i_lock);
275 125
276 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { 126 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
277 if (entry->group == group) { 127 if (mark->group == group) {
278 fsnotify_get_mark(entry); 128 fsnotify_get_mark(mark);
279 return entry; 129 return mark;
280 } 130 }
281 } 131 }
282 return NULL; 132 return NULL;
283} 133}
284 134
285/* 135/*
286 * Nothing fancy, just initialize lists and locks and counters. 136 * given a group and inode, find the mark associated with that combination.
137 * if found take a reference to that mark and return it, else return NULL
287 */ 138 */
288void fsnotify_init_mark(struct fsnotify_mark_entry *entry, 139struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
289 void (*free_mark)(struct fsnotify_mark_entry *entry)) 140 struct inode *inode)
141{
142 struct fsnotify_mark *mark;
143
144 spin_lock(&inode->i_lock);
145 mark = fsnotify_find_inode_mark_locked(group, inode);
146 spin_unlock(&inode->i_lock);
290 147
148 return mark;
149}
150
151/*
152 * If we are setting a mark mask on an inode mark we should pin the inode
153 * in memory.
154 */
155void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
156 __u32 mask)
291{ 157{
292 spin_lock_init(&entry->lock); 158 struct inode *inode;
293 atomic_set(&entry->refcnt, 1); 159
294 INIT_HLIST_NODE(&entry->i_list); 160 assert_spin_locked(&mark->lock);
295 entry->group = NULL; 161
296 entry->mask = 0; 162 if (mask &&
297 entry->inode = NULL; 163 mark->i.inode &&
298 entry->free_mark = free_mark; 164 !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
165 mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
166 inode = igrab(mark->i.inode);
167 /*
168 * we shouldn't be able to get here if the inode wasn't
169 * already safely held in memory. But bug in case it
170 * ever is wrong.
171 */
172 BUG_ON(!inode);
173 }
299} 174}
300 175
301/* 176/*
302 * Attach an initialized mark entry to a given group and inode. 177 * Attach an initialized mark to a given inode.
303 * These marks may be used for the fsnotify backend to determine which 178 * These marks may be used for the fsnotify backend to determine which
304 * event types should be delivered to which group and for which inodes. 179 * event types should be delivered to which group and for which inodes. These
180 * marks are ordered according to the group's location in memory.
305 */ 181 */
306int fsnotify_add_mark(struct fsnotify_mark_entry *entry, 182int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
307 struct fsnotify_group *group, struct inode *inode) 183 struct fsnotify_group *group, struct inode *inode,
184 int allow_dups)
308{ 185{
309 struct fsnotify_mark_entry *lentry; 186 struct fsnotify_mark *lmark;
187 struct hlist_node *node, *last = NULL;
310 int ret = 0; 188 int ret = 0;
311 189
312 inode = igrab(inode); 190 mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
313 if (unlikely(!inode)) 191
314 return -EINVAL; 192 assert_spin_locked(&mark->lock);
193 assert_spin_locked(&group->mark_lock);
315 194
316 /*
317 * LOCKING ORDER!!!!
318 * entry->lock
319 * group->mark_lock
320 * inode->i_lock
321 */
322 spin_lock(&entry->lock);
323 spin_lock(&group->mark_lock);
324 spin_lock(&inode->i_lock); 195 spin_lock(&inode->i_lock);
325 196
326 lentry = fsnotify_find_mark_entry(group, inode); 197 mark->i.inode = inode;
327 if (!lentry) {
328 entry->group = group;
329 entry->inode = inode;
330 198
331 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); 199 /* is mark the first mark? */
332 list_add(&entry->g_list, &group->mark_entries); 200 if (hlist_empty(&inode->i_fsnotify_marks)) {
201 hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks);
202 goto out;
203 }
333 204
334 fsnotify_get_mark(entry); /* for i_list and g_list */ 205 /* should mark be in the middle of the current list? */
206 hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) {
207 last = node;
208
209 if ((lmark->group == group) && !allow_dups) {
210 ret = -EEXIST;
211 goto out;
212 }
335 213
336 atomic_inc(&group->num_marks); 214 if (mark->group < lmark->group)
215 continue;
337 216
338 fsnotify_recalc_inode_mask_locked(inode); 217 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
218 goto out;
339 } 219 }
340 220
221 BUG_ON(last == NULL);
222 /* mark should be the last entry. last is the current last entry */
223 hlist_add_after_rcu(last, &mark->i.i_list);
224out:
225 fsnotify_recalc_inode_mask_locked(inode);
341 spin_unlock(&inode->i_lock); 226 spin_unlock(&inode->i_lock);
342 spin_unlock(&group->mark_lock);
343 spin_unlock(&entry->lock);
344
345 if (lentry) {
346 ret = -EEXIST;
347 iput(inode);
348 fsnotify_put_mark(lentry);
349 } else {
350 __fsnotify_update_child_dentry_flags(inode);
351 }
352 227
353 return ret; 228 return ret;
354} 229}
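
The pointer comparisons above are the other half of the new fsnotify() walk:
both the inode's and the vfsmount's mark lists are kept sorted in descending
order of group address, so a single merge-style pass can pair the inode mark
and the vfsmount mark that belong to one group and call into that group only
once per event. A stand-alone model of that walk (plain user-space C, with
small integers standing in for group pointers, purely illustrative):

    #include <stdio.h>
    #include <stddef.h>

    struct mark { int group; struct mark *next; };

    static void deliver(int group, struct mark *im, struct mark *vm)
    {
            printf("group %d: inode_mark=%s vfsmount_mark=%s\n",
                   group, im ? "yes" : "no", vm ? "yes" : "no");
    }

    static void walk(struct mark *i, struct mark *m)
    {
            while (i || m) {
                    /* -1 sorts below any real (nonnegative) key, like NULL */
                    int ig = i ? i->group : -1;
                    int mg = m ? m->group : -1;

                    if (ig > mg) {                  /* inode-only group */
                            deliver(ig, i, NULL);
                            i = i->next;
                    } else if (mg > ig) {           /* vfsmount-only group */
                            deliver(mg, NULL, m);
                            m = m->next;
                    } else {                        /* same group: pair them */
                            deliver(ig, i, m);
                            i = i->next;
                            m = m->next;
                    }
            }
    }

    int main(void)
    {
            struct mark i2 = { 2, NULL }, i5 = { 5, &i2 }; /* descending: 5, 2 */
            struct mark m2 = { 2, NULL }, m7 = { 7, &m2 }; /* descending: 7, 2 */

            walk(&i5, &m7);         /* visits 7 (mount), 5 (inode), 2 (paired) */
            return 0;
    }

Ordering by the group's address is just a cheap, stable total order; any key
shared by a group's inode mark and its vfsmount mark would do.
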
@@ -369,11 +244,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
369 struct inode *need_iput_tmp; 244 struct inode *need_iput_tmp;
370 245
371 /* 246 /*
372 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, 247 * We cannot __iget() an inode in state I_FREEING,
373 * I_WILL_FREE, or I_NEW which is fine because by that point 248 * I_WILL_FREE, or I_NEW which is fine because by that point
374 * the inode cannot have any associated watches. 249 * the inode cannot have any associated watches.
375 */ 250 */
376 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) 251 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
377 continue; 252 continue;
378 253
379 /* 254 /*
@@ -397,7 +272,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
397 /* In case the dropping of a reference would nuke next_i. */ 272 /* In case the dropping of a reference would nuke next_i. */
398 if ((&next_i->i_sb_list != list) && 273 if ((&next_i->i_sb_list != list) &&
399 atomic_read(&next_i->i_count) && 274 atomic_read(&next_i->i_count) &&
400 !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { 275 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
401 __iget(next_i); 276 __iget(next_i);
402 need_iput = next_i; 277 need_iput = next_i;
403 } 278 }
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b3a159b21cfd..b981fc0c8379 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,18 +1,3 @@
1config INOTIFY
2 bool "Inotify file change notification support"
3 default n
4 ---help---
5 Say Y here to enable legacy in kernel inotify support. Inotify is a
6 file change notification system. It is a replacement for dnotify.
7 This option only provides the legacy inotify in kernel API. There
8 are no in tree kernel users of this interface since it is deprecated.
9 You only need this if you are loading an out of tree kernel module
10 that uses inotify.
11
12 For more information, see <file:Documentation/filesystems/inotify.txt>
13
14 If unsure, say N.
15
16config INOTIFY_USER 1config INOTIFY_USER
17 bool "Inotify support for userspace" 2 bool "Inotify support for userspace"
18 select ANON_INODES 3 select ANON_INODES
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index 943828171362..a380dabe09de 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1 @@
1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
deleted file mode 100644
index 40b1cf914ccb..000000000000
--- a/fs/notify/inotify/inotify.c
+++ /dev/null
@@ -1,933 +0,0 @@
1/*
2 * fs/inotify.c - inode-based file event notifications
3 *
4 * Authors:
5 * John McCutchan <ttb@tentacle.dhs.org>
6 * Robert Love <rml@novell.com>
7 *
8 * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
9 *
10 * Copyright (C) 2005 John McCutchan
11 * Copyright 2006 Hewlett-Packard Development Company, L.P.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of the GNU General Public License as published by the
15 * Free Software Foundation; either version 2, or (at your option) any
16 * later version.
17 *
18 * This program is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/spinlock.h>
27#include <linux/idr.h>
28#include <linux/slab.h>
29#include <linux/fs.h>
30#include <linux/sched.h>
31#include <linux/init.h>
32#include <linux/list.h>
33#include <linux/writeback.h>
34#include <linux/inotify.h>
35#include <linux/fsnotify_backend.h>
36
37static atomic_t inotify_cookie;
38
39/*
40 * Lock ordering:
41 *
42 * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
43 * iprune_mutex (synchronize shrink_icache_memory())
44 * inode_lock (protects the super_block->s_inodes list)
45 * inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
46 * inotify_handle->mutex (protects inotify_handle and watches->h_list)
47 *
 48 * The inode->inotify_mutex and inotify_handle->mutex are held during execution
49 * of a caller's event handler. Thus, the caller must not hold any locks
50 * taken in their event handler while calling any of the published inotify
51 * interfaces.
52 */
53
54/*
55 * Lifetimes of the three main data structures--inotify_handle, inode, and
56 * inotify_watch--are managed by reference count.
57 *
58 * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
59 * Additional references can bump the count via get_inotify_handle() and drop
60 * the count via put_inotify_handle().
61 *
62 * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
63 * to remove_watch_no_event(). Additional references can bump the count via
64 * get_inotify_watch() and drop the count via put_inotify_watch(). The caller
 65 * is responsible for the final put after receiving IN_IGNORED, or when using
66 * IN_ONESHOT after receiving the first event. Inotify does the final put if
67 * inotify_destroy() is called.
68 *
69 * inode: Pinned so long as the inode is associated with a watch, from
70 * inotify_add_watch() to the final put_inotify_watch().
71 */
72
73/*
74 * struct inotify_handle - represents an inotify instance
75 *
76 * This structure is protected by the mutex 'mutex'.
77 */
78struct inotify_handle {
79 struct idr idr; /* idr mapping wd -> watch */
80 struct mutex mutex; /* protects this bad boy */
81 struct list_head watches; /* list of watches */
82 atomic_t count; /* reference count */
83 u32 last_wd; /* the last wd allocated */
84 const struct inotify_operations *in_ops; /* inotify caller operations */
85};
86
87static inline void get_inotify_handle(struct inotify_handle *ih)
88{
89 atomic_inc(&ih->count);
90}
91
92static inline void put_inotify_handle(struct inotify_handle *ih)
93{
94 if (atomic_dec_and_test(&ih->count)) {
95 idr_destroy(&ih->idr);
96 kfree(ih);
97 }
98}
99
100/**
101 * get_inotify_watch - grab a reference to an inotify_watch
102 * @watch: watch to grab
103 */
104void get_inotify_watch(struct inotify_watch *watch)
105{
106 atomic_inc(&watch->count);
107}
108EXPORT_SYMBOL_GPL(get_inotify_watch);
109
110int pin_inotify_watch(struct inotify_watch *watch)
111{
112 struct super_block *sb = watch->inode->i_sb;
113 spin_lock(&sb_lock);
114 if (sb->s_count >= S_BIAS) {
115 atomic_inc(&sb->s_active);
116 spin_unlock(&sb_lock);
117 atomic_inc(&watch->count);
118 return 1;
119 }
120 spin_unlock(&sb_lock);
121 return 0;
122}
123
124/**
125 * put_inotify_watch - decrements the ref count on a given watch. cleans up
126 * watch references if the count reaches zero. inotify_watch is freed by
127 * inotify callers via the destroy_watch() op.
128 * @watch: watch to release
129 */
130void put_inotify_watch(struct inotify_watch *watch)
131{
132 if (atomic_dec_and_test(&watch->count)) {
133 struct inotify_handle *ih = watch->ih;
134
135 iput(watch->inode);
136 ih->in_ops->destroy_watch(watch);
137 put_inotify_handle(ih);
138 }
139}
140EXPORT_SYMBOL_GPL(put_inotify_watch);
141
142void unpin_inotify_watch(struct inotify_watch *watch)
143{
144 struct super_block *sb = watch->inode->i_sb;
145 put_inotify_watch(watch);
146 deactivate_super(sb);
147}
148
149/*
150 * inotify_handle_get_wd - returns the next WD for use by the given handle
151 *
152 * Callers must hold ih->mutex. This function can sleep.
153 */
154static int inotify_handle_get_wd(struct inotify_handle *ih,
155 struct inotify_watch *watch)
156{
157 int ret;
158
159 do {
160 if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
161 return -ENOSPC;
162 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
163 } while (ret == -EAGAIN);
164
165 if (likely(!ret))
166 ih->last_wd = watch->wd;
167
168 return ret;
169}
170
171/*
172 * inotify_inode_watched - returns nonzero if there are watches on this inode
173 * and zero otherwise. We call this lockless, we do not care if we race.
174 */
175static inline int inotify_inode_watched(struct inode *inode)
176{
177 return !list_empty(&inode->inotify_watches);
178}
179
180/*
 181 * Get child dentry flag in sync with parent inode.
 182 * Flag should always be clear for negative dentries.
183 */
184static void set_dentry_child_flags(struct inode *inode, int watched)
185{
186 struct dentry *alias;
187
188 spin_lock(&dcache_lock);
189 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
190 struct dentry *child;
191
192 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
193 if (!child->d_inode)
194 continue;
195
196 spin_lock(&child->d_lock);
197 if (watched)
198 child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
199 else
200 child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
201 spin_unlock(&child->d_lock);
202 }
203 }
204 spin_unlock(&dcache_lock);
205}
206
207/*
 208 * inode_find_handle - find the watch associated with the given inode and
209 * handle
210 *
211 * Callers must hold inode->inotify_mutex.
212 */
213static struct inotify_watch *inode_find_handle(struct inode *inode,
214 struct inotify_handle *ih)
215{
216 struct inotify_watch *watch;
217
218 list_for_each_entry(watch, &inode->inotify_watches, i_list) {
219 if (watch->ih == ih)
220 return watch;
221 }
222
223 return NULL;
224}
225
226/*
227 * remove_watch_no_event - remove watch without the IN_IGNORED event.
228 *
229 * Callers must hold both inode->inotify_mutex and ih->mutex.
230 */
231static void remove_watch_no_event(struct inotify_watch *watch,
232 struct inotify_handle *ih)
233{
234 list_del(&watch->i_list);
235 list_del(&watch->h_list);
236
237 if (!inotify_inode_watched(watch->inode))
238 set_dentry_child_flags(watch->inode, 0);
239
240 idr_remove(&ih->idr, watch->wd);
241}
242
243/**
244 * inotify_remove_watch_locked - Remove a watch from both the handle and the
245 * inode. Sends the IN_IGNORED event signifying that the inode is no longer
246 * watched. May be invoked from a caller's event handler.
247 * @ih: inotify handle associated with watch
248 * @watch: watch to remove
249 *
250 * Callers must hold both inode->inotify_mutex and ih->mutex.
251 */
252void inotify_remove_watch_locked(struct inotify_handle *ih,
253 struct inotify_watch *watch)
254{
255 remove_watch_no_event(watch, ih);
256 ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
257}
258EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
259
260/* Kernel API for producing events */
261
262/*
263 * inotify_d_instantiate - instantiate dcache entry for inode
264 */
265void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
266{
267 struct dentry *parent;
268
269 if (!inode)
270 return;
271
272 spin_lock(&entry->d_lock);
273 parent = entry->d_parent;
274 if (parent->d_inode && inotify_inode_watched(parent->d_inode))
275 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
276 spin_unlock(&entry->d_lock);
277}
278
279/*
280 * inotify_d_move - dcache entry has been moved
281 */
282void inotify_d_move(struct dentry *entry)
283{
284 struct dentry *parent;
285
286 parent = entry->d_parent;
287 if (inotify_inode_watched(parent->d_inode))
288 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
289 else
290 entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
291}
292
293/**
294 * inotify_inode_queue_event - queue an event to all watches on this inode
295 * @inode: inode event is originating from
296 * @mask: event mask describing this event
297 * @cookie: cookie for synchronization, or zero
298 * @name: filename, if any
299 * @n_inode: inode associated with name
300 */
301void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
302 const char *name, struct inode *n_inode)
303{
304 struct inotify_watch *watch, *next;
305
306 if (!inotify_inode_watched(inode))
307 return;
308
309 mutex_lock(&inode->inotify_mutex);
310 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
311 u32 watch_mask = watch->mask;
312 if (watch_mask & mask) {
 313			struct inotify_handle *ih = watch->ih;
314 mutex_lock(&ih->mutex);
315 if (watch_mask & IN_ONESHOT)
316 remove_watch_no_event(watch, ih);
317 ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
318 name, n_inode);
319 mutex_unlock(&ih->mutex);
320 }
321 }
322 mutex_unlock(&inode->inotify_mutex);
323}
324EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
325
326/**
327 * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
328 * @dentry: the dentry in question, we queue against this dentry's parent
329 * @mask: event mask describing this event
330 * @cookie: cookie for synchronization, or zero
331 * @name: filename, if any
332 */
333void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
334 u32 cookie, const char *name)
335{
336 struct dentry *parent;
337 struct inode *inode;
338
339 if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
340 return;
341
342 spin_lock(&dentry->d_lock);
343 parent = dentry->d_parent;
344 inode = parent->d_inode;
345
346 if (inotify_inode_watched(inode)) {
347 dget(parent);
348 spin_unlock(&dentry->d_lock);
349 inotify_inode_queue_event(inode, mask, cookie, name,
350 dentry->d_inode);
351 dput(parent);
352 } else
353 spin_unlock(&dentry->d_lock);
354}
355EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
356
357/**
358 * inotify_get_cookie - return a unique cookie for use in synchronizing events.
359 */
360u32 inotify_get_cookie(void)
361{
362 return atomic_inc_return(&inotify_cookie);
363}
364EXPORT_SYMBOL_GPL(inotify_get_cookie);
365
366/**
367 * inotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
368 * @list: list of inodes being unmounted (sb->s_inodes)
369 *
370 * Called with inode_lock held, protecting the unmounting super block's list
371 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
372 * We temporarily drop inode_lock, however, and CAN block.
373 */
374void inotify_unmount_inodes(struct list_head *list)
375{
376 struct inode *inode, *next_i, *need_iput = NULL;
377
378 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
379 struct inotify_watch *watch, *next_w;
380 struct inode *need_iput_tmp;
381 struct list_head *watches;
382
383 /*
384 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
385 * I_WILL_FREE, or I_NEW which is fine because by that point
386 * the inode cannot have any associated watches.
387 */
388 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
389 continue;
390
391 /*
392 * If i_count is zero, the inode cannot have any watches and
393 * doing an __iget/iput with MS_ACTIVE clear would actually
394 * evict all inodes with zero i_count from icache which is
395 * unnecessarily violent and may in fact be illegal to do.
396 */
397 if (!atomic_read(&inode->i_count))
398 continue;
399
400 need_iput_tmp = need_iput;
401 need_iput = NULL;
402 /* In case inotify_remove_watch_locked() drops a reference. */
403 if (inode != need_iput_tmp)
404 __iget(inode);
405 else
406 need_iput_tmp = NULL;
407 /* In case the dropping of a reference would nuke next_i. */
408 if ((&next_i->i_sb_list != list) &&
409 atomic_read(&next_i->i_count) &&
410 !(next_i->i_state & (I_CLEAR | I_FREEING |
411 I_WILL_FREE))) {
412 __iget(next_i);
413 need_iput = next_i;
414 }
415
416 /*
417 * We can safely drop inode_lock here because we hold
418 * references on both inode and next_i. Also no new inodes
419 * will be added since the umount has begun. Finally,
420 * iprune_mutex keeps shrink_icache_memory() away.
421 */
422 spin_unlock(&inode_lock);
423
424 if (need_iput_tmp)
425 iput(need_iput_tmp);
426
427 /* for each watch, send IN_UNMOUNT and then remove it */
428 mutex_lock(&inode->inotify_mutex);
429 watches = &inode->inotify_watches;
430 list_for_each_entry_safe(watch, next_w, watches, i_list) {
 431			struct inotify_handle *ih = watch->ih;
432 get_inotify_watch(watch);
433 mutex_lock(&ih->mutex);
434 ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
435 NULL, NULL);
436 inotify_remove_watch_locked(ih, watch);
437 mutex_unlock(&ih->mutex);
438 put_inotify_watch(watch);
439 }
440 mutex_unlock(&inode->inotify_mutex);
441 iput(inode);
442
443 spin_lock(&inode_lock);
444 }
445}
446EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
447
448/**
449 * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
450 * @inode: inode that is about to be removed
451 */
452void inotify_inode_is_dead(struct inode *inode)
453{
454 struct inotify_watch *watch, *next;
455
456 mutex_lock(&inode->inotify_mutex);
457 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
458 struct inotify_handle *ih = watch->ih;
459 mutex_lock(&ih->mutex);
460 inotify_remove_watch_locked(ih, watch);
461 mutex_unlock(&ih->mutex);
462 }
463 mutex_unlock(&inode->inotify_mutex);
464}
465EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
466
467/* Kernel Consumer API */
468
469/**
470 * inotify_init - allocate and initialize an inotify instance
471 * @ops: caller's inotify operations
472 */
473struct inotify_handle *inotify_init(const struct inotify_operations *ops)
474{
475 struct inotify_handle *ih;
476
477 ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
478 if (unlikely(!ih))
479 return ERR_PTR(-ENOMEM);
480
481 idr_init(&ih->idr);
482 INIT_LIST_HEAD(&ih->watches);
483 mutex_init(&ih->mutex);
484 ih->last_wd = 0;
485 ih->in_ops = ops;
486 atomic_set(&ih->count, 0);
487 get_inotify_handle(ih);
488
489 return ih;
490}
491EXPORT_SYMBOL_GPL(inotify_init);
492
493/**
494 * inotify_init_watch - initialize an inotify watch
495 * @watch: watch to initialize
496 */
497void inotify_init_watch(struct inotify_watch *watch)
498{
499 INIT_LIST_HEAD(&watch->h_list);
500 INIT_LIST_HEAD(&watch->i_list);
501 atomic_set(&watch->count, 0);
502 get_inotify_watch(watch); /* initial get */
503}
504EXPORT_SYMBOL_GPL(inotify_init_watch);
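
Since struct inotify_watch is caller-allocated, consumers embed it in their own per-object state and recover the container with container_of() inside the handle_event callback; the callback's six-argument shape can be read off the in_ops->handle_event() call in inotify_unmount_inodes() above. A sketch with hypothetical names (struct my_watch, my_handle_event and my_destroy_watch are not in-tree):

	/* Hypothetical consumer-side embedding. */
	struct my_watch {
		struct inotify_watch watch;	/* set up via inotify_init_watch() */
		void *private;			/* consumer state */
	};

	static void my_handle_event(struct inotify_watch *watch, u32 wd,
				    u32 mask, u32 cookie, const char *name,
				    struct inode *inode)
	{
		struct my_watch *mw = container_of(watch, struct my_watch, watch);

		/* ... consume the event using mw->private ... */
	}

	static void my_destroy_watch(struct inotify_watch *watch)
	{
		kfree(container_of(watch, struct my_watch, watch));
	}

	static const struct inotify_operations my_ops = {
		.handle_event	= my_handle_event,
		.destroy_watch	= my_destroy_watch,
	};
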
505
506/*
507 * Watch removals suck violently. To kick the watch out we need (in this
508 * order) inode->inotify_mutex and ih->mutex. That's fine if we have
509 * a hold on inode; however, for all other cases we need to make damn sure
510 * we don't race with umount. We can *NOT* just grab a reference to a
511 * watch - inotify_unmount_inodes() will happily sail past it and we'll end
512 * up with a reference to an inode potentially outliving its superblock. Ideally
513 * we just want to grab an active reference to superblock if we can; that
514 * will make sure we won't go into inotify_unmount_inodes() until we are
515 * done. Cleanup is just deactivate_super(). However, that leaves a messy
516 * case - what if we *are* racing with umount() and active references to
517 * superblock can't be acquired anymore? We can bump ->s_count, grab
518 * ->s_umount, which will almost certainly wait until the superblock is shut
519 * down and the watch in question is pining for fjords. That's fine, but
520 * there is a problem - we might have hit the window between ->s_active
521 * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
522 * is past the point of no return and is heading for shutdown) and the
523 * moment when deactivate_super() acquires ->s_umount. We could just do
524 * drop_super() yield() and retry, but that's rather antisocial and this
525 * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
526 * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
527 * that we won't race with inotify_unmount_inodes(). So we could grab a
528 * reference to watch and do the rest as above, just with drop_super() instead
529 * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
530 * could grab ->s_umount. So the watch could've been gone already.
531 *
532 * That still can be dealt with - we need to save watch->wd, do idr_find()
533 * and compare its result with our pointer. If they match, we either have
534 * the damn thing still alive or we'd lost not one but two races at once,
535 * the watch had been killed and a new one got created with the same ->wd
536 * at the same address. That couldn't have happened in inotify_destroy(),
537 * but inotify_rm_wd() could run into that. Still, "new one got created"
538 * is not a problem - we have every right to kill it or leave it alone,
539 * whatever's more convenient.
540 *
541 * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
542 * "grab it and kill it" check. If it's been our original watch, we are
543 * fine, if it's a newcomer - nevermind, just pretend that we'd won the
544 * race and kill the fscker anyway; we are safe since we know that its
545 * superblock won't be going away.
546 *
547 * And yes, this is far beyond mere "not very pretty"; so's the entire
548 * concept of inotify to start with.
549 */
550
551/**
552 * pin_to_kill - pin the watch down for removal
553 * @ih: inotify handle
554 * @watch: watch to kill
555 *
556 * Called with ih->mutex held, drops it. Possible return values:
557 * 0 - nothing to do, it has died
558 * 1 - remove it, drop the reference and deactivate_super()
559 * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
560 * that variant, since it involved a lot of PITA, but that's the best that
561 * could've been done.
562 */
563static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
564{
565 struct super_block *sb = watch->inode->i_sb;
566 s32 wd = watch->wd;
567
568 spin_lock(&sb_lock);
569 if (sb->s_count >= S_BIAS) {
570 atomic_inc(&sb->s_active);
571 spin_unlock(&sb_lock);
572 get_inotify_watch(watch);
573 mutex_unlock(&ih->mutex);
574 return 1; /* the best outcome */
575 }
576 sb->s_count++;
577 spin_unlock(&sb_lock);
578 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
579 down_read(&sb->s_umount);
580 if (likely(!sb->s_root)) {
581 /* fs is already shut down; the watch is dead */
582 drop_super(sb);
583 return 0;
584 }
585 /* raced with the final deactivate_super() */
586 mutex_lock(&ih->mutex);
587 if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
588 /* the watch is dead */
589 mutex_unlock(&ih->mutex);
590 drop_super(sb);
591 return 0;
592 }
593 /* still alive or freed and reused with the same sb and wd; kill */
594 get_inotify_watch(watch);
595 mutex_unlock(&ih->mutex);
596 return 2;
597}
598
599static void unpin_and_kill(struct inotify_watch *watch, int how)
600{
601 struct super_block *sb = watch->inode->i_sb;
602 put_inotify_watch(watch);
603 switch (how) {
604 case 1:
605 deactivate_super(sb);
606 break;
607 case 2:
608 drop_super(sb);
609 }
610}
611
612/**
613 * inotify_destroy - clean up and destroy an inotify instance
614 * @ih: inotify handle
615 */
616void inotify_destroy(struct inotify_handle *ih)
617{
618 /*
619 * Destroy all of the watches for this handle. Unfortunately, not very
620 * pretty. We cannot do a simple iteration over the list, because we
621 * do not know the inode until we iterate to the watch. But we need to
622 * hold inode->inotify_mutex before ih->mutex. The following works.
623 *
624 * AV: it had to become even uglier to start working ;-/
625 */
626 while (1) {
627 struct inotify_watch *watch;
628 struct list_head *watches;
629 struct super_block *sb;
630 struct inode *inode;
631 int how;
632
633 mutex_lock(&ih->mutex);
634 watches = &ih->watches;
635 if (list_empty(watches)) {
636 mutex_unlock(&ih->mutex);
637 break;
638 }
639 watch = list_first_entry(watches, struct inotify_watch, h_list);
640 sb = watch->inode->i_sb;
641 how = pin_to_kill(ih, watch);
642 if (!how)
643 continue;
644
645 inode = watch->inode;
646 mutex_lock(&inode->inotify_mutex);
647 mutex_lock(&ih->mutex);
648
649 /* make sure we didn't race with another list removal */
650 if (likely(idr_find(&ih->idr, watch->wd))) {
651 remove_watch_no_event(watch, ih);
652 put_inotify_watch(watch);
653 }
654
655 mutex_unlock(&ih->mutex);
656 mutex_unlock(&inode->inotify_mutex);
657 unpin_and_kill(watch, how);
658 }
659
660 /* free this handle: the put matching the get in inotify_init() */
661 put_inotify_handle(ih);
662}
663EXPORT_SYMBOL_GPL(inotify_destroy);
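
For a consumer, the lifetime pairing is inotify_init() at setup and inotify_destroy() at teardown; the final put above then releases the handle once every watch's reference to it is gone. Continuing the hypothetical my_ops sketch from earlier:

	/* Hypothetical consumer lifetime; my_ops must outlive the handle. */
	static struct inotify_handle *my_ih;

	static int my_setup(void)
	{
		my_ih = inotify_init(&my_ops);
		return IS_ERR(my_ih) ? PTR_ERR(my_ih) : 0;
	}

	static void my_teardown(void)
	{
		inotify_destroy(my_ih);	/* kills every watch, drops the handle */
	}
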
664
665/**
666 * inotify_find_watch - find an existing watch for an (ih,inode) pair
667 * @ih: inotify handle
668 * @inode: inode to watch
669 * @watchp: pointer to existing inotify_watch
670 *
671 * Caller must pin given inode (via nameidata).
672 */
673s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
674 struct inotify_watch **watchp)
675{
676 struct inotify_watch *old;
677 int ret = -ENOENT;
678
679 mutex_lock(&inode->inotify_mutex);
680 mutex_lock(&ih->mutex);
681
682 old = inode_find_handle(inode, ih);
683 if (unlikely(old)) {
684 get_inotify_watch(old); /* caller must put watch */
685 *watchp = old;
686 ret = old->wd;
687 }
688
689 mutex_unlock(&ih->mutex);
690 mutex_unlock(&inode->inotify_mutex);
691
692 return ret;
693}
694EXPORT_SYMBOL_GPL(inotify_find_watch);
695
696/**
697 * inotify_find_update_watch - find and update the mask of an existing watch
698 * @ih: inotify handle
699 * @inode: inode's watch to update
700 * @mask: mask of events to watch
701 *
702 * Caller must pin given inode (via nameidata).
703 */
704s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
705 u32 mask)
706{
707 struct inotify_watch *old;
708 int mask_add = 0;
709 int ret;
710
711 if (mask & IN_MASK_ADD)
712 mask_add = 1;
713
714 /* don't allow invalid bits: we don't want flags set */
715 mask &= IN_ALL_EVENTS | IN_ONESHOT;
716 if (unlikely(!mask))
717 return -EINVAL;
718
719 mutex_lock(&inode->inotify_mutex);
720 mutex_lock(&ih->mutex);
721
722 /*
723 * Handle the case of re-adding a watch on an (inode,ih) pair that we
724 * are already watching. We just update the mask and return its wd.
725 */
726 old = inode_find_handle(inode, ih);
727 if (unlikely(!old)) {
728 ret = -ENOENT;
729 goto out;
730 }
731
732 if (mask_add)
733 old->mask |= mask;
734 else
735 old->mask = mask;
736 ret = old->wd;
737out:
738 mutex_unlock(&ih->mutex);
739 mutex_unlock(&inode->inotify_mutex);
740 return ret;
741}
742EXPORT_SYMBOL_GPL(inotify_find_update_watch);
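
IN_MASK_ADD selects union semantics instead of replacement, mirroring the flag of the same name in the inotify_add_watch(2) user interface. For example, with a pinned inode (values illustrative):

	/* Replace whatever mask the existing watch had: */
	wd = inotify_find_update_watch(ih, inode, IN_MODIFY);

	/* Or fold IN_ATTRIB into the existing mask: */
	wd = inotify_find_update_watch(ih, inode, IN_ATTRIB | IN_MASK_ADD);
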
743
744/**
745 * inotify_add_watch - add a watch to an inotify instance
746 * @ih: inotify handle
747 * @watch: caller allocated watch structure
748 * @inode: inode to watch
749 * @mask: mask of events to watch
750 *
751 * Caller must pin given inode (via nameidata).
752 * Caller must ensure it only calls inotify_add_watch() once per watch.
753 * Calls inotify_handle_get_wd() so may sleep.
754 */
755s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
756 struct inode *inode, u32 mask)
757{
758 int ret = 0;
759 int newly_watched;
760
761 /* don't allow invalid bits: we don't want flags set */
762 mask &= IN_ALL_EVENTS | IN_ONESHOT;
763 if (unlikely(!mask))
764 return -EINVAL;
765 watch->mask = mask;
766
767 mutex_lock(&inode->inotify_mutex);
768 mutex_lock(&ih->mutex);
769
770 /* Initialize a new watch */
771 ret = inotify_handle_get_wd(ih, watch);
772 if (unlikely(ret))
773 goto out;
774 ret = watch->wd;
775
776 /* save a reference to handle and bump the count to make it official */
777 get_inotify_handle(ih);
778 watch->ih = ih;
779
780 /*
781 * Save a reference to the inode and bump the ref count to make it
782 * official. We hold a reference to nameidata, which makes this safe.
783 */
784 watch->inode = igrab(inode);
785
786 /* Add the watch to the handle's and the inode's list */
787 newly_watched = !inotify_inode_watched(inode);
788 list_add(&watch->h_list, &ih->watches);
789 list_add(&watch->i_list, &inode->inotify_watches);
790 /*
791	 * Set child flags _after_ adding the watch, so there are no race
792 * windows where newly instantiated children could miss their parent's
793 * watched flag.
794 */
795 if (newly_watched)
796 set_dentry_child_flags(inode, 1);
797
798out:
799 mutex_unlock(&ih->mutex);
800 mutex_unlock(&inode->inotify_mutex);
801 return ret;
802}
803EXPORT_SYMBOL_GPL(inotify_add_watch);
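
Together with inotify_find_update_watch(), this yields the usual update-or-create flow: try to update an existing watch first, and only allocate and add a new one on -ENOENT. A sketch in which alloc_my_watch() and free_my_watch() are hypothetical consumer-specific helpers:

	/* Hypothetical helper; the caller must already have pinned inode. */
	static s32 example_watch_inode(struct inotify_handle *ih,
				       struct inode *inode, u32 mask)
	{
		struct inotify_watch *watch;
		s32 wd;

		wd = inotify_find_update_watch(ih, inode, mask);
		if (wd != -ENOENT)
			return wd;	/* updated an existing watch, or hard error */

		watch = alloc_my_watch();	/* hypothetical allocation */
		if (!watch)
			return -ENOMEM;
		inotify_init_watch(watch);
		wd = inotify_add_watch(ih, watch, inode, mask);
		if (wd < 0)
			free_my_watch(watch);	/* never went live; free directly */
		return wd;
	}
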
804
805/**
806 * inotify_clone_watch - put the watch next to existing one
807 * @old: already installed watch
808 * @new: new watch
809 *
810 * Caller must hold the inotify_mutex of inode we are dealing with;
811 * it is expected to remove the old watch before unlocking the inode.
812 */
813s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
814{
815 struct inotify_handle *ih = old->ih;
816 int ret = 0;
817
818 new->mask = old->mask;
819 new->ih = ih;
820
821 mutex_lock(&ih->mutex);
822
823 /* Initialize a new watch */
824 ret = inotify_handle_get_wd(ih, new);
825 if (unlikely(ret))
826 goto out;
827 ret = new->wd;
828
829 get_inotify_handle(ih);
830
831 new->inode = igrab(old->inode);
832
833 list_add(&new->h_list, &ih->watches);
834 list_add(&new->i_list, &old->inode->inotify_watches);
835out:
836 mutex_unlock(&ih->mutex);
837 return ret;
838}
839
840void inotify_evict_watch(struct inotify_watch *watch)
841{
842 get_inotify_watch(watch);
843 mutex_lock(&watch->ih->mutex);
844 inotify_remove_watch_locked(watch->ih, watch);
845 mutex_unlock(&watch->ih->mutex);
846}
847
848/**
849 * inotify_rm_wd - remove a watch from an inotify instance
850 * @ih: inotify handle
851 * @wd: watch descriptor to remove
852 *
853 * Can sleep.
854 */
855int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
856{
857 struct inotify_watch *watch;
858 struct super_block *sb;
859 struct inode *inode;
860 int how;
861
862 mutex_lock(&ih->mutex);
863 watch = idr_find(&ih->idr, wd);
864 if (unlikely(!watch)) {
865 mutex_unlock(&ih->mutex);
866 return -EINVAL;
867 }
868 sb = watch->inode->i_sb;
869 how = pin_to_kill(ih, watch);
870 if (!how)
871 return 0;
872
873 inode = watch->inode;
874
875 mutex_lock(&inode->inotify_mutex);
876 mutex_lock(&ih->mutex);
877
878 /* make sure that we did not race */
879 if (likely(idr_find(&ih->idr, wd) == watch))
880 inotify_remove_watch_locked(ih, watch);
881
882 mutex_unlock(&ih->mutex);
883 mutex_unlock(&inode->inotify_mutex);
884 unpin_and_kill(watch, how);
885
886 return 0;
887}
888EXPORT_SYMBOL_GPL(inotify_rm_wd);
889
890/**
891 * inotify_rm_watch - remove a watch from an inotify instance
892 * @ih: inotify handle
893 * @watch: watch to remove
894 *
895 * Can sleep.
896 */
897int inotify_rm_watch(struct inotify_handle *ih,
898 struct inotify_watch *watch)
899{
900 return inotify_rm_wd(ih, watch->wd);
901}
902EXPORT_SYMBOL_GPL(inotify_rm_watch);
903
904/*
905 * inotify_setup - core initialization function
906 */
907static int __init inotify_setup(void)
908{
909 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
910 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
911 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
912 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
913 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
914 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
915 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
916 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
917 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
918 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
919 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
920 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
921 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
922
923 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
924 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
925 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
926 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
927
928 atomic_set(&inotify_cookie, 0);
929
930 return 0;
931}
932
933module_init(inotify_setup);
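
The wall of BUILD_BUG_ON()s works because inotify deliberately reuses the FS_* bit values for its userspace-visible IN_* constants, so masks cross the fsnotify boundary with no translation table, and any future drift breaks the build rather than silently corrupting events. The same trick guards any compile-time invariant, for instance (a hypothetical check, to be placed in some init function):

	/* Hypothetical: the fixed header of an event is everything before name[]. */
	BUILD_BUG_ON(sizeof(struct inotify_event) !=
		     offsetof(struct inotify_event, name));
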
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index f234f3a4c8ca..b6642e4de4bf 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -9,13 +9,12 @@ struct inotify_event_private_data {
9 int wd; 9 int wd;
10}; 10};
11 11
12struct inotify_inode_mark_entry { 12struct inotify_inode_mark {
13 /* fsnotify_mark_entry MUST be the first thing */ 13 struct fsnotify_mark fsn_mark;
14 struct fsnotify_mark_entry fsn_entry;
15 int wd; 14 int wd;
16}; 15};
17 16
18extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 17extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
19 struct fsnotify_group *group); 18 struct fsnotify_group *group);
20extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
21 20
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e27960cd76ab..a91b69a6a291 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -22,6 +22,7 @@
22 * General Public License for more details. 22 * General Public License for more details.
23 */ 23 */
24 24
25#include <linux/dcache.h> /* d_unlinked */
25#include <linux/fs.h> /* struct inode */ 26#include <linux/fs.h> /* struct inode */
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
27#include <linux/inotify.h> 28#include <linux/inotify.h>
@@ -32,26 +33,84 @@
32 33
33#include "inotify.h" 34#include "inotify.h"
34 35
35static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) 36/*
37 * Check if 2 events contain the same information. We do not compare private data
38 * but at this moment that isn't a problem for any known fsnotify listeners.
39 */
40static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
41{
42 if ((old->mask == new->mask) &&
43 (old->to_tell == new->to_tell) &&
44 (old->data_type == new->data_type) &&
45 (old->name_len == new->name_len)) {
46 switch (old->data_type) {
47 case (FSNOTIFY_EVENT_INODE):
48 /* remember, after old was put on the wait_q we aren't
49 * allowed to look at the inode any more, only thing
50 * left to check was if the file_name is the same */
51 if (!old->name_len ||
52 !strcmp(old->file_name, new->file_name))
53 return true;
54 break;
55 case (FSNOTIFY_EVENT_PATH):
56 if ((old->path.mnt == new->path.mnt) &&
57 (old->path.dentry == new->path.dentry))
58 return true;
59 break;
60 case (FSNOTIFY_EVENT_NONE):
61 if (old->mask & FS_Q_OVERFLOW)
62 return true;
63 else if (old->mask & FS_IN_IGNORED)
64 return false;
65 return true;
66 };
67 }
68 return false;
69}
70
71static struct fsnotify_event *inotify_merge(struct list_head *list,
72 struct fsnotify_event *event)
36{ 73{
37 struct fsnotify_mark_entry *entry; 74 struct fsnotify_event_holder *last_holder;
38 struct inotify_inode_mark_entry *ientry; 75 struct fsnotify_event *last_event;
76
77 /* and the list better be locked by something too */
78 spin_lock(&event->lock);
79
80 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
81 last_event = last_holder->event;
82 if (event_compare(last_event, event))
83 fsnotify_get_event(last_event);
84 else
85 last_event = NULL;
86
87 spin_unlock(&event->lock);
88
89 return last_event;
90}
91
92static int inotify_handle_event(struct fsnotify_group *group,
93 struct fsnotify_mark *inode_mark,
94 struct fsnotify_mark *vfsmount_mark,
95 struct fsnotify_event *event)
96{
97 struct inotify_inode_mark *i_mark;
39 struct inode *to_tell; 98 struct inode *to_tell;
40 struct inotify_event_private_data *event_priv; 99 struct inotify_event_private_data *event_priv;
41 struct fsnotify_event_private_data *fsn_event_priv; 100 struct fsnotify_event_private_data *fsn_event_priv;
42 int wd, ret; 101 struct fsnotify_event *added_event;
102 int wd, ret = 0;
103
104 BUG_ON(vfsmount_mark);
105
106 pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
107 event, event->to_tell, event->mask);
43 108
44 to_tell = event->to_tell; 109 to_tell = event->to_tell;
45 110
46 spin_lock(&to_tell->i_lock); 111 i_mark = container_of(inode_mark, struct inotify_inode_mark,
47 entry = fsnotify_find_mark_entry(group, to_tell); 112 fsn_mark);
48 spin_unlock(&to_tell->i_lock); 113 wd = i_mark->wd;
49	/* race with watch removal? We already passed should_send */
50 if (unlikely(!entry))
51 return 0;
52 ientry = container_of(entry, struct inotify_inode_mark_entry,
53 fsn_entry);
54 wd = ientry->wd;
55 114
56 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 115 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
57 if (unlikely(!event_priv)) 116 if (unlikely(!event_priv))
@@ -62,48 +121,40 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
62 fsn_event_priv->group = group; 121 fsn_event_priv->group = group;
63 event_priv->wd = wd; 122 event_priv->wd = wd;
64 123
65 ret = fsnotify_add_notify_event(group, event, fsn_event_priv); 124 added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
66 if (ret) { 125 if (added_event) {
67 inotify_free_event_priv(fsn_event_priv); 126 inotify_free_event_priv(fsn_event_priv);
68 /* EEXIST says we tail matched, EOVERFLOW isn't something 127 if (!IS_ERR(added_event))
69 * to report up the stack. */ 128 fsnotify_put_event(added_event);
70 if ((ret == -EEXIST) || 129 else
71 (ret == -EOVERFLOW)) 130 ret = PTR_ERR(added_event);
72 ret = 0;
73 } 131 }
74 132
75 /* 133 if (inode_mark->mask & IN_ONESHOT)
76 * If we hold the entry until after the event is on the queue 134 fsnotify_destroy_mark(inode_mark);
77 * IN_IGNORED won't be able to pass this event in the queue
78 */
79 fsnotify_put_mark(entry);
80 135
81 return ret; 136 return ret;
82} 137}
83 138
84static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) 139static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
85{ 140{
86 inotify_ignored_and_remove_idr(entry, group); 141 inotify_ignored_and_remove_idr(fsn_mark, group);
87} 142}
88 143
89static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) 144static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
145 struct fsnotify_mark *inode_mark,
146 struct fsnotify_mark *vfsmount_mark,
147 __u32 mask, void *data, int data_type)
90{ 148{
91 struct fsnotify_mark_entry *entry; 149 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
92 bool send; 150 (data_type == FSNOTIFY_EVENT_PATH)) {
93 151 struct path *path = data;
94 spin_lock(&inode->i_lock);
95 entry = fsnotify_find_mark_entry(group, inode);
96 spin_unlock(&inode->i_lock);
97 if (!entry)
98 return false;
99 152
100 mask = (mask & ~FS_EVENT_ON_CHILD); 153 if (d_unlinked(path->dentry))
101 send = (entry->mask & mask); 154 return false;
102 155 }
103 /* find took a reference */
104 fsnotify_put_mark(entry);
105 156
106 return send; 157 return true;
107} 158}
108 159
109/* 160/*
@@ -115,18 +166,18 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
115 */ 166 */
116static int idr_callback(int id, void *p, void *data) 167static int idr_callback(int id, void *p, void *data)
117{ 168{
118 struct fsnotify_mark_entry *entry; 169 struct fsnotify_mark *fsn_mark;
119 struct inotify_inode_mark_entry *ientry; 170 struct inotify_inode_mark *i_mark;
120 static bool warned = false; 171 static bool warned = false;
121 172
122 if (warned) 173 if (warned)
123 return 0; 174 return 0;
124 175
125 warned = true; 176 warned = true;
126 entry = p; 177 fsn_mark = p;
127 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 178 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
128 179
129 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " 180 WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
130 "idr. Probably leaking memory\n", id, p, data); 181 "idr. Probably leaking memory\n", id, p, data);
131 182
132 /* 183 /*
@@ -135,9 +186,9 @@ static int idr_callback(int id, void *p, void *data)
135 * out why we got here and the panic is no worse than the original 186 * out why we got here and the panic is no worse than the original
136 * BUG() that was here. 187 * BUG() that was here.
137 */ 188 */
138 if (entry) 189 if (fsn_mark)
139 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", 190 printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
140 entry->group, entry->inode, ientry->wd); 191 fsn_mark->group, fsn_mark->i.inode, i_mark->wd);
141 return 0; 192 return 0;
142} 193}
143 194
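
The merge hook added above gives inotify O(1) event coalescing: the core calls it with the notification list locked and non-empty, only the most recently queued event is ever a candidate, and a non-NULL return tells the caller to drop the new event in favor of the old one. Stripped to its shape (a sketch of the contract, not the exact locking):

	/* Sketch; event_compare() is the helper introduced above. */
	static struct fsnotify_event *tail_merge(struct list_head *list,
						 struct fsnotify_event *new)
	{
		struct fsnotify_event_holder *h;
		struct fsnotify_event *last;

		h = list_entry(list->prev, struct fsnotify_event_holder,
			       event_list);
		last = h->event;
		if (!event_compare(last, new))
			return NULL;		/* no match: queue new normally */
		fsnotify_get_event(last);	/* caller puts this reference */
		return last;			/* new event is never queued */
	}
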
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e46ca685b9be..bf7f6d776c31 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -46,17 +46,11 @@
46/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
47static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
48static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
49int inotify_max_user_watches __read_mostly; 49static int inotify_max_user_watches __read_mostly;
50 50
51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
52struct kmem_cache *event_priv_cachep __read_mostly; 52struct kmem_cache *event_priv_cachep __read_mostly;
53 53
54/*
55 * When inotify registers a new group it increments this and uses that
56 * value as an offset to set the fsnotify group "name" and priority.
57 */
58static atomic_t inotify_grp_num;
59
60#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
61 55
62#include <linux/sysctl.h> 56#include <linux/sysctl.h>
@@ -96,11 +90,14 @@ static inline __u32 inotify_arg_to_mask(u32 arg)
96{ 90{
97 __u32 mask; 91 __u32 mask;
98 92
99 /* everything should accept their own ignored and cares about children */ 93 /*
100 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); 94 * everything should accept their own ignored, cares about children,
95 * and should receive events when the inode is unmounted
96 */
97 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
101 98
102 /* mask off the flags used to open the fd */ 99 /* mask off the flags used to open the fd */
103 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); 100 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
104 101
105 return mask; 102 return mask;
106} 103}
@@ -144,6 +141,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
144 141
145 event = fsnotify_peek_notify_event(group); 142 event = fsnotify_peek_notify_event(group);
146 143
144 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
145
147 if (event->name_len) 146 if (event->name_len)
148 event_size += roundup(event->name_len + 1, event_size); 147 event_size += roundup(event->name_len + 1, event_size);
149 148
@@ -173,6 +172,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
173 size_t event_size = sizeof(struct inotify_event); 172 size_t event_size = sizeof(struct inotify_event);
174 size_t name_len = 0; 173 size_t name_len = 0;
175 174
175 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
176
176 /* we get the inotify watch descriptor from the event private data */ 177 /* we get the inotify watch descriptor from the event private data */
177 spin_lock(&event->lock); 178 spin_lock(&event->lock);
178 fsn_priv = fsnotify_remove_priv_from_event(group, event); 179 fsn_priv = fsnotify_remove_priv_from_event(group, event);
@@ -245,6 +246,8 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
245 kevent = get_one_event(group, count); 246 kevent = get_one_event(group, count);
246 mutex_unlock(&group->notification_mutex); 247 mutex_unlock(&group->notification_mutex);
247 248
249 pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
250
248 if (kevent) { 251 if (kevent) {
249 ret = PTR_ERR(kevent); 252 ret = PTR_ERR(kevent);
250 if (IS_ERR(kevent)) 253 if (IS_ERR(kevent))
@@ -289,6 +292,8 @@ static int inotify_release(struct inode *ignored, struct file *file)
289 struct fsnotify_group *group = file->private_data; 292 struct fsnotify_group *group = file->private_data;
290 struct user_struct *user = group->inotify_data.user; 293 struct user_struct *user = group->inotify_data.user;
291 294
295 pr_debug("%s: group=%p\n", __func__, group);
296
292 fsnotify_clear_marks_by_group(group); 297 fsnotify_clear_marks_by_group(group);
293 298
294 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 299 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
@@ -312,6 +317,8 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
312 group = file->private_data; 317 group = file->private_data;
313 p = (void __user *) arg; 318 p = (void __user *) arg;
314 319
320 pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
321
315 switch (cmd) { 322 switch (cmd) {
316 case FIONREAD: 323 case FIONREAD:
317 mutex_lock(&group->notification_mutex); 324 mutex_lock(&group->notification_mutex);
@@ -357,59 +364,159 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
357 return error; 364 return error;
358} 365}
359 366
367static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
368 int *last_wd,
369 struct inotify_inode_mark *i_mark)
370{
371 int ret;
372
373 do {
374 if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
375 return -ENOMEM;
376
377 spin_lock(idr_lock);
378 ret = idr_get_new_above(idr, i_mark, *last_wd + 1,
379 &i_mark->wd);
380 /* we added the mark to the idr, take a reference */
381 if (!ret) {
382 *last_wd = i_mark->wd;
383 fsnotify_get_mark(&i_mark->fsn_mark);
384 }
385 spin_unlock(idr_lock);
386 } while (ret == -EAGAIN);
387
388 return ret;
389}
390
391static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
392 int wd)
393{
394 struct idr *idr = &group->inotify_data.idr;
395 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
396 struct inotify_inode_mark *i_mark;
397
398 assert_spin_locked(idr_lock);
399
400 i_mark = idr_find(idr, wd);
401 if (i_mark) {
402 struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark;
403
404 fsnotify_get_mark(fsn_mark);
405 /* One ref for being in the idr, one ref we just took */
406 BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
407 }
408
409 return i_mark;
410}
411
412static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
413 int wd)
414{
415 struct inotify_inode_mark *i_mark;
416 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
417
418 spin_lock(idr_lock);
419 i_mark = inotify_idr_find_locked(group, wd);
420 spin_unlock(idr_lock);
421
422 return i_mark;
423}
424
425static void do_inotify_remove_from_idr(struct fsnotify_group *group,
426 struct inotify_inode_mark *i_mark)
427{
428 struct idr *idr = &group->inotify_data.idr;
429 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
430 int wd = i_mark->wd;
431
432 assert_spin_locked(idr_lock);
433
434 idr_remove(idr, wd);
435
436 /* removed from the idr, drop that ref */
437 fsnotify_put_mark(&i_mark->fsn_mark);
438}
439
360/* 440/*
361 * Remove the mark from the idr (if present) and drop the reference 441 * Remove the mark from the idr (if present) and drop the reference
362 * on the mark because it was in the idr. 442 * on the mark because it was in the idr.
363 */ 443 */
364static void inotify_remove_from_idr(struct fsnotify_group *group, 444static void inotify_remove_from_idr(struct fsnotify_group *group,
365 struct inotify_inode_mark_entry *ientry) 445 struct inotify_inode_mark *i_mark)
366{ 446{
367 struct idr *idr; 447 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
368 struct fsnotify_mark_entry *entry; 448 struct inotify_inode_mark *found_i_mark = NULL;
369 struct inotify_inode_mark_entry *found_ientry;
370 int wd; 449 int wd;
371 450
372 spin_lock(&group->inotify_data.idr_lock); 451 spin_lock(idr_lock);
373 idr = &group->inotify_data.idr; 452 wd = i_mark->wd;
374 wd = ientry->wd;
375 453
376 if (wd == -1) 454 /*
455 * does this i_mark think it is in the idr? we shouldn't get called
456 * if it wasn't....
457 */
458 if (wd == -1) {
459 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
460 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
461 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
377 goto out; 462 goto out;
463 }
378 464
379 entry = idr_find(&group->inotify_data.idr, wd); 465 /* Lets look in the idr to see if we find it */
380 if (unlikely(!entry)) 466 found_i_mark = inotify_idr_find_locked(group, wd);
467 if (unlikely(!found_i_mark)) {
468 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
469 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
470 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
381 goto out; 471 goto out;
472 }
382 473
383 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 474 /*
384 if (unlikely(found_ientry != ientry)) { 475 * We found an mark in the idr at the right wd, but it's
385 /* We found an entry in the idr with the right wd, but it's 476 * not the mark we were told to remove. eparis seriously
386 * not the entry we were told to remove. eparis seriously 477 * fucked up somewhere.
387 * fucked up somewhere. */ 478 */
388 WARN_ON(1); 479 if (unlikely(found_i_mark != i_mark)) {
389 ientry->wd = -1; 480 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
481 "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
482 "found_i_mark->group=%p found_i_mark->inode=%p\n",
483 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
484 i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd,
485 found_i_mark->fsn_mark.group,
486 found_i_mark->fsn_mark.i.inode);
390 goto out; 487 goto out;
391 } 488 }
392 489
393 /* One ref for being in the idr, one ref held by the caller */ 490 /*
394 BUG_ON(atomic_read(&entry->refcnt) < 2); 491 * One ref for being in the idr
395 492 * one ref held by the caller trying to kill us
396 idr_remove(idr, wd); 493 * one ref grabbed by inotify_idr_find
397 ientry->wd = -1; 494 */
495 if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
496 printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
497 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
498 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
499 /* we can't really recover with bad ref cnting.. */
500 BUG();
501 }
398 502
399 /* removed from the idr, drop that ref */ 503 do_inotify_remove_from_idr(group, i_mark);
400 fsnotify_put_mark(entry);
401out: 504out:
402 spin_unlock(&group->inotify_data.idr_lock); 505 /* match the ref taken by inotify_idr_find_locked() */
506 if (found_i_mark)
507 fsnotify_put_mark(&found_i_mark->fsn_mark);
508 i_mark->wd = -1;
509 spin_unlock(idr_lock);
403} 510}
404 511
405/* 512/*
406 * Send IN_IGNORED for this wd, remove this wd from the idr. 513 * Send IN_IGNORED for this wd, remove this wd from the idr.
407 */ 514 */
408void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 515void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
409 struct fsnotify_group *group) 516 struct fsnotify_group *group)
410{ 517{
411 struct inotify_inode_mark_entry *ientry; 518 struct inotify_inode_mark *i_mark;
412 struct fsnotify_event *ignored_event; 519 struct fsnotify_event *ignored_event, *notify_event;
413 struct inotify_event_private_data *event_priv; 520 struct inotify_event_private_data *event_priv;
414 struct fsnotify_event_private_data *fsn_event_priv; 521 struct fsnotify_event_private_data *fsn_event_priv;
415 int ret; 522 int ret;
@@ -420,7 +527,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
420 if (!ignored_event) 527 if (!ignored_event)
421 return; 528 return;
422 529
423 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 530 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
424 531
425 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); 532 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
426 if (unlikely(!event_priv)) 533 if (unlikely(!event_priv))
@@ -429,37 +536,44 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
429 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 536 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
430 537
431 fsn_event_priv->group = group; 538 fsn_event_priv->group = group;
432 event_priv->wd = ientry->wd; 539 event_priv->wd = i_mark->wd;
433 540
434 ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); 541 notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
435 if (ret) 542 if (notify_event) {
543 if (IS_ERR(notify_event))
544 ret = PTR_ERR(notify_event);
545 else
546 fsnotify_put_event(notify_event);
436 inotify_free_event_priv(fsn_event_priv); 547 inotify_free_event_priv(fsn_event_priv);
548 }
437 549
438skip_send_ignore: 550skip_send_ignore:
439 551
440 /* matches the reference taken when the event was created */ 552 /* matches the reference taken when the event was created */
441 fsnotify_put_event(ignored_event); 553 fsnotify_put_event(ignored_event);
442 554
443 /* remove this entry from the idr */ 555 /* remove this mark from the idr */
444 inotify_remove_from_idr(group, ientry); 556 inotify_remove_from_idr(group, i_mark);
445 557
446 atomic_dec(&group->inotify_data.user->inotify_watches); 558 atomic_dec(&group->inotify_data.user->inotify_watches);
447} 559}
448 560
449/* ding dong the mark is dead */ 561/* ding dong the mark is dead */
450static void inotify_free_mark(struct fsnotify_mark_entry *entry) 562static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
451{ 563{
452 struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; 564 struct inotify_inode_mark *i_mark;
565
566 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
453 567
454 kmem_cache_free(inotify_inode_mark_cachep, ientry); 568 kmem_cache_free(inotify_inode_mark_cachep, i_mark);
455} 569}
456 570
457static int inotify_update_existing_watch(struct fsnotify_group *group, 571static int inotify_update_existing_watch(struct fsnotify_group *group,
458 struct inode *inode, 572 struct inode *inode,
459 u32 arg) 573 u32 arg)
460{ 574{
461 struct fsnotify_mark_entry *entry; 575 struct fsnotify_mark *fsn_mark;
462 struct inotify_inode_mark_entry *ientry; 576 struct inotify_inode_mark *i_mark;
463 __u32 old_mask, new_mask; 577 __u32 old_mask, new_mask;
464 __u32 mask; 578 __u32 mask;
465 int add = (arg & IN_MASK_ADD); 579 int add = (arg & IN_MASK_ADD);
@@ -467,52 +581,43 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
467 581
468 /* don't allow invalid bits: we don't want flags set */ 582 /* don't allow invalid bits: we don't want flags set */
469 mask = inotify_arg_to_mask(arg); 583 mask = inotify_arg_to_mask(arg);
470 if (unlikely(!mask)) 584 if (unlikely(!(mask & IN_ALL_EVENTS)))
471 return -EINVAL; 585 return -EINVAL;
472 586
473 spin_lock(&inode->i_lock); 587 fsn_mark = fsnotify_find_inode_mark(group, inode);
474 entry = fsnotify_find_mark_entry(group, inode); 588 if (!fsn_mark)
475 spin_unlock(&inode->i_lock);
476 if (!entry)
477 return -ENOENT; 589 return -ENOENT;
478 590
479 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 591 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
480 592
481 spin_lock(&entry->lock); 593 spin_lock(&fsn_mark->lock);
482 594
483 old_mask = entry->mask; 595 old_mask = fsn_mark->mask;
484 if (add) { 596 if (add)
485 entry->mask |= mask; 597 fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
486 new_mask = entry->mask; 598 else
487 } else { 599 fsnotify_set_mark_mask_locked(fsn_mark, mask);
488 entry->mask = mask; 600 new_mask = fsn_mark->mask;
489 new_mask = entry->mask;
490 }
491 601
492 spin_unlock(&entry->lock); 602 spin_unlock(&fsn_mark->lock);
493 603
494 if (old_mask != new_mask) { 604 if (old_mask != new_mask) {
495 /* more bits in old than in new? */ 605 /* more bits in old than in new? */
496 int dropped = (old_mask & ~new_mask); 606 int dropped = (old_mask & ~new_mask);
497 /* more bits in this entry than the inode's mask? */ 607 /* more bits in this fsn_mark than the inode's mask? */
498 int do_inode = (new_mask & ~inode->i_fsnotify_mask); 608 int do_inode = (new_mask & ~inode->i_fsnotify_mask);
499 /* more bits in this entry than the group? */
500 int do_group = (new_mask & ~group->mask);
501 609
502 /* update the inode with this new entry */ 610 /* update the inode with this new fsn_mark */
503 if (dropped || do_inode) 611 if (dropped || do_inode)
504 fsnotify_recalc_inode_mask(inode); 612 fsnotify_recalc_inode_mask(inode);
505 613
506 /* update the group mask with the new mask */
507 if (dropped || do_group)
508 fsnotify_recalc_group_mask(group);
509 } 614 }
510 615
511 /* return the wd */ 616 /* return the wd */
512 ret = ientry->wd; 617 ret = i_mark->wd;
513 618
514 /* match the get from fsnotify_find_mark_entry() */ 619 /* match the get from fsnotify_find_mark() */
515 fsnotify_put_mark(entry); 620 fsnotify_put_mark(fsn_mark);
516 621
517 return ret; 622 return ret;
518} 623}
@@ -521,73 +626,51 @@ static int inotify_new_watch(struct fsnotify_group *group,
521 struct inode *inode, 626 struct inode *inode,
522 u32 arg) 627 u32 arg)
523{ 628{
524 struct inotify_inode_mark_entry *tmp_ientry; 629 struct inotify_inode_mark *tmp_i_mark;
525 __u32 mask; 630 __u32 mask;
526 int ret; 631 int ret;
632 struct idr *idr = &group->inotify_data.idr;
633 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
527 634
528 /* don't allow invalid bits: we don't want flags set */ 635 /* don't allow invalid bits: we don't want flags set */
529 mask = inotify_arg_to_mask(arg); 636 mask = inotify_arg_to_mask(arg);
530 if (unlikely(!mask)) 637 if (unlikely(!(mask & IN_ALL_EVENTS)))
531 return -EINVAL; 638 return -EINVAL;
532 639
533 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); 640 tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
534 if (unlikely(!tmp_ientry)) 641 if (unlikely(!tmp_i_mark))
535 return -ENOMEM; 642 return -ENOMEM;
536 643
537 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); 644 fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
538 tmp_ientry->fsn_entry.mask = mask; 645 tmp_i_mark->fsn_mark.mask = mask;
539 tmp_ientry->wd = -1; 646 tmp_i_mark->wd = -1;
540 647
541 ret = -ENOSPC; 648 ret = -ENOSPC;
542 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) 649 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
543 goto out_err; 650 goto out_err;
544retry:
545 ret = -ENOMEM;
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err;
548 651
549 /* we are putting the mark on the idr, take a reference */ 652 ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
550 fsnotify_get_mark(&tmp_ientry->fsn_entry); 653 tmp_i_mark);
551 654 if (ret)
552 spin_lock(&group->inotify_data.idr_lock);
553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
554 group->inotify_data.last_wd+1,
555 &tmp_ientry->wd);
556 spin_unlock(&group->inotify_data.idr_lock);
557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
561 /* idr was out of memory allocate and try again */
562 if (ret == -EAGAIN)
563 goto retry;
564 goto out_err; 655 goto out_err;
565 }
566 656
567 /* we are on the idr, now get on the inode */ 657 /* we are on the idr, now get on the inode */
568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 658 ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
569 if (ret) { 659 if (ret) {
570 /* we failed to get on the inode, get off the idr */ 660 /* we failed to get on the inode, get off the idr */
571 inotify_remove_from_idr(group, tmp_ientry); 661 inotify_remove_from_idr(group, tmp_i_mark);
572 goto out_err; 662 goto out_err;
573 } 663 }
574 664
575 /* update the idr hint, who cares about races, it's just a hint */
576 group->inotify_data.last_wd = tmp_ientry->wd;
577
578 /* increment the number of watches the user has */ 665 /* increment the number of watches the user has */
579 atomic_inc(&group->inotify_data.user->inotify_watches); 666 atomic_inc(&group->inotify_data.user->inotify_watches);
580 667
581 /* return the watch descriptor for this new entry */ 668 /* return the watch descriptor for this new mark */
582 ret = tmp_ientry->wd; 669 ret = tmp_i_mark->wd;
583
584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group);
587 670
588out_err: 671out_err:
589 /* match the ref from fsnotify_init_markentry() */ 672 /* match the ref from fsnotify_init_mark() */
590 fsnotify_put_mark(&tmp_ientry->fsn_entry); 673 fsnotify_put_mark(&tmp_i_mark->fsn_mark);
591 674
592 return ret; 675 return ret;
593} 676}
@@ -616,11 +699,8 @@ retry:
616static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) 699static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
617{ 700{
618 struct fsnotify_group *group; 701 struct fsnotify_group *group;
619 unsigned int grp_num;
620 702
621 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ 703 group = fsnotify_alloc_group(&inotify_fsnotify_ops);
622 grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
623 group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
624 if (IS_ERR(group)) 704 if (IS_ERR(group))
625 return group; 705 return group;
626 706
@@ -726,7 +806,7 @@ fput_and_out:
726SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) 806SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
727{ 807{
728 struct fsnotify_group *group; 808 struct fsnotify_group *group;
729 struct fsnotify_mark_entry *entry; 809 struct inotify_inode_mark *i_mark;
730 struct file *filp; 810 struct file *filp;
731 int ret = 0, fput_needed; 811 int ret = 0, fput_needed;
732 812
@@ -735,25 +815,23 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
735 return -EBADF; 815 return -EBADF;
736 816
737 /* verify that this is indeed an inotify instance */ 817 /* verify that this is indeed an inotify instance */
738 if (unlikely(filp->f_op != &inotify_fops)) { 818 ret = -EINVAL;
739 ret = -EINVAL; 819 if (unlikely(filp->f_op != &inotify_fops))
740 goto out; 820 goto out;
741 }
742 821
743 group = filp->private_data; 822 group = filp->private_data;
744 823
745 spin_lock(&group->inotify_data.idr_lock); 824 ret = -EINVAL;
746 entry = idr_find(&group->inotify_data.idr, wd); 825 i_mark = inotify_idr_find(group, wd);
747 if (unlikely(!entry)) { 826 if (unlikely(!i_mark))
748 spin_unlock(&group->inotify_data.idr_lock);
749 ret = -EINVAL;
750 goto out; 827 goto out;
751 }
752 fsnotify_get_mark(entry);
753 spin_unlock(&group->inotify_data.idr_lock);
754 828
755 fsnotify_destroy_mark_by_entry(entry); 829 ret = 0;
756 fsnotify_put_mark(entry); 830
831 fsnotify_destroy_mark(&i_mark->fsn_mark);
832
833 /* match ref taken by inotify_idr_find */
834 fsnotify_put_mark(&i_mark->fsn_mark);
757 835
758out: 836out:
759 fput_light(filp, fput_needed); 837 fput_light(filp, fput_needed);
@@ -767,7 +845,28 @@ out:
767 */ 845 */
768static int __init inotify_user_setup(void) 846static int __init inotify_user_setup(void)
769{ 847{
770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 848 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
849 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
850 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
851 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
852 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
853 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
854 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
855 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
856 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
857 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
858 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
859 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
860 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
863 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
865 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
866
867 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
868
869 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 870 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
772 871
773 inotify_max_queued_events = 16384; 872 inotify_max_queued_events = 16384;
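
inotify_add_to_idr(), introduced above, is the standard two-phase idr insertion of this kernel generation: idr_pre_get() preallocates with GFP_KERNEL outside the spinlock, idr_get_new_above() consumes the preallocation under it, and -EAGAIN means another path consumed it first, so loop. The generic shape, with my_idr, my_lock, last_id and object all placeholders:

	/* Generic idr insertion pattern of this era; all names hypothetical. */
	int ret, id;

	do {
		if (!idr_pre_get(&my_idr, GFP_KERNEL))
			return -ENOMEM;		/* genuinely out of memory */
		spin_lock(&my_lock);
		ret = idr_get_new_above(&my_idr, object, last_id + 1, &id);
		if (!ret)
			last_id = id;		/* hint for the next insert */
		spin_unlock(&my_lock);
	} while (ret == -EAGAIN);		/* preallocation raced away */
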
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
new file mode 100644
index 000000000000..325185e514bb
--- /dev/null
+++ b/fs/notify/mark.c
@@ -0,0 +1,371 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriate locks, can take a reference
27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * mark->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * mark->lock protects 2 things, mark->group and mark->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the marks_list anchored inside a given group
42 * and each mark is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
47 * given inode and each mark is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list. At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h>
86#include <linux/init.h>
87#include <linux/kernel.h>
88#include <linux/kthread.h>
89#include <linux/module.h>
90#include <linux/mutex.h>
91#include <linux/slab.h>
92#include <linux/spinlock.h>
93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95
96#include <asm/atomic.h>
97
98#include <linux/fsnotify_backend.h>
99#include "fsnotify.h"
100
101struct srcu_struct fsnotify_mark_srcu;
102static DEFINE_SPINLOCK(destroy_lock);
103static LIST_HEAD(destroy_list);
104static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
105
106void fsnotify_get_mark(struct fsnotify_mark *mark)
107{
108 atomic_inc(&mark->refcnt);
109}
110
111void fsnotify_put_mark(struct fsnotify_mark *mark)
112{
113 if (atomic_dec_and_test(&mark->refcnt))
114 mark->free_mark(mark);
115}
116
117/*
118 * Any time a mark is getting freed we end up here.
119 * The caller had better be holding a reference to this mark so we don't actually
120 * do the final put under the mark->lock
121 */
122void fsnotify_destroy_mark(struct fsnotify_mark *mark)
123{
124 struct fsnotify_group *group;
125 struct inode *inode = NULL;
126
127 spin_lock(&mark->lock);
128
129 group = mark->group;
130
131 /* something else already called this function on this mark */
132 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
133 spin_unlock(&mark->lock);
134 return;
135 }
136
137 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
138
139 /* 1 from caller and 1 for being on i_list/g_list */
140 BUG_ON(atomic_read(&mark->refcnt) < 2);
141
142 spin_lock(&group->mark_lock);
143
144 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
145 inode = mark->i.inode;
146 fsnotify_destroy_inode_mark(mark);
147 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
148 fsnotify_destroy_vfsmount_mark(mark);
149 else
150 BUG();
151
152 list_del_init(&mark->g_list);
153
154 spin_unlock(&group->mark_lock);
155 spin_unlock(&mark->lock);
156
157 spin_lock(&destroy_lock);
158 list_add(&mark->destroy_list, &destroy_list);
159 spin_unlock(&destroy_lock);
160 wake_up(&destroy_waitq);
161
162 /*
163 * Some groups like to know that marks are being freed. This is a
164 * callback to the group function to let it know that this mark
165 * is being freed.
166 */
167 if (group->ops->freeing_mark)
168 group->ops->freeing_mark(mark, group);
169
170 /*
171 * __fsnotify_update_child_dentry_flags(inode);
172 *
173 * I really want to call that, but we can't, we have no idea if the inode
174 * still exists the second we drop the mark->lock.
175 *
176	 * The next time an event arrives at this inode from one of its children
177	 * __fsnotify_parent will see that the inode doesn't care about its
178 * children and will update all of these flags then. So really this
179 * is just a lazy update (and could be a perf win...)
180 */
181
182 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
183 iput(inode);
184
185 /*
186 * it's possible that this group tried to destroy itself, but this
187	 * mark was simultaneously being freed by inode.  If that's the
188 * case, we finish freeing the group here.
189 */
190 if (unlikely(atomic_dec_and_test(&group->num_marks)))
191 fsnotify_final_destroy_group(group);
192}
193
194void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
195{
196 assert_spin_locked(&mark->lock);
197
198 mark->mask = mask;
199
200 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
201 fsnotify_set_inode_mark_mask_locked(mark, mask);
202}
203
204void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
205{
206 assert_spin_locked(&mark->lock);
207
208 mark->ignored_mask = mask;
209}
210
211/*
212 * Attach an initialized mark to a given group and fs object.
213 * These marks may be used for the fsnotify backend to determine which
214 * event types should be delivered to which group.
215 */
216int fsnotify_add_mark(struct fsnotify_mark *mark,
217 struct fsnotify_group *group, struct inode *inode,
218 struct vfsmount *mnt, int allow_dups)
219{
220 int ret = 0;
221
222 BUG_ON(inode && mnt);
223 BUG_ON(!inode && !mnt);
224
225 /*
226 * LOCKING ORDER!!!!
227 * mark->lock
228 * group->mark_lock
229 * inode->i_lock
230 */
231 spin_lock(&mark->lock);
232 spin_lock(&group->mark_lock);
233
234 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
235
236 mark->group = group;
237 list_add(&mark->g_list, &group->marks_list);
238 atomic_inc(&group->num_marks);
239 fsnotify_get_mark(mark); /* for i_list and g_list */
240
241 if (inode) {
242 ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
243 if (ret)
244 goto err;
245 } else if (mnt) {
246 ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
247 if (ret)
248 goto err;
249 } else {
250 BUG();
251 }
252
253 spin_unlock(&group->mark_lock);
254
255 /* this will pin the object if appropriate */
256 fsnotify_set_mark_mask_locked(mark, mark->mask);
257
258 spin_unlock(&mark->lock);
259
260 if (inode)
261 __fsnotify_update_child_dentry_flags(inode);
262
263 return ret;
264err:
265 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
266 list_del_init(&mark->g_list);
267 mark->group = NULL;
268 atomic_dec(&group->num_marks);
269
270 spin_unlock(&group->mark_lock);
271 spin_unlock(&mark->lock);
272
273 spin_lock(&destroy_lock);
274 list_add(&mark->destroy_list, &destroy_list);
275 spin_unlock(&destroy_lock);
276 wake_up(&destroy_waitq);
277
278 return ret;
279}
280
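[Note: fsnotify_add_mark() above relies on the lock order spelled out in its comment — mark->lock, then group->mark_lock, then inode->i_lock — and the err path unwinds inside the same nesting. As long as every code path honors one global order, an ABBA deadlock on these locks is impossible. A toy pthread analogue (illustrative names only):]

#include <pthread.h>

static pthread_mutex_t mark_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t group_mark_lock = PTHREAD_MUTEX_INITIALIZER;

static void attach(void)
{
	pthread_mutex_lock(&mark_lock);         /* outer lock, always first */
	pthread_mutex_lock(&group_mark_lock);   /* inner lock, always second */
	/* ... link the mark into the group's list ... */
	pthread_mutex_unlock(&group_mark_lock);
	pthread_mutex_unlock(&mark_lock);
}
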
281/*
282 * clear any marks in a group in which mark->flags & flags is true
283 */
284void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
285 unsigned int flags)
286{
287 struct fsnotify_mark *lmark, *mark;
288 LIST_HEAD(free_list);
289
290 spin_lock(&group->mark_lock);
291 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
292 if (mark->flags & flags) {
293 list_add(&mark->free_g_list, &free_list);
294 list_del_init(&mark->g_list);
295 fsnotify_get_mark(mark);
296 }
297 }
298 spin_unlock(&group->mark_lock);
299
300 list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
301 fsnotify_destroy_mark(mark);
302 fsnotify_put_mark(mark);
303 }
304}
305
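[Note: fsnotify_clear_marks_by_group_flags() above is a two-phase teardown: matching marks are moved to a private list and pinned with a reference while group->mark_lock is held, and the destruction runs only after the lock is dropped — fsnotify_destroy_mark() takes mark->lock and group->mark_lock itself, so it must not be called under them. A plain-C sketch of the pattern, with illustrative names:]

#include <pthread.h>
#include <stddef.h>

struct mark { struct mark *next; unsigned int flags; };

static struct mark *marks_list;                 /* protected by mark_lock */
static pthread_mutex_t mark_lock = PTHREAD_MUTEX_INITIALIZER;

static void clear_by_flags(unsigned int flags, void (*destroy)(struct mark *))
{
	struct mark *m, *next, *free_list = NULL, **pp;

	pthread_mutex_lock(&mark_lock);
	for (pp = &marks_list; (m = *pp) != NULL; ) {
		if (m->flags & flags) {
			*pp = m->next;          /* unlink, like list_del_init() */
			m->next = free_list;    /* park it on the private list */
			free_list = m;
		} else {
			pp = &m->next;
		}
	}
	pthread_mutex_unlock(&mark_lock);

	while (free_list) {                     /* lock no longer held */
		m = free_list;
		next = m->next;
		destroy(m);                     /* like fsnotify_destroy_mark() */
		free_list = next;
	}
}
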
306/*
307 * Given a group, destroy all of the marks associated with that group.
308 */
309void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
310{
311 fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
312}
313
314void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
315{
316 assert_spin_locked(&old->lock);
317 new->i.inode = old->i.inode;
318 new->m.mnt = old->m.mnt;
319 new->group = old->group;
320 new->mask = old->mask;
321 new->free_mark = old->free_mark;
322}
323
324/*
325 * Nothing fancy, just initialize lists and locks and counters.
326 */
327void fsnotify_init_mark(struct fsnotify_mark *mark,
328 void (*free_mark)(struct fsnotify_mark *mark))
329{
330 memset(mark, 0, sizeof(*mark));
331 spin_lock_init(&mark->lock);
332 atomic_set(&mark->refcnt, 1);
333 mark->free_mark = free_mark;
334}
335
336static int fsnotify_mark_destroy(void *ignored)
337{
338 struct fsnotify_mark *mark, *next;
339 LIST_HEAD(private_destroy_list);
340
341 for (;;) {
342 spin_lock(&destroy_lock);
343 /* exchange the list head */
344 list_replace_init(&destroy_list, &private_destroy_list);
345 spin_unlock(&destroy_lock);
346
347 synchronize_srcu(&fsnotify_mark_srcu);
348
349 list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
350 list_del_init(&mark->destroy_list);
351 fsnotify_put_mark(mark);
352 }
353
354 wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
355 }
356
357 return 0;
358}
359
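[Note: fsnotify_mark_destroy() batches the pending marks off destroy_list, waits for an SRCU grace period so lockless readers of the mark lists under fsnotify_mark_srcu have drained, and only then drops the final references. A rough userspace analogue of the control flow — sleep() stands in for synchronize_srcu(); a real grace period tracks readers — with illustrative names:]

#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

struct node { struct node *next; };

static struct node *destroy_list;               /* protected by destroy_lock */
static pthread_mutex_t destroy_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t destroy_waitq = PTHREAD_COND_INITIALIZER;

static void *reaper(void *unused)
{
	(void)unused;
	for (;;) {
		struct node *batch, *next;

		pthread_mutex_lock(&destroy_lock);
		while (!destroy_list)
			pthread_cond_wait(&destroy_waitq, &destroy_lock);
		batch = destroy_list;           /* like list_replace_init() */
		destroy_list = NULL;
		pthread_mutex_unlock(&destroy_lock);

		sleep(1);                       /* grace-period stand-in */

		for (; batch; batch = next) {
			next = batch->next;
			free(batch);            /* like the final fsnotify_put_mark() */
		}
	}
	return NULL;
}
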
360static int __init fsnotify_mark_init(void)
361{
362 struct task_struct *thread;
363
364 thread = kthread_run(fsnotify_mark_destroy, NULL,
365 "fsnotify_mark");
366 if (IS_ERR(thread))
367 panic("unable to start fsnotify mark destruction thread.");
368
369 return 0;
370}
371device_initcall(fsnotify_mark_init);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b8bf53b4c108..f39260f8f865 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -56,7 +56,7 @@ static struct kmem_cache *fsnotify_event_holder_cachep;
56 * it is needed. Its refcnt is set to 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed'
58 */
59static struct fsnotify_event q_overflow_event; 59static struct fsnotify_event *q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61
62/**
@@ -87,12 +87,15 @@ void fsnotify_put_event(struct fsnotify_event *event)
87 return;
88
89 if (atomic_dec_and_test(&event->refcnt)) {
90 pr_debug("%s: event=%p\n", __func__, event);
91
92 if (event->data_type == FSNOTIFY_EVENT_PATH)
93 path_put(&event->path);
94
95 BUG_ON(!list_empty(&event->private_data_list));
96
97 kfree(event->file_name);
98 put_pid(event->tgid);
99 kmem_cache_free(fsnotify_event_cachep, event);
100 }
101}
@@ -104,7 +107,8 @@ struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
107
108void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
109{
107 kmem_cache_free(fsnotify_event_holder_cachep, holder); 110 if (holder)
111 kmem_cache_free(fsnotify_event_holder_cachep, holder);
112}
113
110/* 114/*
@@ -129,53 +133,20 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
129}
130
131/*
132 * Check if 2 events contain the same information. We do not compare private data
133 * but at this moment that isn't a problem for any known fsnotify listeners.
134 */
135static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
136{
137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
141 switch (old->data_type) {
142 case (FSNOTIFY_EVENT_INODE):
143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */
146 if (!old->name_len ||
147 !strcmp(old->file_name, new->file_name))
148 return true;
149 break;
150 case (FSNOTIFY_EVENT_PATH):
151 if ((old->path.mnt == new->path.mnt) &&
152 (old->path.dentry == new->path.dentry))
153 return true;
154 break;
155 case (FSNOTIFY_EVENT_NONE):
156 if (old->mask & FS_Q_OVERFLOW)
157 return true;
158 else if (old->mask & FS_IN_IGNORED)
159 return false;
160 return false;
161 };
162 }
163 return false;
164}
165
166/*
167 * Add an event to the group notification queue. The group can later pull this
168 * event off the queue to deal with. If the event is successfully added to the
169 * group's notification queue, a reference is taken on event.
170 */
171int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 140struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
172 struct fsnotify_event_private_data *priv) 141 struct fsnotify_event_private_data *priv,
142 struct fsnotify_event *(*merge)(struct list_head *,
143 struct fsnotify_event *))
173{
145 struct fsnotify_event *return_event = NULL;
174 struct fsnotify_event_holder *holder = NULL;
175 struct list_head *list = &group->notification_list;
176 struct fsnotify_event_holder *last_holder; 148
177 struct fsnotify_event *last_event; 149 pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
178 int ret = 0;
179
180 /*
181 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -189,18 +160,40 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
189alloc_holder: 160alloc_holder:
190 holder = fsnotify_alloc_event_holder(); 161 holder = fsnotify_alloc_event_holder();
191 if (!holder) 162 if (!holder)
192 return -ENOMEM; 163 return ERR_PTR(-ENOMEM);
193 } 164 }
194 165
195 mutex_lock(&group->notification_mutex); 166 mutex_lock(&group->notification_mutex);
196 167
197 if (group->q_len >= group->max_events) { 168 if (group->q_len >= group->max_events) {
198 event = &q_overflow_event; 169 event = q_overflow_event;
199 ret = -EOVERFLOW; 170
171 /*
172 * we need to return the overflow event
173 * which means we need a ref
174 */
175 fsnotify_get_event(event);
176 return_event = event;
177
200 /* sorry, no private data on the overflow event */ 178 /* sorry, no private data on the overflow event */
201 priv = NULL; 179 priv = NULL;
202 } 180 }
203 181
182 if (!list_empty(list) && merge) {
183 struct fsnotify_event *tmp;
184
185 tmp = merge(list, event);
186 if (tmp) {
187 mutex_unlock(&group->notification_mutex);
188
189 if (return_event)
190 fsnotify_put_event(return_event);
191 if (holder != &event->holder)
192 fsnotify_destroy_event_holder(holder);
193 return tmp;
194 }
195 }
196
204 spin_lock(&event->lock); 197 spin_lock(&event->lock);
205 198
206 if (list_empty(&event->holder.event_list)) { 199 if (list_empty(&event->holder.event_list)) {
@@ -212,19 +205,13 @@ alloc_holder:
212 * event holder was used, go back and get a new one */ 205 * event holder was used, go back and get a new one */
213 spin_unlock(&event->lock); 206 spin_unlock(&event->lock);
214 mutex_unlock(&group->notification_mutex); 207 mutex_unlock(&group->notification_mutex);
215 goto alloc_holder;
216 }
217 208
218 if (!list_empty(list)) { 209 if (return_event) {
219 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); 210 fsnotify_put_event(return_event);
220 last_event = last_holder->event; 211 return_event = NULL;
221 if (event_compare(last_event, event)) {
222 spin_unlock(&event->lock);
223 mutex_unlock(&group->notification_mutex);
224 if (holder != &event->holder)
225 fsnotify_destroy_event_holder(holder);
226 return -EEXIST;
227 } 212 }
213
214 goto alloc_holder;
228 } 215 }
229 216
230 group->q_len++; 217 group->q_len++;
@@ -238,7 +225,7 @@ alloc_holder:
238 mutex_unlock(&group->notification_mutex); 225 mutex_unlock(&group->notification_mutex);
239 226
240 wake_up(&group->notification_waitq); 227 wake_up(&group->notification_waitq);
241 return ret; 228 return return_event;
242} 229}
243 230
244/* 231/*
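[Note: fsnotify_add_notify_event() now returns the queued (or merged, or overflow) event instead of an int, and takes a per-backend merge callback so backends can coalesce duplicate events before they are queued. A rough userspace sketch of that hook shape — all names hypothetical, and the queue is simplified to LIFO:]

#include <stddef.h>

struct ev { unsigned int mask; struct ev *next; };

typedef struct ev *(*merge_fn)(struct ev *pending, struct ev *new_ev);

/* Returns NULL when new_ev was queued, or the event it merged into. */
static struct ev *queue_event(struct ev **pending, struct ev *new_ev,
			      merge_fn merge)
{
	if (*pending && merge) {
		struct ev *merged = merge(*pending, new_ev);
		if (merged)
			return merged;          /* coalesced, nothing new queued */
	}
	new_ev->next = *pending;                /* simplified LIFO add */
	*pending = new_ev;
	return NULL;
}
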
@@ -253,6 +240,8 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
253 240
254 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 241 BUG_ON(!mutex_is_locked(&group->notification_mutex));
255 242
243 pr_debug("%s: group=%p\n", __func__, group);
244
256 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 245 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
257 246
258 event = holder->event; 247 event = holder->event;
@@ -314,25 +303,82 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
314 303
315static void initialize_event(struct fsnotify_event *event) 304static void initialize_event(struct fsnotify_event *event)
316{ 305{
317 event->holder.event = NULL;
318 INIT_LIST_HEAD(&event->holder.event_list); 306 INIT_LIST_HEAD(&event->holder.event_list);
319 atomic_set(&event->refcnt, 1); 307 atomic_set(&event->refcnt, 1);
320 308
321 spin_lock_init(&event->lock); 309 spin_lock_init(&event->lock);
322 310
323 event->path.dentry = NULL;
324 event->path.mnt = NULL;
325 event->inode = NULL;
326 event->data_type = FSNOTIFY_EVENT_NONE;
327
328 INIT_LIST_HEAD(&event->private_data_list); 311 INIT_LIST_HEAD(&event->private_data_list);
312}
313
314/*
315 * Caller damn well better be holding whatever mutex is protecting the
316 * old_holder->event_list and the new_event must be a clean event which
317 * cannot be found anywhere else in the kernel.
318 */
319int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
320 struct fsnotify_event *new_event)
321{
322 struct fsnotify_event *old_event = old_holder->event;
323 struct fsnotify_event_holder *new_holder = &new_event->holder;
329 324
330 event->to_tell = NULL; 325 enum event_spinlock_class {
326 SPINLOCK_OLD,
327 SPINLOCK_NEW,
328 };
331 329
332 event->file_name = NULL; 330 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
333 event->name_len = 0;
334 331
335 event->sync_cookie = 0; 332 /*
333 * if the new_event's embedded holder is in use someone
334 * screwed up and didn't give us a clean new event.
335 */
336 BUG_ON(!list_empty(&new_holder->event_list));
337
338 spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
339 spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
340
341 new_holder->event = new_event;
342 list_replace_init(&old_holder->event_list, &new_holder->event_list);
343
344 spin_unlock(&new_event->lock);
345 spin_unlock(&old_event->lock);
346
347 /* old_holder == &old_event->holder means we are referenced through the embedded event holder */
348 if (old_holder != &old_event->holder)
349 fsnotify_destroy_event_holder(old_holder);
350
351 fsnotify_get_event(new_event); /* on the list take reference */
352 fsnotify_put_event(old_event); /* off the list, drop reference */
353
354 return 0;
355}
356
357struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
358{
359 struct fsnotify_event *event;
360
361 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
362 if (!event)
363 return NULL;
364
365 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
366
367 memcpy(event, old_event, sizeof(*event));
368 initialize_event(event);
369
370 if (event->name_len) {
371 event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
372 if (!event->file_name) {
373 kmem_cache_free(fsnotify_event_cachep, event);
374 return NULL;
375 }
376 }
377 event->tgid = get_pid(old_event->tgid);
378 if (event->data_type == FSNOTIFY_EVENT_PATH)
379 path_get(&event->path);
380
381 return event;
336} 382}
337 383
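[Note: fsnotify_clone_event() starts from a raw memcpy(), so every member that carries ownership has to be re-taken afterwards: the embedded holder and refcnt are re-initialized, the name is kstrdup()ed, and the pid and path get fresh references. The same re-owning rule in plain C, with illustrative names:]

#include <stdlib.h>
#include <string.h>

struct ev { char *name; };

static struct ev *ev_clone(const struct ev *old)
{
	struct ev *ev = malloc(sizeof(*ev));

	if (!ev)
		return NULL;
	*ev = *old;                                       /* like the memcpy() above */
	ev->name = old->name ? strdup(old->name) : NULL;  /* re-own, like kstrdup() */
	if (old->name && !ev->name) {
		free(ev);
		return NULL;
	}
	return ev;
}
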
338/* 384/*
@@ -348,15 +394,18 @@ static void initialize_event(struct fsnotify_event *event)
348 * @name the filename, if available 394 * @name the filename, if available
349 */ 395 */
350struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 396struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
351 int data_type, const char *name, u32 cookie, 397 int data_type, const unsigned char *name,
352 gfp_t gfp) 398 u32 cookie, gfp_t gfp)
353{ 399{
354 struct fsnotify_event *event; 400 struct fsnotify_event *event;
355 401
356 event = kmem_cache_alloc(fsnotify_event_cachep, gfp); 402 event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
357 if (!event) 403 if (!event)
358 return NULL; 404 return NULL;
359 405
406 pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
407 __func__, event, to_tell, mask, data, data_type);
408
360 initialize_event(event); 409 initialize_event(event);
361 410
362 if (name) { 411 if (name) {
@@ -368,30 +417,21 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
368 event->name_len = strlen(event->file_name); 417 event->name_len = strlen(event->file_name);
369 } 418 }
370 419
420 event->tgid = get_pid(task_tgid(current));
371 event->sync_cookie = cookie; 421 event->sync_cookie = cookie;
372 event->to_tell = to_tell; 422 event->to_tell = to_tell;
423 event->data_type = data_type;
373 424
374 switch (data_type) { 425 switch (data_type) {
375 case FSNOTIFY_EVENT_FILE: {
376 struct file *file = data;
377 struct path *path = &file->f_path;
378 event->path.dentry = path->dentry;
379 event->path.mnt = path->mnt;
380 path_get(&event->path);
381 event->data_type = FSNOTIFY_EVENT_PATH;
382 break;
383 }
384 case FSNOTIFY_EVENT_PATH: { 426 case FSNOTIFY_EVENT_PATH: {
385 struct path *path = data; 427 struct path *path = data;
386 event->path.dentry = path->dentry; 428 event->path.dentry = path->dentry;
387 event->path.mnt = path->mnt; 429 event->path.mnt = path->mnt;
388 path_get(&event->path); 430 path_get(&event->path);
389 event->data_type = FSNOTIFY_EVENT_PATH;
390 break; 431 break;
391 } 432 }
392 case FSNOTIFY_EVENT_INODE: 433 case FSNOTIFY_EVENT_INODE:
393 event->inode = data; 434 event->inode = data;
394 event->data_type = FSNOTIFY_EVENT_INODE;
395 break; 435 break;
396 case FSNOTIFY_EVENT_NONE: 436 case FSNOTIFY_EVENT_NONE:
397 event->inode = NULL; 437 event->inode = NULL;
@@ -412,8 +452,11 @@ __init int fsnotify_notification_init(void)
412 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); 452 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
413 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); 453 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
414 454
415 initialize_event(&q_overflow_event); 455 q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
416 q_overflow_event.mask = FS_Q_OVERFLOW; 456 FSNOTIFY_EVENT_NONE, NULL, 0,
457 GFP_KERNEL);
458 if (!q_overflow_event)
459 panic("unable to allocate fsnotify q_overflow_event\n");
417 460
418 return 0; 461 return 0;
419} 462}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
new file mode 100644
index 000000000000..56772b578fbd
--- /dev/null
+++ b/fs/notify/vfsmount_mark.c
@@ -0,0 +1,187 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/init.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/mount.h>
24#include <linux/mutex.h>
25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27
28#include <asm/atomic.h>
29
30#include <linux/fsnotify_backend.h>
31#include "fsnotify.h"
32
33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
34{
35 struct fsnotify_mark *mark, *lmark;
36 struct hlist_node *pos, *n;
37 LIST_HEAD(free_list);
38
39 spin_lock(&mnt->mnt_root->d_lock);
40 hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) {
41 list_add(&mark->m.free_m_list, &free_list);
42 hlist_del_init_rcu(&mark->m.m_list);
43 fsnotify_get_mark(mark);
44 }
45 spin_unlock(&mnt->mnt_root->d_lock);
46
47 list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
48 fsnotify_destroy_mark(mark);
49 fsnotify_put_mark(mark);
50 }
51}
52
53void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
54{
55 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
56}
57
58/*
59 * Recalculate the mask of events relevant to a given vfsmount, with its lock held.
60 */
61static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
62{
63 struct fsnotify_mark *mark;
64 struct hlist_node *pos;
65 __u32 new_mask = 0;
66
67 assert_spin_locked(&mnt->mnt_root->d_lock);
68
69 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list)
70 new_mask |= mark->mask;
71 mnt->mnt_fsnotify_mask = new_mask;
72}
73
74/*
75 * Recalculate mnt->mnt_fsnotify_mask, i.e. the mask of all FS_* event types
76 * any notifier is interested in hearing about for this mount point.
77 */
78void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
79{
80 spin_lock(&mnt->mnt_root->d_lock);
81 fsnotify_recalc_vfsmount_mask_locked(mnt);
82 spin_unlock(&mnt->mnt_root->d_lock);
83}
84
85void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
86{
87 struct vfsmount *mnt = mark->m.mnt;
88
89 assert_spin_locked(&mark->lock);
90 assert_spin_locked(&mark->group->mark_lock);
91
92 spin_lock(&mnt->mnt_root->d_lock);
93
94 hlist_del_init_rcu(&mark->m.m_list);
95 mark->m.mnt = NULL;
96
97 fsnotify_recalc_vfsmount_mask_locked(mnt);
98
99 spin_unlock(&mnt->mnt_root->d_lock);
100}
101
102static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
103 struct vfsmount *mnt)
104{
105 struct fsnotify_mark *mark;
106 struct hlist_node *pos;
107
108 assert_spin_locked(&mnt->mnt_root->d_lock);
109
110 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
111 if (mark->group == group) {
112 fsnotify_get_mark(mark);
113 return mark;
114 }
115 }
116 return NULL;
117}
118
119/*
120 * given a group and vfsmount, find the mark associated with that combination.
121 * if found take a reference to that mark and return it, else return NULL
122 */
123struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
124 struct vfsmount *mnt)
125{
126 struct fsnotify_mark *mark;
127
128 spin_lock(&mnt->mnt_root->d_lock);
129 mark = fsnotify_find_vfsmount_mark_locked(group, mnt);
130 spin_unlock(&mnt->mnt_root->d_lock);
131
132 return mark;
133}
134
135/*
136 * Attach an initialized mark to a given group and vfsmount.
137 * These marks may be used by the fsnotify backend to determine which
138 * event types should be delivered to which groups.
139 */
140int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
141 struct fsnotify_group *group, struct vfsmount *mnt,
142 int allow_dups)
143{
144 struct fsnotify_mark *lmark;
145 struct hlist_node *node, *last = NULL;
146 int ret = 0;
147
148 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
149
150 assert_spin_locked(&mark->lock);
151 assert_spin_locked(&group->mark_lock);
152
153 spin_lock(&mnt->mnt_root->d_lock);
154
155 mark->m.mnt = mnt;
156
157 /* is mark the first mark? */
158 if (hlist_empty(&mnt->mnt_fsnotify_marks)) {
159 hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks);
160 goto out;
161 }
162
163 /* should mark be in the middle of the current list? */
164 hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
165 last = node;
166
167 if ((lmark->group == group) && !allow_dups) {
168 ret = -EEXIST;
169 goto out;
170 }
171
172 if (mark->group < lmark->group)
173 continue;
174
175 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
176 goto out;
177 }
178
179 BUG_ON(last == NULL);
180 /* mark should be the last entry. last is the current last entry */
181 hlist_add_after_rcu(last, &mark->m.m_list);
182out:
183 fsnotify_recalc_vfsmount_mask_locked(mnt);
184 spin_unlock(&mnt->mnt_root->d_lock);
185
186 return ret;
187}
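[Note: fsnotify_add_vfsmount_mark() above keeps each mount's mark list sorted (descending) by group pointer, so duplicate detection and finding the insertion slot are a single walk. A plain-C sketch of the same insertion policy — illustrative names; the kernel version uses an RCU-protected hlist:]

#include <stdint.h>

struct m { uintptr_t group; struct m *next; };

static int insert_mark(struct m **head, struct m *nm, int allow_dups)
{
	struct m **pp;

	for (pp = head; *pp; pp = &(*pp)->next) {
		if ((*pp)->group == nm->group && !allow_dups)
			return -1;              /* like -EEXIST */
		if (nm->group >= (*pp)->group)
			break;                  /* found the slot */
	}
	nm->next = *pp;                         /* splice in before *pp */
	*pp = nm;
	return 0;
}
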
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3feee4a..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
1527 * this problem for now. We do write the $BITMAP attribute if it is present
1528 * which is the important one for a directory so things are not too bad.
1529 */
1530static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, 1530static int ntfs_dir_fsync(struct file *filp, int datasync)
1531 int datasync)
1532{
1533 struct inode *bmp_vi, *vi = dentry->d_inode; 1532 struct inode *bmp_vi, *vi = filp->f_mapping->host;
1534 int err, ret;
1535 ntfs_attr na;
1536
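[Note: this hunk and the fs/ntfs/file.c hunk below are mechanical conversions to the newer ->fsync prototype, which drops the dentry argument; the inode is now reached through filp->f_mapping->host. The shape of the new hook, as a kernel-style sketch only (it will not build outside the tree, and the body is a placeholder):]

#include <linux/fs.h>

static int example_fsync(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;  /* was dentry->d_inode */

	/* ... write back data, and metadata unless datasync is set ... */
	(void)inode;
	return 0;
}
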
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c.
100 *
101 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
102 * pages.
103 *
104 * Return 0 on success and -errno on error. In the case that an error is
105 * encountered it is possible that the initialized size will already have been
106 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
110 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
111 * held by the caller.
112 */
113static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
114 struct page **cached_page, struct pagevec *lru_pvec)
115{ 111{
116 s64 old_init_size; 112 s64 old_init_size;
117 loff_t old_i_size; 113 loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
404 * starting at index @index. 400 * starting at index @index.
405 * 401 *
406 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to lru list
407 * caller's lru-buffering pagevec @lru_pvec.
408 *
409 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
410 * are obtained at once instead of just one page and that 0 is returned on
411 * success and -errno on error.
412 * 403 *
413 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
414 */ 405 */
415static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
416 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
417 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
418{ 409{
419 int err, nr; 410 int err, nr;
420 411
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
430 goto err_out; 421 goto err_out;
431 } 422 }
432 } 423 }
433 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
434 GFP_KERNEL); 425 GFP_KERNEL);
435 if (unlikely(err)) { 426 if (unlikely(err)) {
436 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
438 goto err_out; 429 goto err_out;
439 } 430 }
440 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
441 page_cache_get(*cached_page);
442 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
443 __pagevec_lru_add_file(lru_pvec);
444 *cached_page = NULL; 432 *cached_page = NULL;
445 } 433 }
446 index++; 434 index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1800 ssize_t status, written; 1788 ssize_t status, written;
1801 unsigned nr_pages; 1789 unsigned nr_pages;
1802 int err; 1790 int err;
1803 struct pagevec lru_pvec;
1804 1791
1805 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1806 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1912 } 1899 }
1913 } 1900 }
1914 } 1901 }
1915 pagevec_init(&lru_pvec, 0);
1916 written = 0; 1902 written = 0;
1917 /* 1903 /*
1918 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1925 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1926 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1927 if (pos > ll) { 1913 if (pos > ll) {
1928 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1929 &lru_pvec);
1930 if (err < 0) { 1915 if (err < 0) {
1931 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1932 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2012 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2013 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2014 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2015 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2016 if (unlikely(status)) 2001 if (unlikely(status))
2017 break; 2002 break;
2018 /* 2003 /*
@@ -2077,7 +2062,6 @@ err_out:
2077 *ppos = pos; 2062 *ppos = pos;
2078 if (cached_page) 2063 if (cached_page)
2079 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2080 pagevec_lru_add_file(&lru_pvec);
2081 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2082 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2083 (long)status); 2067 (long)status);
@@ -2149,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2149/** 2133/**
2150 * ntfs_file_fsync - sync a file to disk 2134 * ntfs_file_fsync - sync a file to disk
2151 * @filp: file to be synced 2135 * @filp: file to be synced
2152 * @dentry: dentry describing the file to sync
2153 * @datasync: if non-zero only flush user data and not metadata 2136 * @datasync: if non-zero only flush user data and not metadata
2154 * 2137 *
2155 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2138 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2165,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2165 * Also, if @datasync is true, we do not wait on the inode to be written out 2148 * Also, if @datasync is true, we do not wait on the inode to be written out
2166 * but we always wait on the page cache pages to be written out. 2149 * but we always wait on the page cache pages to be written out.
2167 * 2150 *
2168 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2169 * anyway.
2170 *
2171 * Locking: Caller must hold i_mutex on the inode. 2151 * Locking: Caller must hold i_mutex on the inode.
2172 * 2152 *
2173 * TODO: We should probably also write all attribute/index inodes associated 2153 * TODO: We should probably also write all attribute/index inodes associated
2174 * with this inode but since we have no simple way of getting to them we ignore 2154 * with this inode but since we have no simple way of getting to them we ignore
2175 * this problem for now. 2155 * this problem for now.
2176 */ 2156 */
2177static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, 2157static int ntfs_file_fsync(struct file *filp, int datasync)
2178 int datasync)
2179{ 2158{
2180 struct inode *vi = dentry->d_inode; 2159 struct inode *vi = filp->f_mapping->host;
2181 int err, ret = 0; 2160 int err, ret = 0;
2182 2161
2183 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2162 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4b57fb1eac2a..93622b175fc7 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2238,7 +2238,7 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
2238} 2238}
2239 2239
2240/** 2240/**
2241 * ntfs_clear_big_inode - clean up the ntfs specific part of an inode 2241 * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
2242 * @vi: vfs inode pending annihilation 2242 * @vi: vfs inode pending annihilation
2243 * 2243 *
2244 * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode() 2244 * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
@@ -2247,10 +2247,13 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
2247 * 2247 *
2248 * If the MFT record is dirty, we commit it before doing anything else. 2248 * If the MFT record is dirty, we commit it before doing anything else.
2249 */ 2249 */
2250void ntfs_clear_big_inode(struct inode *vi) 2250void ntfs_evict_big_inode(struct inode *vi)
2251{ 2251{
2252 ntfs_inode *ni = NTFS_I(vi); 2252 ntfs_inode *ni = NTFS_I(vi);
2253 2253
2254 truncate_inode_pages(&vi->i_data, 0);
2255 end_writeback(vi);
2256
2254#ifdef NTFS_RW 2257#ifdef NTFS_RW
2255 if (NInoDirty(ni)) { 2258 if (NInoDirty(ni)) {
2256 bool was_bad = (is_bad_inode(vi)); 2259 bool was_bad = (is_bad_inode(vi));
@@ -2879,9 +2882,6 @@ void ntfs_truncate_vfs(struct inode *vi) {
2879 * 2882 *
2880 * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also 2883 * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also
2881 * called with ->i_alloc_sem held for writing. 2884 * called with ->i_alloc_sem held for writing.
2882 *
2883 * Basically this is a copy of generic notify_change() and inode_setattr()
2884 * functionality, except we intercept and abort changes in i_size.
2885 */ 2885 */
2886int ntfs_setattr(struct dentry *dentry, struct iattr *attr) 2886int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2887{ 2887{
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 9a113544605d..2dabf813456c 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -279,7 +279,7 @@ extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
279 279
280extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); 280extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
281extern void ntfs_destroy_big_inode(struct inode *inode); 281extern void ntfs_destroy_big_inode(struct inode *inode);
282extern void ntfs_clear_big_inode(struct inode *vi); 282extern void ntfs_evict_big_inode(struct inode *vi);
283 283
284extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); 284extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
285 285
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0de1db6cddbf..512806171bfa 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2700,7 +2700,7 @@ static const struct super_operations ntfs_sops = {
2700 .put_super = ntfs_put_super, /* Syscall: umount. */ 2700 .put_super = ntfs_put_super, /* Syscall: umount. */
2701 .statfs = ntfs_statfs, /* Syscall: statfs */ 2701 .statfs = ntfs_statfs, /* Syscall: statfs */
2702 .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ 2702 .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */
2703 .clear_inode = ntfs_clear_big_inode, /* VFS: Called when an inode is 2703 .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is
2704 removed from memory. */ 2704 removed from memory. */
2705 //.umount_begin = NULL, /* Forced umount. */ 2705 //.umount_begin = NULL, /* Forced umount. */
2706 .show_options = ntfs_show_options, /* Show mount options in 2706 .show_options = ntfs_show_options, /* Show mount options in
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c060..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \
32 resize.o \ 33 resize.o \
33 slot_map.o \ 34 slot_map.o \
34 suballoc.o \ 35 suballoc.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e13fc9e8fcdc..391915093fe1 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
209 }
210
211 inode->i_mode = new_mode;
212 inode->i_ctime = CURRENT_TIME;
213 di->i_mode = cpu_to_le16(inode->i_mode);
214 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
215 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
216
217 ocfs2_journal_dirty(handle, di_bh);
218
@@ -290,12 +293,30 @@ static int ocfs2_set_acl(handle_t *handle,
290 293
291int ocfs2_check_acl(struct inode *inode, int mask) 294int ocfs2_check_acl(struct inode *inode, int mask)
292{ 295{
293 struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); 296 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
297 struct buffer_head *di_bh = NULL;
298 struct posix_acl *acl;
299 int ret = -EAGAIN;
294 300
295 if (IS_ERR(acl)) 301 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
302 return ret;
303
304 ret = ocfs2_read_inode_block(inode, &di_bh);
305 if (ret < 0) {
306 mlog_errno(ret);
307 return ret;
308 }
309
310 acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
311
312 brelse(di_bh);
313
314 if (IS_ERR(acl)) {
315 mlog_errno(PTR_ERR(acl));
296 return PTR_ERR(acl); 316 return PTR_ERR(acl);
317 }
297 if (acl) { 318 if (acl) {
298 int ret = posix_acl_permission(inode, acl, mask); 319 ret = posix_acl_permission(inode, acl, mask);
299 posix_acl_release(acl); 320 posix_acl_release(acl);
300 return ret; 321 return ret;
301 } 322 }
@@ -344,7 +365,7 @@ int ocfs2_init_acl(handle_t *handle,
344{ 365{
345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 366 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
346 struct posix_acl *acl = NULL; 367 struct posix_acl *acl = NULL;
347 int ret = 0; 368 int ret = 0, ret2;
348 mode_t mode; 369 mode_t mode;
349 370
350 if (!S_ISLNK(inode->i_mode)) { 371 if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +402,12 @@ int ocfs2_init_acl(handle_t *handle,
381 mode = inode->i_mode; 402 mode = inode->i_mode;
382 ret = posix_acl_create_masq(clone, &mode); 403 ret = posix_acl_create_masq(clone, &mode);
383 if (ret >= 0) { 404 if (ret >= 0) {
384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); 405 ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
406 if (ret2) {
407 mlog_errno(ret2);
408 ret = ret2;
409 goto cleanup;
410 }
385 if (ret > 0) { 411 if (ret > 0) {
386 ret = ocfs2_set_acl(handle, inode, 412 ret = ocfs2_set_acl(handle, inode,
387 di_bh, ACL_TYPE_ACCESS, 413 di_bh, ACL_TYPE_ACCESS,
@@ -489,7 +515,7 @@ cleanup:
489 return ret; 515 return ret;
490} 516}
491 517
492struct xattr_handler ocfs2_xattr_acl_access_handler = { 518const struct xattr_handler ocfs2_xattr_acl_access_handler = {
493 .prefix = POSIX_ACL_XATTR_ACCESS, 519 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS, 520 .flags = ACL_TYPE_ACCESS,
495 .list = ocfs2_xattr_list_acl_access, 521 .list = ocfs2_xattr_list_acl_access,
@@ -497,7 +523,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
497 .set = ocfs2_xattr_set_acl, 523 .set = ocfs2_xattr_set_acl,
498}; 524};
499 525
500struct xattr_handler ocfs2_xattr_acl_default_handler = { 526const struct xattr_handler ocfs2_xattr_acl_default_handler = {
501 .prefix = POSIX_ACL_XATTR_DEFAULT, 527 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT, 528 .flags = ACL_TYPE_DEFAULT,
503 .list = ocfs2_xattr_list_acl_default, 529 .list = ocfs2_xattr_list_acl_default,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51e..592fae5007d1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1006 int count, status, i; 1006 int count, status, i;
1007 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
1008 u32 num_got; 1008 u32 num_got;
1009 u64 first_blkno; 1009 u64 suballoc_loc, first_blkno;
1010 struct ocfs2_super *osb = 1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 1015
1016 count = 0; 1016 count = 0;
1017 while (count < wanted) { 1017 while (count < wanted) {
1018 status = ocfs2_claim_metadata(osb, 1018 status = ocfs2_claim_metadata(handle,
1019 handle,
1020 meta_ac, 1019 meta_ac,
1021 wanted - count, 1020 wanted - count,
1021 &suballoc_loc,
1022 &suballoc_bit_start, 1022 &suballoc_bit_start,
1023 &num_got, 1023 &num_got,
1024 &first_blkno); 1024 &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot); 1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1055 eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1056 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056 eb->h_list.l_count = 1057 eb->h_list.l_count =
1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1058 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1061 1062
1062 /* We'll also be dirtied by the caller, so 1063 /* We'll also be dirtied by the caller, so
1063 * this isn't absolutely necessary. */ 1064 * this isn't absolutely necessary. */
1064 status = ocfs2_journal_dirty(handle, bhs[i]); 1065 ocfs2_journal_dirty(handle, bhs[i]);
1065 if (status < 0) {
1066 mlog_errno(status);
1067 goto bail;
1068 }
1069 } 1066 }
1070 1067
1071 count += num_got; 1068 count += num_got;
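[Note: a pattern repeated throughout the ocfs2 hunks below: ocfs2_journal_dirty() appears to have become a void function, so the per-call "status = ...; if (status < 0) mlog_errno(status); goto bail;" unwinding collapses into a bare call. The userspace toy below only illustrates that API shift — illustrative names, not ocfs2 code:]

#include <assert.h>

static int journal_dirty_old(int bh_ok)
{
	return bh_ok ? 0 : -5;          /* every caller had to check */
}

static void journal_dirty_new(int bh_ok)
{
	assert(bh_ok);                  /* failure handled centrally */
}
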
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1129 goto out; 1126 goto out;
1130 } 1127 }
1131 1128
1132 status = ocfs2_extend_trans(handle, path_num_items(path) + 1129 status = ocfs2_extend_trans(handle, path_num_items(path));
1133 handle->h_buffer_credits);
1134 if (status < 0) { 1130 if (status < 0) {
1135 mlog_errno(status); 1131 mlog_errno(status);
1136 goto out; 1132 goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
1270 if (!eb_el->l_tree_depth) 1266 if (!eb_el->l_tree_depth)
1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1267 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272 1268
1273 status = ocfs2_journal_dirty(handle, bh); 1269 ocfs2_journal_dirty(handle, bh);
1274 if (status < 0) {
1275 mlog_errno(status);
1276 goto bail;
1277 }
1278
1279 next_blkno = le64_to_cpu(eb->h_blkno); 1270 next_blkno = le64_to_cpu(eb->h_blkno);
1280 } 1271 }
1281 1272
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1312 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1313 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323 1314
1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1315 ocfs2_journal_dirty(handle, *last_eb_bh);
1325 if (status < 0) 1316 ocfs2_journal_dirty(handle, et->et_root_bh);
1326 mlog_errno(status); 1317 if (eb_bh)
1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1318 ocfs2_journal_dirty(handle, eb_bh);
1328 if (status < 0)
1329 mlog_errno(status);
1330 if (eb_bh) {
1331 status = ocfs2_journal_dirty(handle, eb_bh);
1332 if (status < 0)
1333 mlog_errno(status);
1334 }
1335 1319
1336 /* 1320 /*
1337 * Some callers want to track the rightmost leaf so pass it 1321 * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1399 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1383 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400 eb_el->l_recs[i] = root_el->l_recs[i]; 1384 eb_el->l_recs[i] = root_el->l_recs[i];
1401 1385
1402 status = ocfs2_journal_dirty(handle, new_eb_bh); 1386 ocfs2_journal_dirty(handle, new_eb_bh);
1403 if (status < 0) {
1404 mlog_errno(status);
1405 goto bail;
1406 }
1407 1387
1408 status = ocfs2_et_root_journal_access(handle, et, 1388 status = ocfs2_et_root_journal_access(handle, et,
1409 OCFS2_JOURNAL_ACCESS_WRITE); 1389 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1428 if (root_el->l_tree_depth == cpu_to_le16(1)) 1408 if (root_el->l_tree_depth == cpu_to_le16(1))
1429 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1409 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430 1410
1431 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1411 ocfs2_journal_dirty(handle, et->et_root_bh);
1432 if (status < 0) {
1433 mlog_errno(status);
1434 goto bail;
1435 }
1436 1412
1437 *ret_new_eb_bh = new_eb_bh; 1413 *ret_new_eb_bh = new_eb_bh;
1438 new_eb_bh = NULL; 1414 new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2064 struct ocfs2_path *right_path, 2040 struct ocfs2_path *right_path,
2065 int subtree_index) 2041 int subtree_index)
2066{ 2042{
2067 int ret, i, idx; 2043 int i, idx;
2068 struct ocfs2_extent_list *el, *left_el, *right_el; 2044 struct ocfs2_extent_list *el, *left_el, *right_el;
2069 struct ocfs2_extent_rec *left_rec, *right_rec; 2045 struct ocfs2_extent_rec *left_rec, *right_rec;
2070 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2046 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2102 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2078 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 right_el); 2079 right_el);
2104 2080
2105 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2081 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 if (ret) 2082 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2107 mlog_errno(ret);
2108
2109 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 if (ret)
2111 mlog_errno(ret);
2112 2083
2113 /* 2084 /*
2114 * Setup our list pointers now so that the current 2085 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2132 2103
2133 root_bh = left_path->p_node[subtree_index].bh; 2104 root_bh = left_path->p_node[subtree_index].bh;
2134 2105
2135 ret = ocfs2_journal_dirty(handle, root_bh); 2106 ocfs2_journal_dirty(handle, root_bh);
2136 if (ret)
2137 mlog_errno(ret);
2138} 2107}
2139 2108
2140static int ocfs2_rotate_subtree_right(handle_t *handle, 2109static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2207 2176
2208 ocfs2_create_empty_extent(right_el); 2177 ocfs2_create_empty_extent(right_el);
2209 2178
2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2179 ocfs2_journal_dirty(handle, right_leaf_bh);
2211 if (ret) {
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215 2180
2216 /* Do the copy now. */ 2181 /* Do the copy now. */
2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2182 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2195 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2196 le16_add_cpu(&left_el->l_next_free_rec, 1);
2232 2197
2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2198 ocfs2_journal_dirty(handle, left_leaf_bh);
2234 if (ret) {
2235 mlog_errno(ret);
2236 goto out;
2237 }
2238 2199
2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2200 ocfs2_complete_edge_insert(handle, left_path, right_path,
2240 subtree_index); 2201 subtree_index);
@@ -2249,8 +2210,8 @@ out:
2249 * 2210 *
2250 * Will return zero if the path passed in is already the leftmost path. 2211 * Will return zero if the path passed in is already the leftmost path.
2251 */ 2212 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2213int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253 struct ocfs2_path *path, u32 *cpos) 2214 struct ocfs2_path *path, u32 *cpos)
2254{ 2215{
2255 int i, j, ret = 0; 2216 int i, j, ret = 0;
2256 u64 blkno; 2217 u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327 int op_credits, 2288 int op_credits,
2328 struct ocfs2_path *path) 2289 struct ocfs2_path *path)
2329{ 2290{
2330 int ret; 2291 int ret = 0;
2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2292 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332 2293
2333 if (handle->h_buffer_credits < credits) { 2294 if (handle->h_buffer_credits < credits)
2334 ret = ocfs2_extend_trans(handle, 2295 ret = ocfs2_extend_trans(handle,
2335 credits - handle->h_buffer_credits); 2296 credits - handle->h_buffer_credits);
2336 if (ret)
2337 return ret;
2338 2297
2339 if (unlikely(handle->h_buffer_credits < credits)) 2298 return ret;
2340 return ocfs2_extend_trans(handle, credits);
2341 }
2342
2343 return 0;
2344} 2299}
2345 2300
2346/* 2301/*
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2584 * records for all the bh in the path. 2539 * records for all the bh in the path.
2585 * So we have to allocate extra credits and access them. 2540 * So we have to allocate extra credits and access them.
2586 */ 2541 */
2587 ret = ocfs2_extend_trans(handle, 2542 ret = ocfs2_extend_trans(handle, subtree_index);
2588 handle->h_buffer_credits + subtree_index);
2589 if (ret) { 2543 if (ret) {
2590 mlog_errno(ret); 2544 mlog_errno(ret);
2591 goto out; 2545 goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2823 ocfs2_remove_empty_extent(right_leaf_el); 2777 ocfs2_remove_empty_extent(right_leaf_el);
2824 } 2778 }
2825 2779
2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2780 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827 if (ret) 2781 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2828 mlog_errno(ret);
2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830 if (ret)
2831 mlog_errno(ret);
2832 2782
2833 if (del_right_subtree) { 2783 if (del_right_subtree) {
2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2784 ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2851 if (right_has_empty) 2801 if (right_has_empty)
2852 ocfs2_remove_empty_extent(left_leaf_el); 2802 ocfs2_remove_empty_extent(left_leaf_el);
2853 2803
2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2804 ocfs2_journal_dirty(handle, et_root_bh);
2855 if (ret)
2856 mlog_errno(ret);
2857 2805
2858 *deleted = 1; 2806 *deleted = 1;
2859 } else 2807 } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2962 } 2910 }
2963 2911
2964 ocfs2_remove_empty_extent(el); 2912 ocfs2_remove_empty_extent(el);
2965 2913 ocfs2_journal_dirty(handle, bh);
2966 ret = ocfs2_journal_dirty(handle, bh);
2967 if (ret)
2968 mlog_errno(ret);
2969 2914
2970out: 2915out:
2971 return ret; 2916 return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3506 3451
3507 ocfs2_cleanup_merge(el, index); 3452 ocfs2_cleanup_merge(el, index);
3508 3453
3509 ret = ocfs2_journal_dirty(handle, bh); 3454 ocfs2_journal_dirty(handle, bh);
3510 if (ret)
3511 mlog_errno(ret);
3512
3513 if (right_path) { 3455 if (right_path) {
3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3456 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515 if (ret)
3516 mlog_errno(ret);
3517
3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3457 ocfs2_complete_edge_insert(handle, left_path, right_path,
3519 subtree_index); 3458 subtree_index);
3520 } 3459 }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3683 3622
3684 ocfs2_cleanup_merge(el, index); 3623 ocfs2_cleanup_merge(el, index);
3685 3624
3686 ret = ocfs2_journal_dirty(handle, bh); 3625 ocfs2_journal_dirty(handle, bh);
3687 if (ret)
3688 mlog_errno(ret);
3689
3690 if (left_path) { 3626 if (left_path) {
3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3627 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692 if (ret)
3693 mlog_errno(ret);
3694 3628
3695 /* 3629 /*
3696 * In the situation that the right_rec is empty and the extent 3630 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
4016 le32_add_cpu(&rec->e_int_clusters, 3950 le32_add_cpu(&rec->e_int_clusters,
4017 -le32_to_cpu(rec->e_cpos)); 3951 -le32_to_cpu(rec->e_cpos));
4018 3952
4019 ret = ocfs2_journal_dirty(handle, bh); 3953 ocfs2_journal_dirty(handle, bh);
4020 if (ret)
4021 mlog_errno(ret);
4022
4023 } 3954 }
4024} 3955}
4025 3956
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4134 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204 4135
4205 if (left_path) { 4136 if (left_path) {
4206 int credits = handle->h_buffer_credits;
4207
4208 /* 4137 /*
4209 * There's a chance that left_path got passed back to 4138 * There's a chance that left_path got passed back to
4210 * us without being accounted for in the 4139 * us without being accounted for in the
4211 * journal. Extend our transaction here to be sure we 4140 * journal. Extend our transaction here to be sure we
4212 * can change those blocks. 4141 * can change those blocks.
4213 */ 4142 */
4214 credits += left_path->p_tree_depth; 4143 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4215
4216 ret = ocfs2_extend_trans(handle, credits);
4217 if (ret < 0) { 4144 if (ret < 0) {
4218 mlog_errno(ret); 4145 mlog_errno(ret);
4219 goto out; 4146 goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
4251 * dirty this for us. 4178 * dirty this for us.
4252 */ 4179 */
4253 if (left_path) 4180 if (left_path)
4254 ret = ocfs2_journal_dirty(handle, 4181 ocfs2_journal_dirty(handle,
4255 path_leaf_bh(left_path)); 4182 path_leaf_bh(left_path));
4256 if (ret)
4257 mlog_errno(ret);
4258 } else 4183 } else
4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4184 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260 insert); 4185 insert);
4261 4186
4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4187 ocfs2_journal_dirty(handle, leaf_bh);
4263 if (ret)
4264 mlog_errno(ret);
4265 4188
4266 if (left_path) { 4189 if (left_path) {
4267 /* 4190 /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
4384 ocfs2_et_update_clusters(et, 4307 ocfs2_et_update_clusters(et,
4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4308 le16_to_cpu(insert_rec->e_leaf_clusters));
4386 4309
4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4310 ocfs2_journal_dirty(handle, et->et_root_bh);
4388 if (ret)
4389 mlog_errno(ret);
4390 4311
4391out: 4312out:
4392 ocfs2_free_path(left_path); 4313 ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4866 goto leave; 4787 goto leave;
4867 } 4788 }
4868 4789
4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4790 status = __ocfs2_claim_clusters(handle, data_ac, 1,
4870 clusters_to_add, &bit_off, &num_bits); 4791 clusters_to_add, &bit_off, &num_bits);
4871 if (status < 0) { 4792 if (status < 0) {
4872 if (status != -ENOSPC) 4793 if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4895 goto leave; 4816 goto leave;
4896 } 4817 }
4897 4818
4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4819 ocfs2_journal_dirty(handle, et->et_root_bh);
4899 if (status < 0) {
4900 mlog_errno(status);
4901 goto leave;
4902 }
4903 4820
4904 clusters_to_add -= num_bits; 4821 clusters_to_add -= num_bits;
4905 *logical_offset += num_bits; 4822 *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5309 int index, u32 new_range, 5226 int index, u32 new_range,
5310 struct ocfs2_alloc_context *meta_ac) 5227 struct ocfs2_alloc_context *meta_ac)
5311{ 5228{
5312 int ret, depth, credits = handle->h_buffer_credits; 5229 int ret, depth, credits;
5313 struct buffer_head *last_eb_bh = NULL; 5230 struct buffer_head *last_eb_bh = NULL;
5314 struct ocfs2_extent_block *eb; 5231 struct ocfs2_extent_block *eb;
5315 struct ocfs2_extent_list *rightmost_el, *el; 5232 struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5340 } else 5257 } else
5341 rightmost_el = path_leaf_el(path); 5258 rightmost_el = path_leaf_el(path);
5342 5259
5343 credits += path->p_tree_depth + 5260 credits = path->p_tree_depth +
5344 ocfs2_extend_meta_needed(et->et_root_el); 5261 ocfs2_extend_meta_needed(et->et_root_el);
5345 ret = ocfs2_extend_trans(handle, credits); 5262 ret = ocfs2_extend_trans(handle, credits);
5346 if (ret) { 5263 if (ret) {
5347 mlog_errno(ret); 5264 mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
5671 return ret; 5588 return ret;
5672} 5589}
5673 5590
5591/*
5592 * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
5594 * same as ocfs2_lock_allocators(), except that it accepts a block
5595 * count for reserving some extra blocks, and it only handles
5596 * metadata allocations.
5596 *
5597 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5598 * and punching holes.
5599 */
5600static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5601 struct ocfs2_extent_tree *et,
5602 u32 extents_to_split,
5603 struct ocfs2_alloc_context **ac,
5604 int extra_blocks)
5605{
5606 int ret = 0, num_free_extents;
5607 unsigned int max_recs_needed = 2 * extents_to_split;
5608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5609
5610 *ac = NULL;
5611
5612 num_free_extents = ocfs2_num_free_extents(osb, et);
5613 if (num_free_extents < 0) {
5614 ret = num_free_extents;
5615 mlog_errno(ret);
5616 goto out;
5617 }
5618
5619 if (!num_free_extents ||
5620 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5621 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5622
5623 if (extra_blocks) {
5624 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5625 if (ret < 0) {
5626 if (ret != -ENOSPC)
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 }
5631
5632out:
5633 if (ret) {
5634 if (*ac) {
5635 ocfs2_free_alloc_context(*ac);
5636 *ac = NULL;
5637 }
5638 }
5639
5640 return ret;
5641}
5642
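
The helper above boils down to one decision: pad the metadata reservation when the extent list might need to grow. A standalone model of that decision; the numbers in main() are illustrative, and a split of one extent is budgeted at up to two new records, matching max_recs_needed above:

#include <stdio.h>

/* num_free_extents is assumed nonnegative (errors handled by caller) */
static int blocks_to_reserve(int num_free_extents, int sparse_alloc,
                             unsigned int extents_to_split, int extra_blocks,
                             int meta_needed_for_tree_growth)
{
        unsigned int max_recs_needed = 2 * extents_to_split;

        /* The tree may have to grow: budget metadata blocks for that too. */
        if (!num_free_extents ||
            (sparse_alloc && (unsigned int)num_free_extents < max_recs_needed))
                extra_blocks += meta_needed_for_tree_growth;

        return extra_blocks;
}

int main(void)
{
        printf("%d\n", blocks_to_reserve(0, 1, 1, 1, 3)); /* 4: must grow */
        printf("%d\n", blocks_to_reserve(8, 1, 1, 1, 3)); /* 1: room left */
        return 0;
}
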
5674int ocfs2_remove_btree_range(struct inode *inode, 5643int ocfs2_remove_btree_range(struct inode *inode,
5675 struct ocfs2_extent_tree *et, 5644 struct ocfs2_extent_tree *et,
5676 u32 cpos, u32 phys_cpos, u32 len, 5645 u32 cpos, u32 phys_cpos, u32 len, int flags,
5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5646 struct ocfs2_cached_dealloc_ctxt *dealloc,
5647 u64 refcount_loc)
5678{ 5648{
5679 int ret; 5649 int ret, credits = 0, extra_blocks = 0;
5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5651 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5682 struct inode *tl_inode = osb->osb_tl_inode; 5652 struct inode *tl_inode = osb->osb_tl_inode;
5683 handle_t *handle; 5653 handle_t *handle;
5684 struct ocfs2_alloc_context *meta_ac = NULL; 5654 struct ocfs2_alloc_context *meta_ac = NULL;
5655 struct ocfs2_refcount_tree *ref_tree = NULL;
5656
5657 if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5658 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5659 OCFS2_HAS_REFCOUNT_FL));
5660
5661 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5662 &ref_tree, NULL);
5663 if (ret) {
5664 mlog_errno(ret);
5665 goto out;
5666 }
5685 5667
5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5668 ret = ocfs2_prepare_refcount_change_for_del(inode,
5669 refcount_loc,
5670 phys_blkno,
5671 len,
5672 &credits,
5673 &extra_blocks);
5674 if (ret < 0) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678 }
5679
5680 ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5681 extra_blocks);
5687 if (ret) { 5682 if (ret) {
5688 mlog_errno(ret); 5683 mlog_errno(ret);
5689 return ret; 5684 return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 } 5694 }
5700 } 5695 }
5701 5696
5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5697 handle = ocfs2_start_trans(osb,
5698 ocfs2_remove_extent_credits(osb->sb) + credits);
5703 if (IS_ERR(handle)) { 5699 if (IS_ERR(handle)) {
5704 ret = PTR_ERR(handle); 5700 ret = PTR_ERR(handle);
5705 mlog_errno(ret); 5701 mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
5724 5720
5725 ocfs2_et_update_clusters(et, -len); 5721 ocfs2_et_update_clusters(et, -len);
5726 5722
5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5723 ocfs2_journal_dirty(handle, et->et_root_bh);
5728 if (ret) {
5729 mlog_errno(ret);
5730 goto out_commit;
5731 }
5732 5724
5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5725 if (phys_blkno) {
5734 if (ret) 5726 if (flags & OCFS2_EXT_REFCOUNTED)
5735 mlog_errno(ret); 5727 ret = ocfs2_decrease_refcount(inode, handle,
5728 ocfs2_blocks_to_clusters(osb->sb,
5729 phys_blkno),
5730 len, meta_ac,
5731 dealloc, 1);
5732 else
5733 ret = ocfs2_truncate_log_append(osb, handle,
5734 phys_blkno, len);
5735 if (ret)
5736 mlog_errno(ret);
5737
5738 }
5736 5739
5737out_commit: 5740out_commit:
5738 ocfs2_commit_trans(osb, handle); 5741 ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
5742 if (meta_ac) 5745 if (meta_ac)
5743 ocfs2_free_alloc_context(meta_ac); 5746 ocfs2_free_alloc_context(meta_ac);
5744 5747
5748 if (ref_tree)
5749 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5750
5745 return ret; 5751 return ret;
5746} 5752}
5747 5753
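
ocfs2_remove_btree_range() now frees the physical clusters in one of two ways, chosen per extent. A small model of that dispatch; the OCFS2_EXT_REFCOUNTED value matches ocfs2_fs.h of this era but is restated here as an assumption:

#include <stdio.h>

#define OCFS2_EXT_REFCOUNTED 0x02 /* assumed value, per ocfs2_fs.h */

static const char *release_strategy(unsigned long long phys_blkno, int flags)
{
        if (!phys_blkno)
                return "nothing to free (hole)";
        if (flags & OCFS2_EXT_REFCOUNTED)
                return "decrease refcount";           /* shared clusters */
        return "append to truncate log";              /* exclusive clusters */
}

int main(void)
{
        printf("%s\n", release_strategy(0, 0));
        printf("%s\n", release_strategy(1234, OCFS2_EXT_REFCOUNTED));
        printf("%s\n", release_strategy(1234, 0));
        return 0;
}
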
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5850 } 5856 }
5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5857 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5852 5858
5853 status = ocfs2_journal_dirty(handle, tl_bh); 5859 ocfs2_journal_dirty(handle, tl_bh);
5854 if (status < 0) {
5855 mlog_errno(status);
5856 goto bail;
5857 }
5858 5860
5859bail: 5861bail:
5860 mlog_exit(status); 5862 mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5893 5895
5894 tl->tl_used = cpu_to_le16(i); 5896 tl->tl_used = cpu_to_le16(i);
5895 5897
5896 status = ocfs2_journal_dirty(handle, tl_bh); 5898 ocfs2_journal_dirty(handle, tl_bh);
5897 if (status < 0) {
5898 mlog_errno(status);
5899 goto bail;
5900 }
5901 5899
5902 /* TODO: Perhaps we can calculate the bulk of the 5900 /* TODO: Perhaps we can calculate the bulk of the
5903 * credits up front rather than extending like 5901 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6298 */ 6296 */
6299struct ocfs2_cached_block_free { 6297struct ocfs2_cached_block_free {
6300 struct ocfs2_cached_block_free *free_next; 6298 struct ocfs2_cached_block_free *free_next;
6299 u64 free_bg;
6301 u64 free_blk; 6300 u64 free_blk;
6302 unsigned int free_bit; 6301 unsigned int free_bit;
6303}; 6302};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6344 } 6343 }
6345 6344
6346 while (head) { 6345 while (head) {
6347 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6346 if (head->free_bg)
6348 head->free_bit); 6347 bg_blkno = head->free_bg;
6348 else
6349 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6350 head->free_bit);
6349 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6351 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6350 head->free_bit, (unsigned long long)head->free_blk); 6352 head->free_bit, (unsigned long long)head->free_blk);
6351 6353
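
Together with the kmalloc()-to-kzalloc() switches below, the zeroed free_bg field means "no cached group, compute it": a nonzero suballocator location stored at dealloc time wins, otherwise the group is derived from the block number. A standalone model (the real ocfs2_which_suballoc_group() is effectively block minus bit):

#include <stdint.h>
#include <stdio.h>

static uint64_t which_suballoc_group(uint64_t blkno, unsigned int bit)
{
        /* Stand-in mirroring the shape of the real helper. */
        return blkno - bit;
}

static uint64_t group_for_free(uint64_t free_bg, uint64_t free_blk,
                               unsigned int free_bit)
{
        return free_bg ? free_bg : which_suballoc_group(free_blk, free_bit);
}

int main(void)
{
        printf("%llu\n", (unsigned long long)group_for_free(0, 4096, 7));
        printf("%llu\n", (unsigned long long)group_for_free(2048, 4096, 7));
        return 0;
}
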
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6393 int ret = 0; 6395 int ret = 0;
6394 struct ocfs2_cached_block_free *item; 6396 struct ocfs2_cached_block_free *item;
6395 6397
6396 item = kmalloc(sizeof(*item), GFP_NOFS); 6398 item = kzalloc(sizeof(*item), GFP_NOFS);
6397 if (item == NULL) { 6399 if (item == NULL) {
6398 ret = -ENOMEM; 6400 ret = -ENOMEM;
6399 mlog_errno(ret); 6401 mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
6533} 6535}
6534 6536
6535int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6537int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6536 int type, int slot, u64 blkno, 6538 int type, int slot, u64 suballoc,
6537 unsigned int bit) 6539 u64 blkno, unsigned int bit)
6538{ 6540{
6539 int ret; 6541 int ret;
6540 struct ocfs2_per_slot_free_list *fl; 6542 struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547 goto out; 6549 goto out;
6548 } 6550 }
6549 6551
6550 item = kmalloc(sizeof(*item), GFP_NOFS); 6552 item = kzalloc(sizeof(*item), GFP_NOFS);
6551 if (item == NULL) { 6553 if (item == NULL) {
6552 ret = -ENOMEM; 6554 ret = -ENOMEM;
6553 mlog_errno(ret); 6555 mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6557 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6559 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6558 type, slot, bit, (unsigned long long)blkno); 6560 type, slot, bit, (unsigned long long)blkno);
6559 6561
6562 item->free_bg = suballoc;
6560 item->free_blk = blkno; 6563 item->free_blk = blkno;
6561 item->free_bit = bit; 6564 item->free_bit = bit;
6562 item->free_next = fl->f_first; 6565 item->free_next = fl->f_first;
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6573{ 6576{
6574 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6577 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6575 le16_to_cpu(eb->h_suballoc_slot), 6578 le16_to_cpu(eb->h_suballoc_slot),
6579 le64_to_cpu(eb->h_suballoc_loc),
6576 le64_to_cpu(eb->h_blkno), 6580 le64_to_cpu(eb->h_blkno),
6577 le16_to_cpu(eb->h_suballoc_bit)); 6581 le16_to_cpu(eb->h_suballoc_bit));
6578} 6582}
6579 6583
6580/* This function will figure out whether the currently last extent
6581 * block will be deleted, and if it will, what the new last extent
6582 * block will be so we can update its h_next_leaf_blk field, as well
6583 * as the dinodes i_last_eb_blk */
6584static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6585 unsigned int clusters_to_del,
6586 struct ocfs2_path *path,
6587 struct buffer_head **new_last_eb)
6588{
6589 int next_free, ret = 0;
6590 u32 cpos;
6591 struct ocfs2_extent_rec *rec;
6592 struct ocfs2_extent_block *eb;
6593 struct ocfs2_extent_list *el;
6594 struct buffer_head *bh = NULL;
6595
6596 *new_last_eb = NULL;
6597
6598 /* we have no tree, so of course, no last_eb. */
6599 if (!path->p_tree_depth)
6600 goto out;
6601
6602 /* trunc to zero special case - this makes tree_depth = 0
6603 * regardless of what it is. */
6604 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6605 goto out;
6606
6607 el = path_leaf_el(path);
6608 BUG_ON(!el->l_next_free_rec);
6609
6610 /*
6611 * Make sure that this extent list will actually be empty
6612 * after we clear away the data. We can shortcut out if
6613 * there's more than one non-empty extent in the
6614 * list. Otherwise, a check of the remaining extent is
6615 * necessary.
6616 */
6617 next_free = le16_to_cpu(el->l_next_free_rec);
6618 rec = NULL;
6619 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 if (next_free > 2)
6621 goto out;
6622
6623 /* We may have a valid extent in index 1, check it. */
6624 if (next_free == 2)
6625 rec = &el->l_recs[1];
6626
6627 /*
6628 * Fall through - no more nonempty extents, so we want
6629 * to delete this leaf.
6630 */
6631 } else {
6632 if (next_free > 1)
6633 goto out;
6634
6635 rec = &el->l_recs[0];
6636 }
6637
6638 if (rec) {
6639 /*
6640 * Check that we'll only be trimming off the end of this
6641 * cluster.
6642 */
6643 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6644 goto out;
6645 }
6646
6647 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6648 if (ret) {
6649 mlog_errno(ret);
6650 goto out;
6651 }
6652
6653 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6654 if (ret) {
6655 mlog_errno(ret);
6656 goto out;
6657 }
6658
6659 eb = (struct ocfs2_extent_block *) bh->b_data;
6660 el = &eb->h_list;
6661
6662 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6663 * Any corruption is a code bug. */
6664 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6665
6666 *new_last_eb = bh;
6667 get_bh(*new_last_eb);
6668 mlog(0, "returning block %llu, (cpos: %u)\n",
6669 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6670out:
6671 brelse(bh);
6672
6673 return ret;
6674}
6675
6676/*
6677 * Trim some clusters off the rightmost edge of a tree. Only called
6678 * during truncate.
6679 *
6680 * The caller needs to:
6681 * - start journaling of each path component.
6682 * - compute and fully set up any new last ext block
6683 */
6684static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6685 handle_t *handle, struct ocfs2_truncate_context *tc,
6686 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6687{
6688 int ret, i, index = path->p_tree_depth;
6689 u32 new_edge = 0;
6690 u64 deleted_eb = 0;
6691 struct buffer_head *bh;
6692 struct ocfs2_extent_list *el;
6693 struct ocfs2_extent_rec *rec;
6694
6695 *delete_start = 0;
6696 *flags = 0;
6697
6698 while (index >= 0) {
6699 bh = path->p_node[index].bh;
6700 el = path->p_node[index].el;
6701
6702 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6703 index, (unsigned long long)bh->b_blocknr);
6704
6705 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6706
6707 if (index !=
6708 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6709 ocfs2_error(inode->i_sb,
6710 "Inode %lu has invalid ext. block %llu",
6711 inode->i_ino,
6712 (unsigned long long)bh->b_blocknr);
6713 ret = -EROFS;
6714 goto out;
6715 }
6716
6717find_tail_record:
6718 i = le16_to_cpu(el->l_next_free_rec) - 1;
6719 rec = &el->l_recs[i];
6720
6721 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6722 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6723 ocfs2_rec_clusters(el, rec),
6724 (unsigned long long)le64_to_cpu(rec->e_blkno),
6725 le16_to_cpu(el->l_next_free_rec));
6726
6727 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6728
6729 if (le16_to_cpu(el->l_tree_depth) == 0) {
6730 /*
6731 * If the leaf block contains a single empty
6732 * extent and no records, we can just remove
6733 * the block.
6734 */
6735 if (i == 0 && ocfs2_is_empty_extent(rec)) {
6736 memset(rec, 0,
6737 sizeof(struct ocfs2_extent_rec));
6738 el->l_next_free_rec = cpu_to_le16(0);
6739
6740 goto delete;
6741 }
6742
6743 /*
6744 * Remove any empty extents by shifting things
6745 * left. That should make life much easier on
6746 * the code below. This condition is rare
6747 * enough that we shouldn't see a performance
6748 * hit.
6749 */
6750 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6751 le16_add_cpu(&el->l_next_free_rec, -1);
6752
6753 for(i = 0;
6754 i < le16_to_cpu(el->l_next_free_rec); i++)
6755 el->l_recs[i] = el->l_recs[i + 1];
6756
6757 memset(&el->l_recs[i], 0,
6758 sizeof(struct ocfs2_extent_rec));
6759
6760 /*
6761 * We've modified our extent list. The
6762 * simplest way to handle this change
6763 * is to begin the search from the
6764 * start again.
6765 */
6766 goto find_tail_record;
6767 }
6768
6769 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6770
6771 /*
6772 * We'll use "new_edge" on our way back up the
6773 * tree to know what our rightmost cpos is.
6774 */
6775 new_edge = le16_to_cpu(rec->e_leaf_clusters);
6776 new_edge += le32_to_cpu(rec->e_cpos);
6777
6778 /*
6779 * The caller will use this to delete data blocks.
6780 */
6781 *delete_start = le64_to_cpu(rec->e_blkno)
6782 + ocfs2_clusters_to_blocks(inode->i_sb,
6783 le16_to_cpu(rec->e_leaf_clusters));
6784 *flags = rec->e_flags;
6785
6786 /*
6787 * If it's now empty, remove this record.
6788 */
6789 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6790 memset(rec, 0,
6791 sizeof(struct ocfs2_extent_rec));
6792 le16_add_cpu(&el->l_next_free_rec, -1);
6793 }
6794 } else {
6795 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6796 memset(rec, 0,
6797 sizeof(struct ocfs2_extent_rec));
6798 le16_add_cpu(&el->l_next_free_rec, -1);
6799
6800 goto delete;
6801 }
6802
6803 /* Can this actually happen? */
6804 if (le16_to_cpu(el->l_next_free_rec) == 0)
6805 goto delete;
6806
6807 /*
6808 * We never actually deleted any clusters
6809 * because our leaf was empty. There's no
6810 * reason to adjust the rightmost edge then.
6811 */
6812 if (new_edge == 0)
6813 goto delete;
6814
6815 rec->e_int_clusters = cpu_to_le32(new_edge);
6816 le32_add_cpu(&rec->e_int_clusters,
6817 -le32_to_cpu(rec->e_cpos));
6818
6819 /*
6820 * A deleted child record should have been
6821 * caught above.
6822 */
6823 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6824 }
6825
6826delete:
6827 ret = ocfs2_journal_dirty(handle, bh);
6828 if (ret) {
6829 mlog_errno(ret);
6830 goto out;
6831 }
6832
6833 mlog(0, "extent list container %llu, after: record %d: "
6834 "(%u, %u, %llu), next = %u.\n",
6835 (unsigned long long)bh->b_blocknr, i,
6836 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6837 (unsigned long long)le64_to_cpu(rec->e_blkno),
6838 le16_to_cpu(el->l_next_free_rec));
6839
6840 /*
6841 * We must be careful to only attempt delete of an
6842 * extent block (and not the root inode block).
6843 */
6844 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6845 struct ocfs2_extent_block *eb =
6846 (struct ocfs2_extent_block *)bh->b_data;
6847
6848 /*
6849 * Save this for use when processing the
6850 * parent block.
6851 */
6852 deleted_eb = le64_to_cpu(eb->h_blkno);
6853
6854 mlog(0, "deleting this extent block.\n");
6855
6856 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6857
6858 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6859 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6860 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6861
6862 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6863 /* An error here is not fatal. */
6864 if (ret < 0)
6865 mlog_errno(ret);
6866 } else {
6867 deleted_eb = 0;
6868 }
6869
6870 index--;
6871 }
6872
6873 ret = 0;
6874out:
6875 return ret;
6876}
6877
6878static int ocfs2_do_truncate(struct ocfs2_super *osb,
6879 unsigned int clusters_to_del,
6880 struct inode *inode,
6881 struct buffer_head *fe_bh,
6882 handle_t *handle,
6883 struct ocfs2_truncate_context *tc,
6884 struct ocfs2_path *path,
6885 struct ocfs2_alloc_context *meta_ac)
6886{
6887 int status;
6888 struct ocfs2_dinode *fe;
6889 struct ocfs2_extent_block *last_eb = NULL;
6890 struct ocfs2_extent_list *el;
6891 struct buffer_head *last_eb_bh = NULL;
6892 u64 delete_blk = 0;
6893 u8 rec_flags;
6894
6895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6896
6897 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6898 path, &last_eb_bh);
6899 if (status < 0) {
6900 mlog_errno(status);
6901 goto bail;
6902 }
6903
6904 /*
6905 * Each component will be touched, so we might as well journal
6906 * here to avoid having to handle errors later.
6907 */
6908 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6909 if (status < 0) {
6910 mlog_errno(status);
6911 goto bail;
6912 }
6913
6914 if (last_eb_bh) {
6915 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6916 OCFS2_JOURNAL_ACCESS_WRITE);
6917 if (status < 0) {
6918 mlog_errno(status);
6919 goto bail;
6920 }
6921
6922 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6923 }
6924
6925 el = &(fe->id2.i_list);
6926
6927 /*
6928 * Lower levels depend on this never happening, but it's best
6929 * to check it up here before changing the tree.
6930 */
6931 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6932 ocfs2_error(inode->i_sb,
6933 "Inode %lu has an empty extent record, depth %u\n",
6934 inode->i_ino, le16_to_cpu(el->l_tree_depth));
6935 status = -EROFS;
6936 goto bail;
6937 }
6938
6939 dquot_free_space_nodirty(inode,
6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6943 clusters_to_del;
6944 spin_unlock(&OCFS2_I(inode)->ip_lock);
6945 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6946 inode->i_blocks = ocfs2_inode_sector_count(inode);
6947
6948 status = ocfs2_trim_tree(inode, path, handle, tc,
6949 clusters_to_del, &delete_blk, &rec_flags);
6950 if (status) {
6951 mlog_errno(status);
6952 goto bail;
6953 }
6954
6955 if (le32_to_cpu(fe->i_clusters) == 0) {
6956 /* trunc to zero is a special case. */
6957 el->l_tree_depth = 0;
6958 fe->i_last_eb_blk = 0;
6959 } else if (last_eb)
6960 fe->i_last_eb_blk = last_eb->h_blkno;
6961
6962 status = ocfs2_journal_dirty(handle, fe_bh);
6963 if (status < 0) {
6964 mlog_errno(status);
6965 goto bail;
6966 }
6967
6968 if (last_eb) {
6969 /* If there will be a new last extent block, then by
6970 * definition, there cannot be any leaves to the right of
6971 * it. */
6972 last_eb->h_next_leaf_blk = 0;
6973 status = ocfs2_journal_dirty(handle, last_eb_bh);
6974 if (status < 0) {
6975 mlog_errno(status);
6976 goto bail;
6977 }
6978 }
6979
6980 if (delete_blk) {
6981 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6982 status = ocfs2_decrease_refcount(inode, handle,
6983 ocfs2_blocks_to_clusters(osb->sb,
6984 delete_blk),
6985 clusters_to_del, meta_ac,
6986 &tc->tc_dealloc, 1);
6987 else
6988 status = ocfs2_truncate_log_append(osb, handle,
6989 delete_blk,
6990 clusters_to_del);
6991 if (status < 0) {
6992 mlog_errno(status);
6993 goto bail;
6994 }
6995 }
6996 status = 0;
6997bail:
6998 brelse(last_eb_bh);
6999 mlog_exit(status);
7000 return status;
7001}
7002
7003static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) 6584static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
7004{ 6585{
7005 set_buffer_uptodate(bh); 6586 set_buffer_uptodate(bh);
@@ -7091,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
7091 last_page_bytes = PAGE_ALIGN(end); 6672 last_page_bytes = PAGE_ALIGN(end);
7092 index = start >> PAGE_CACHE_SHIFT; 6673 index = start >> PAGE_CACHE_SHIFT;
7093 do { 6674 do {
7094 pages[numpages] = grab_cache_page(mapping, index); 6675 pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
7095 if (!pages[numpages]) { 6676 if (!pages[numpages]) {
7096 ret = -ENOMEM; 6677 ret = -ENOMEM;
7097 mlog_errno(ret); 6678 mlog_errno(ret);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7307 goto out_commit; 6888 goto out_commit;
7308 did_quota = 1; 6889 did_quota = 1;
7309 6890
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6891 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
6892
6893 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7311 &num); 6894 &num);
7312 if (ret) { 6895 if (ret) {
7313 mlog_errno(ret); 6896 mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
7406 */ 6989 */
7407int ocfs2_commit_truncate(struct ocfs2_super *osb, 6990int ocfs2_commit_truncate(struct ocfs2_super *osb,
7408 struct inode *inode, 6991 struct inode *inode,
7409 struct buffer_head *fe_bh, 6992 struct buffer_head *di_bh)
7410 struct ocfs2_truncate_context *tc)
7411{ 6993{
7412 int status, i, credits, tl_sem = 0; 6994 int status = 0, i, flags = 0;
7413 u32 clusters_to_del, new_highest_cpos, range; 6995 u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7414 u64 blkno = 0; 6996 u64 blkno = 0;
7415 struct ocfs2_extent_list *el; 6997 struct ocfs2_extent_list *el;
7416 handle_t *handle = NULL; 6998 struct ocfs2_extent_rec *rec;
7417 struct inode *tl_inode = osb->osb_tl_inode;
7418 struct ocfs2_path *path = NULL; 6999 struct ocfs2_path *path = NULL;
7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7000 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL; 7001 struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7421 struct ocfs2_refcount_tree *ref_tree = NULL; 7002 u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7003 struct ocfs2_extent_tree et;
7004 struct ocfs2_cached_dealloc_ctxt dealloc;
7422 7005
7423 mlog_entry_void(); 7006 mlog_entry_void();
7424 7007
7008 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7009 ocfs2_init_dealloc_ctxt(&dealloc);
7010
7425 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7011 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7426 i_size_read(inode)); 7012 i_size_read(inode));
7427 7013
7428 path = ocfs2_new_path(fe_bh, &di->id2.i_list, 7014 path = ocfs2_new_path(di_bh, &di->id2.i_list,
7429 ocfs2_journal_access_di); 7015 ocfs2_journal_access_di);
7430 if (!path) { 7016 if (!path) {
7431 status = -ENOMEM; 7017 status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
7444 goto bail; 7030 goto bail;
7445 } 7031 }
7446 7032
7447 credits = 0;
7448
7449 /* 7033 /*
7450 * Truncate always works against the rightmost tree branch. 7034 * Truncate always works against the rightmost tree branch.
7451 */ 7035 */
@@ -7480,101 +7064,62 @@ start:
7480 } 7064 }
7481 7065
7482 i = le16_to_cpu(el->l_next_free_rec) - 1; 7066 i = le16_to_cpu(el->l_next_free_rec) - 1;
7483 range = le32_to_cpu(el->l_recs[i].e_cpos) + 7067 rec = &el->l_recs[i];
7484 ocfs2_rec_clusters(el, &el->l_recs[i]); 7068 flags = rec->e_flags;
7485 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 7069 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7486 clusters_to_del = 0; 7070
7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7071 if (i == 0 && ocfs2_is_empty_extent(rec)) {
7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7072 /*
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 7073 * Lower levels depend on this never happening, but it's best
7074 * to check it up here before changing the tree.
7075 */
7076 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7077 ocfs2_error(inode->i_sb, "Inode %lu has an empty "
7078 "extent record, depth %u\n", inode->i_ino,
7079 le16_to_cpu(root_el->l_tree_depth));
7080 status = -EROFS;
7081 goto bail;
7082 }
7083 trunc_cpos = le32_to_cpu(rec->e_cpos);
7084 trunc_len = 0;
7085 blkno = 0;
7086 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7087 /*
7088 * Truncate entire record.
7089 */
7090 trunc_cpos = le32_to_cpu(rec->e_cpos);
7091 trunc_len = ocfs2_rec_clusters(el, rec);
7092 blkno = le64_to_cpu(rec->e_blkno);
7490 } else if (range > new_highest_cpos) { 7093 } else if (range > new_highest_cpos) {
7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7094 /*
7492 le32_to_cpu(el->l_recs[i].e_cpos)) - 7095 * Partial truncate. It should also be
7493 new_highest_cpos; 7096 * the last truncate we're doing.
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) + 7097 */
7495 ocfs2_clusters_to_blocks(inode->i_sb, 7098 trunc_cpos = new_highest_cpos;
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) - 7099 trunc_len = range - new_highest_cpos;
7497 clusters_to_del); 7100 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7101 blkno = le64_to_cpu(rec->e_blkno) +
7102 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7498 } else { 7103 } else {
7104 /*
7105 * Truncate completed, leave happily.
7106 */
7499 status = 0; 7107 status = 0;
7500 goto bail; 7108 goto bail;
7501 } 7109 }
7502 7110
7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7111 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7529 mutex_lock(&tl_inode->i_mutex);
7530 tl_sem = 1;
7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
7532 * record is free for use. If there isn't any, we flush to get
7533 * an empty truncate log. */
7534 if (ocfs2_truncate_log_needs_flush(osb)) {
7535 status = __ocfs2_flush_truncate_log(osb);
7536 if (status < 0) {
7537 mlog_errno(status);
7538 goto bail;
7539 }
7540 }
7541 7112
7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7113 status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7543 (struct ocfs2_dinode *)fe_bh->b_data, 7114 phys_cpos, trunc_len, flags, &dealloc,
7544 el); 7115 refcount_loc);
7545 handle = ocfs2_start_trans(osb, credits);
7546 if (IS_ERR(handle)) {
7547 status = PTR_ERR(handle);
7548 handle = NULL;
7549 mlog_errno(status);
7550 goto bail;
7551 }
7552
7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7554 tc, path, meta_ac);
7555 if (status < 0) { 7116 if (status < 0) {
7556 mlog_errno(status); 7117 mlog_errno(status);
7557 goto bail; 7118 goto bail;
7558 } 7119 }
7559 7120
7560 mutex_unlock(&tl_inode->i_mutex);
7561 tl_sem = 0;
7562
7563 ocfs2_commit_trans(osb, handle);
7564 handle = NULL;
7565
7566 ocfs2_reinit_path(path, 1); 7121 ocfs2_reinit_path(path, 1);
7567 7122
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7578 /* 7123 /*
7579 * The check above will catch the case where we've truncated 7124 * The check above will catch the case where we've truncated
7580 * away all allocation. 7125 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:
7585 7130
7586 ocfs2_schedule_truncate_log_flush(osb, 1); 7131 ocfs2_schedule_truncate_log_flush(osb, 1);
7587 7132
7588 if (tl_sem) 7133 ocfs2_run_deallocs(osb, &dealloc);
7589 mutex_unlock(&tl_inode->i_mutex);
7590
7591 if (handle)
7592 ocfs2_commit_trans(osb, handle);
7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7601 7134
7602 ocfs2_free_path(path); 7135 ocfs2_free_path(path);
7603 7136
7604 /* This will drop the ext_alloc cluster lock for us */
7605 ocfs2_free_truncate_context(tc);
7606
7607 mlog_exit(status); 7137 mlog_exit(status);
7608 return status; 7138 return status;
7609} 7139}
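
After this rewrite, each pass of ocfs2_commit_truncate() reduces to picking a (trunc_cpos, trunc_len, blkno) triple off the rightmost record and handing it to ocfs2_remove_btree_range(). A compilable model of the per-pass computation, with an arbitrary cluster-to-block shift standing in for ocfs2_clusters_to_blocks():

#include <stdint.h>
#include <stdio.h>

struct rec { uint32_t e_cpos; uint32_t clusters; uint64_t e_blkno; };

#define CL_TO_BLK_SHIFT 4 /* example value only */

static void trunc_pass(const struct rec *r, uint32_t new_highest_cpos,
                       int is_empty, uint32_t *trunc_cpos,
                       uint32_t *trunc_len, uint64_t *blkno)
{
        uint32_t range = r->e_cpos + r->clusters;

        *trunc_cpos = *trunc_len = 0;
        *blkno = 0;
        if (is_empty) {
                *trunc_cpos = r->e_cpos;        /* drop the empty record */
        } else if (r->e_cpos >= new_highest_cpos) {
                *trunc_cpos = r->e_cpos;        /* whole record goes */
                *trunc_len = r->clusters;
                *blkno = r->e_blkno;
        } else if (range > new_highest_cpos) {
                uint32_t coff = new_highest_cpos - r->e_cpos;

                *trunc_cpos = new_highest_cpos; /* partial, final pass */
                *trunc_len = range - new_highest_cpos;
                *blkno = r->e_blkno + ((uint64_t)coff << CL_TO_BLK_SHIFT);
        }                                       /* else: truncate done */
}

int main(void)
{
        struct rec r = { 100, 50, 1 << CL_TO_BLK_SHIFT };
        uint32_t cpos, len;
        uint64_t blk;

        trunc_pass(&r, 120, 0, &cpos, &len, &blk);
        printf("cpos=%u len=%u blkno=%llu\n", cpos, len,
               (unsigned long long)blk); /* cpos=120 len=30 blkno=336 */
        return 0;
}
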
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
140 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
141int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
142 struct ocfs2_extent_tree *et, 142 struct ocfs2_extent_tree *et,
143 u32 cpos, u32 phys_cpos, u32 len, 143 u32 cpos, u32 phys_cpos, u32 len, int flags,
144 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc,
145 u64 refcount_loc);
145 146
146int ocfs2_num_free_extents(struct ocfs2_super *osb, 147int ocfs2_num_free_extents(struct ocfs2_super *osb,
147 struct ocfs2_extent_tree *et); 148 struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 210int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
210 u64 blkno, unsigned int bit); 211 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 212int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno, 213 int type, int slot, u64 suballoc, u64 blkno,
213 unsigned int bit); 214 unsigned int bit);
214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 215static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
215{ 216{
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
233 struct ocfs2_truncate_context **tc); 234 struct ocfs2_truncate_context **tc);
234int ocfs2_commit_truncate(struct ocfs2_super *osb, 235int ocfs2_commit_truncate(struct ocfs2_super *osb,
235 struct inode *inode, 236 struct inode *inode,
236 struct buffer_head *fe_bh, 237 struct buffer_head *di_bh);
237 struct ocfs2_truncate_context *tc);
238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
239 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
240 240
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos); 321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
323 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 324int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left, 325 struct ocfs2_path *left,
324 struct ocfs2_path *right); 326 struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb5506..0de69c9a08be 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
196 dump_stack(); 196 dump_stack();
197 goto bail; 197 goto bail;
198 } 198 }
199
200 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
201 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
202 (unsigned long long)past_eof);
203
204 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result);
206 } 199 }
207 200
201 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
202 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
203 (unsigned long long)past_eof);
204 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result);
206
208bail: 207bail:
209 if (err < 0) 208 if (err < 0)
210 err = -EIO; 209 err = -EIO;
@@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle,
459 return ret; 458 return ret;
460} 459}
461 460
462handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
463 struct page *page,
464 unsigned from,
465 unsigned to)
466{
467 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
468 handle_t *handle;
469 int ret = 0;
470
471 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
472 if (IS_ERR(handle)) {
473 ret = -ENOMEM;
474 mlog_errno(ret);
475 goto out;
476 }
477
478 if (ocfs2_should_order_data(inode)) {
479 ret = ocfs2_jbd2_file_inode(handle, inode);
480 if (ret < 0)
481 mlog_errno(ret);
482 }
483out:
484 if (ret) {
485 if (!IS_ERR(handle))
486 ocfs2_commit_trans(osb, handle);
487 handle = ERR_PTR(ret);
488 }
489 return handle;
490}
491
492static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 461static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
493{ 462{
494 sector_t status; 463 sector_t status;
@@ -609,7 +578,9 @@ bail:
609static void ocfs2_dio_end_io(struct kiocb *iocb, 578static void ocfs2_dio_end_io(struct kiocb *iocb,
610 loff_t offset, 579 loff_t offset,
611 ssize_t bytes, 580 ssize_t bytes,
612 void *private) 581 void *private,
582 int ret,
583 bool is_async)
613{ 584{
614 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 585 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
615 int level; 586 int level;
@@ -623,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
623 if (!level) 594 if (!level)
624 up_read(&inode->i_alloc_sem); 595 up_read(&inode->i_alloc_sem);
625 ocfs2_rw_unlock(inode, level); 596 ocfs2_rw_unlock(inode, level);
597
598 if (is_async)
599 aio_complete(iocb, ret, 0);
626} 600}
627 601
628/* 602/*
@@ -669,11 +643,10 @@ static ssize_t ocfs2_direct_IO(int rw,
669 if (i_size_read(inode) <= offset) 643 if (i_size_read(inode) <= offset)
670 return 0; 644 return 0;
671 645
672 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 646 ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
673 inode->i_sb->s_bdev, iov, offset, 647 iov, offset, nr_segs,
674 nr_segs, 648 ocfs2_direct_IO_get_blocks,
675 ocfs2_direct_IO_get_blocks, 649 ocfs2_dio_end_io, NULL, 0);
676 ocfs2_dio_end_io);
677 650
678 mlog_exit(ret); 651 mlog_exit(ret);
679 return ret; 652 return ret;
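
Both hunks above track the 2.6.36 direct-I/O interface: the end_io hook now receives the request's result and an is_async flag, and must complete asynchronous requests itself. A userspace model of that contract with stubbed types:

#include <stdbool.h>
#include <stdio.h>

struct kiocb { int done; long res; };

static void aio_complete_stub(struct kiocb *iocb, long res, long res2)
{
        iocb->done = 1;
        iocb->res = res;
        (void)res2;
}

static void dio_end_io(struct kiocb *iocb, long ret, bool is_async)
{
        /* ...drop filesystem locks here, as the real hook does... */
        if (is_async)
                aio_complete_stub(iocb, ret, 0);
}

int main(void)
{
        struct kiocb sync_io = { 0 }, async_io = { 0 };

        dio_end_io(&sync_io, 4096, false);  /* sync path returns normally */
        dio_end_io(&async_io, 4096, true);  /* async path must complete */
        printf("%d %d\n", sync_io.done, async_io.done); /* 0 1 */
        return 0;
}
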
@@ -1131,23 +1104,37 @@ out:
1131 */ 1104 */
1132static int ocfs2_grab_pages_for_write(struct address_space *mapping, 1105static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1133 struct ocfs2_write_ctxt *wc, 1106 struct ocfs2_write_ctxt *wc,
1134 u32 cpos, loff_t user_pos, int new, 1107 u32 cpos, loff_t user_pos,
1108 unsigned user_len, int new,
1135 struct page *mmap_page) 1109 struct page *mmap_page)
1136{ 1110{
1137 int ret = 0, i; 1111 int ret = 0, i;
1138 unsigned long start, target_index, index; 1112 unsigned long start, target_index, end_index, index;
1139 struct inode *inode = mapping->host; 1113 struct inode *inode = mapping->host;
1114 loff_t last_byte;
1140 1115
1141 target_index = user_pos >> PAGE_CACHE_SHIFT; 1116 target_index = user_pos >> PAGE_CACHE_SHIFT;
1142 1117
1143 /* 1118 /*
1144 * Figure out how many pages we'll be manipulating here. For 1119 * Figure out how many pages we'll be manipulating here. For
1145 * non allocating write, we just change the one 1120 * non allocating write, we just change the one
1146 * page. Otherwise, we'll need a whole cluster's worth. 1121 * page. Otherwise, we'll need a whole cluster's worth. If we're
1122 * writing past i_size, we only need enough pages to cover the
1123 * last page of the write.
1147 */ 1124 */
1148 if (new) { 1125 if (new) {
1149 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); 1126 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1150 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); 1127 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1128 /*
1129 * We need the index *past* the last page we could possibly
1130 * touch. This is the page past the end of the write or
1131 * i_size, whichever is greater.
1132 */
1133 last_byte = max(user_pos + user_len, i_size_read(inode));
1134 BUG_ON(last_byte < 1);
1135 end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
1136 if ((start + wc->w_num_pages) > end_index)
1137 wc->w_num_pages = end_index - start;
1151 } else { 1138 } else {
1152 wc->w_num_pages = 1; 1139 wc->w_num_pages = 1;
1153 start = target_index; 1140 start = target_index;
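
The added clamp stops an allocating write from grabbing a full cluster of pages when both the write and i_size end earlier. A worked model of the arithmetic, assuming 4K pages (PAGE_CACHE_SHIFT == 12):

#include <stdio.h>

#define PAGE_SHIFT 12 /* assumed 4K pages */

static unsigned long clamp_num_pages(unsigned long start,
                                     unsigned long pages_per_cluster,
                                     long long user_pos, unsigned user_len,
                                     long long i_size)
{
        long long last_byte =
                (user_pos + user_len > i_size) ? user_pos + user_len : i_size;
        unsigned long end_index = (unsigned long)((last_byte - 1) >> PAGE_SHIFT) + 1;
        unsigned long num = pages_per_cluster;

        if (start + num > end_index)    /* don't grab pages past the end */
                num = end_index - start;
        return num;
}

int main(void)
{
        /* A 1MB cluster is 256 pages, but a 4K write at offset 0 of an
         * empty file only needs one of them. */
        printf("%lu\n", clamp_num_pages(0, 256, 0, 4096, 0)); /* 1 */
        return 0;
}
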
@@ -1620,21 +1607,20 @@ out:
1620 * write path can treat it as a non-allocating write, which has no 1607 * write path can treat it as a non-allocating write, which has no
1621 * special case code for sparse/nonsparse files. 1608 * special case code for sparse/nonsparse files.
1622 */ 1609 */
1623static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, 1610static int ocfs2_expand_nonsparse_inode(struct inode *inode,
1624 unsigned len, 1611 struct buffer_head *di_bh,
1612 loff_t pos, unsigned len,
1625 struct ocfs2_write_ctxt *wc) 1613 struct ocfs2_write_ctxt *wc)
1626{ 1614{
1627 int ret; 1615 int ret;
1628 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1629 loff_t newsize = pos + len; 1616 loff_t newsize = pos + len;
1630 1617
1631 if (ocfs2_sparse_alloc(osb)) 1618 BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1632 return 0;
1633 1619
1634 if (newsize <= i_size_read(inode)) 1620 if (newsize <= i_size_read(inode))
1635 return 0; 1621 return 0;
1636 1622
1637 ret = ocfs2_extend_no_holes(inode, newsize, pos); 1623 ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
1638 if (ret) 1624 if (ret)
1639 mlog_errno(ret); 1625 mlog_errno(ret);
1640 1626
@@ -1644,6 +1630,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
1644 return ret; 1630 return ret;
1645} 1631}
1646 1632
1633static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1634 loff_t pos)
1635{
1636 int ret = 0;
1637
1638 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1639 if (pos > i_size_read(inode))
1640 ret = ocfs2_zero_extend(inode, di_bh, pos);
1641
1642 return ret;
1643}
1644
1647int ocfs2_write_begin_nolock(struct address_space *mapping, 1645int ocfs2_write_begin_nolock(struct address_space *mapping,
1648 loff_t pos, unsigned len, unsigned flags, 1646 loff_t pos, unsigned len, unsigned flags,
1649 struct page **pagep, void **fsdata, 1647 struct page **pagep, void **fsdata,
@@ -1679,7 +1677,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1679 } 1677 }
1680 } 1678 }
1681 1679
1682 ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); 1680 if (ocfs2_sparse_alloc(osb))
1681 ret = ocfs2_zero_tail(inode, di_bh, pos);
1682 else
1683 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
1684 wc);
1683 if (ret) { 1685 if (ret) {
1684 mlog_errno(ret); 1686 mlog_errno(ret);
1685 goto out; 1687 goto out;
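
The net effect of the split above: sparse volumes only zero the gap between i_size and the write position, while non-sparse volumes extend the allocation so no hole remains. A toy model of the dispatch:

#include <stdio.h>

static const char *prepare_write(int sparse, long long pos, unsigned len,
                                 long long i_size)
{
        if (sparse)
                return pos > i_size ? "zero tail up to pos" : "nothing to do";
        return pos + len > i_size ? "extend with no holes" : "nothing to do";
}

int main(void)
{
        printf("%s\n", prepare_write(1, 8192, 100, 4096));
        printf("%s\n", prepare_write(0, 8192, 100, 4096));
        return 0;
}
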
@@ -1735,6 +1737,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1735 goto out; 1737 goto out;
1736 } 1738 }
1737 1739
1740 if (data_ac)
1741 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
1742
1738 credits = ocfs2_calc_extend_credits(inode->i_sb, 1743 credits = ocfs2_calc_extend_credits(inode->i_sb,
1739 &di->id2.i_list, 1744 &di->id2.i_list,
1740 clusters_to_alloc); 1745 clusters_to_alloc);
@@ -1786,7 +1791,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1786 * that we can zero and flush if we error after adding the 1791 * that we can zero and flush if we error after adding the
1787 * extent. 1792 * extent.
1788 */ 1793 */
1789 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, 1794 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
1790 cluster_of_pages, mmap_page); 1795 cluster_of_pages, mmap_page);
1791 if (ret) { 1796 if (ret) {
1792 mlog_errno(ret); 1797 mlog_errno(ret);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..c7ee03c22226 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
439 439
440 ocfs2_blockcheck_inc_failure(stats); 440 ocfs2_blockcheck_inc_failure(stats);
441 mlog(ML_ERROR, 441 mlog(ML_ERROR,
442 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", 442 "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
443 (unsigned int)check.bc_crc32e, (unsigned int)crc); 443 (unsigned int)check.bc_crc32e, (unsigned int)crc);
444 444
445 /* Ok, try ECC fixups */ 445 /* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
453 goto out; 453 goto out;
454 } 454 }
455 455
456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", 456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
457 (unsigned int)check.bc_crc32e, (unsigned int)crc); 457 (unsigned int)check.bc_crc32e, (unsigned int)crc);
458 458
459 rc = -EIO; 459 rc = -EIO;
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 3bb928a2bf7d..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
116 define_mask(ERROR), 116 define_mask(ERROR),
117 define_mask(NOTICE), 117 define_mask(NOTICE),
118 define_mask(KTHREAD), 118 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
119}; 120};
120 121
121static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
122 123
123#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
124#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
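
The new ML_RESERVATIONS bit feeds the standard masklog gate, where a message is emitted only if its bit is enabled in the current mask. A minimal standalone model of that gate:

#include <stdint.h>
#include <stdio.h>

#define ML_RESERVATIONS 0x0000000800000000ULL

static uint64_t enabled_mask;

static void mlog_stub(uint64_t mask, const char *msg)
{
        if (enabled_mask & mask)        /* gate on the per-subsystem bit */
                printf("%s\n", msg);
}

int main(void)
{
        mlog_stub(ML_RESERVATIONS, "suppressed: bit not enabled");
        enabled_mask |= ML_RESERVATIONS;
        mlog_stub(ML_RESERVATIONS, "reservation window moved");
        return 0;
}
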
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 73e743eea2c8..cbe2f057cc28 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
583 o2net_sc_queue_work(sc, &sc->sc_connect_work); 583 o2net_sc_queue_work(sc, &sc->sc_connect_work);
584 break; 584 break;
585 default: 585 default:
586 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
587 " shutdown, state %d\n",
588 SC_NODEF_ARGS(sc), sk->sk_state);
586 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 589 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
587 break; 590 break;
588 } 591 }
@@ -974,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
974int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 977int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
975 size_t caller_veclen, u8 target_node, int *status) 978 size_t caller_veclen, u8 target_node, int *status)
976{ 979{
977 int ret; 980 int ret = 0;
978 struct o2net_msg *msg = NULL; 981 struct o2net_msg *msg = NULL;
979 size_t veclen, caller_bytes = 0; 982 size_t veclen, caller_bytes = 0;
980 struct kvec *vec = NULL; 983 struct kvec *vec = NULL;
@@ -1756,6 +1759,7 @@ static int o2net_accept_one(struct socket *sock)
1756 struct sockaddr_in sin; 1759 struct sockaddr_in sin;
1757 struct socket *new_sock = NULL; 1760 struct socket *new_sock = NULL;
1758 struct o2nm_node *node = NULL; 1761 struct o2nm_node *node = NULL;
1762 struct o2nm_node *local_node = NULL;
1759 struct o2net_sock_container *sc = NULL; 1763 struct o2net_sock_container *sc = NULL;
1760 struct o2net_node *nn; 1764 struct o2net_node *nn;
1761 1765
@@ -1793,11 +1797,15 @@ static int o2net_accept_one(struct socket *sock)
1793 goto out; 1797 goto out;
1794 } 1798 }
1795 1799
1796 if (o2nm_this_node() > node->nd_num) { 1800 if (o2nm_this_node() >= node->nd_num) {
1797 mlog(ML_NOTICE, "unexpected connect attempted from a lower " 1801 local_node = o2nm_get_node_by_num(o2nm_this_node());
1798 "numbered node '%s' at " "%pI4:%d with num %u\n", 1802 mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
1799 node->nd_name, &sin.sin_addr.s_addr, 1803 "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
1800 ntohs(sin.sin_port), node->nd_num); 1804 local_node->nd_name, local_node->nd_num,
1805 &(local_node->nd_ipv4_address),
1806 ntohs(local_node->nd_ipv4_port),
1807 node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
1808 ntohs(sin.sin_port));
1801 ret = -EINVAL; 1809 ret = -EINVAL;
1802 goto out; 1810 goto out;
1803 } 1811 }
@@ -1854,6 +1862,8 @@ out:
1854 sock_release(new_sock); 1862 sock_release(new_sock);
1855 if (node) 1863 if (node)
1856 o2nm_node_put(node); 1864 o2nm_node_put(node);
1865 if (local_node)
1866 o2nm_node_put(local_node);
1857 if (sc) 1867 if (sc)
1858 sc_put(sc); 1868 sc_put(sc);
1859 return ret; 1869 return ret;
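
Changing '>' to '>=' makes a node reject a connect attempt carrying its own node number as well as attempts from lower-numbered peers; by o2net convention the higher-numbered node initiates the connection. A tiny model of the check:

#include <stdio.h>

static int accept_ok(unsigned this_node, unsigned peer_node)
{
        return !(this_node >= peer_node);
}

int main(void)
{
        printf("%d\n", accept_ok(3, 5)); /* 1: higher-numbered peer connects in */
        printf("%d\n", accept_ok(5, 3)); /* 0: lower-numbered peer must not */
        printf("%d\n", accept_ok(4, 4)); /* 0: self-connect now rejected too */
        return 0;
}
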
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c80..c49f6de0e7ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1194 else 1194 else
1195 de->inode = 0; 1195 de->inode = 0;
1196 dir->i_version++; 1196 dir->i_version++;
1197 status = ocfs2_journal_dirty(handle, bh); 1197 ocfs2_journal_dirty(handle, bh);
1198 goto bail; 1198 goto bail;
1199 } 1199 }
1200 i += le16_to_cpu(de->rec_len); 1200 i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
1752 ocfs2_recalc_free_list(dir, handle, lookup); 1752 ocfs2_recalc_free_list(dir, handle, lookup);
1753 1753
1754 dir->i_version++; 1754 dir->i_version++;
1755 status = ocfs2_journal_dirty(handle, insert_bh); 1755 ocfs2_journal_dirty(handle, insert_bh);
1756 retval = 0; 1756 retval = 0;
1757 goto bail; 1757 goto bail;
1758 } 1758 }
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2297 } 2297 }
2298 2298
2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); 2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2300
2301 ocfs2_journal_dirty(handle, di_bh); 2300 ocfs2_journal_dirty(handle, di_bh);
2302 if (ret) {
2303 mlog_errno(ret);
2304 goto out;
2305 }
2306 2301
2307 i_size_write(inode, size); 2302 i_size_write(inode, size);
2308 inode->i_nlink = 2; 2303 inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2366 ocfs2_init_dir_trailer(inode, new_bh, size); 2361 ocfs2_init_dir_trailer(inode, new_bh, size);
2367 } 2362 }
2368 2363
2369 status = ocfs2_journal_dirty(handle, new_bh); 2364 ocfs2_journal_dirty(handle, new_bh);
2370 if (status < 0) {
2371 mlog_errno(status);
2372 goto bail;
2373 }
2374 2365
2375 i_size_write(inode, inode->i_sb->s_blocksize); 2366 i_size_write(inode, inode->i_sb->s_blocksize);
2376 inode->i_nlink = 2; 2367 inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2404 int ret; 2395 int ret;
2405 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2406 u16 dr_suballoc_bit; 2397 u16 dr_suballoc_bit;
2407 u64 dr_blkno; 2398 u64 suballoc_loc, dr_blkno;
2408 unsigned int num_bits; 2399 unsigned int num_bits;
2409 struct buffer_head *dx_root_bh = NULL; 2400 struct buffer_head *dx_root_bh = NULL;
2410 struct ocfs2_dx_root_block *dx_root; 2401 struct ocfs2_dx_root_block *dx_root;
2411 struct ocfs2_dir_block_trailer *trailer = 2402 struct ocfs2_dir_block_trailer *trailer =
2412 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2413 2404
2414 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, 2405 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
2415 &num_bits, &dr_blkno); 2406 &dr_suballoc_bit, &num_bits, &dr_blkno);
2416 if (ret) { 2407 if (ret) {
2417 mlog_errno(ret); 2408 mlog_errno(ret);
2418 goto out; 2409 goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2431 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 2433 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2434 dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2435 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2436 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2437 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2458 dx_root->dr_list.l_count = 2450 dx_root->dr_list.l_count =
2459 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2451 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2460 } 2452 }
2461 2453 ocfs2_journal_dirty(handle, dx_root_bh);
2462 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2463 if (ret)
2464 mlog_errno(ret);
2465 2454
2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 2455 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2467 OCFS2_JOURNAL_ACCESS_CREATE); 2456 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2475 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2476 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2477 2466
2478 ret = ocfs2_journal_dirty(handle, di_bh); 2467 ocfs2_journal_dirty(handle, di_bh);
2479 if (ret)
2480 mlog_errno(ret);
2481 2468
2482 *ret_dx_root_bh = dx_root_bh; 2469 *ret_dx_root_bh = dx_root_bh;
2483 dx_root_bh = NULL; 2470 dx_root_bh = NULL;
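
The allocator calls in this file gain a suballoc_loc out-parameter, and the caller records it (dr_suballoc_loc above) so that freeing the block later can find its group without recomputing it. A stub-based sketch of the flow; every name below is illustrative rather than the kernel API:

#include <stdint.h>
#include <stdio.h>

struct new_block { uint64_t suballoc_loc; uint16_t suballoc_bit; uint64_t blkno; };

static int claim_metadata_stub(uint64_t *suballoc_loc, uint16_t *bit,
                               uint64_t *blkno)
{
        *suballoc_loc = 2048;   /* group descriptor block */
        *bit = 7;               /* bit within the group */
        *blkno = 2048 + 7;      /* the allocated block itself */
        return 0;
}

int main(void)
{
        struct new_block nb;

        if (claim_metadata_stub(&nb.suballoc_loc, &nb.suballoc_bit, &nb.blkno))
                return 1;
        /* The on-disk block remembers where it came from. */
        printf("loc=%llu bit=%u blk=%llu\n",
               (unsigned long long)nb.suballoc_loc, nb.suballoc_bit,
               (unsigned long long)nb.blkno);
        return 0;
}
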
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2558 * chance of contiguousness as the directory grows in number 2545 * chance of contiguousness as the directory grows in number
2559 * of entries. 2546 * of entries.
2560 */ 2547 */
2561 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); 2548 ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2562 if (ret) { 2549 if (ret) {
2563 mlog_errno(ret); 2550 mlog_errno(ret);
2564 goto out; 2551 goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2991 * if we only get one now, that's enough to continue. The rest 2978 * if we only get one now, that's enough to continue. The rest
2992 * will be claimed after the conversion to extents. 2979 * will be claimed after the conversion to extents.
2993 */ 2980 */
2994 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 2981 if (ocfs2_dir_resv_allowed(osb))
2982 data_ac->ac_resv = &oi->ip_la_data_resv;
2983 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2995 if (ret) { 2984 if (ret) {
2996 mlog_errno(ret); 2985 mlog_errno(ret);
2997 goto out_commit; 2986 goto out_commit;
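
Here and in ocfs2_extend_dir() below, the directory code attaches the inode's local-alloc reservation to the allocation context before claiming, biasing successive claims toward contiguity. A toy model of such a hint (the mechanics are illustrative; the real code lives in fs/ocfs2/reservations.c):

#include <stdio.h>

struct resv { unsigned next_hint; };
struct alloc_ctxt { struct resv *ac_resv; };

static unsigned claim_cluster(struct alloc_ctxt *ac, unsigned fallback)
{
        unsigned bit = ac->ac_resv ? ac->ac_resv->next_hint : fallback;

        if (ac->ac_resv)
                ac->ac_resv->next_hint = bit + 1; /* keep claims adjacent */
        return bit;
}

int main(void)
{
        struct resv r = { 100 };
        struct alloc_ctxt ac = { &r };

        /* Two consecutive claims land next to each other: 100 101 */
        printf("%u %u\n", claim_cluster(&ac, 0), claim_cluster(&ac, 0));
        return 0;
}
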
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3034 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 3023 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3035 } 3024 }
3036 3025
3037 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3026 ocfs2_journal_dirty(handle, dirdata_bh);
3038 if (ret) {
3039 mlog_errno(ret);
3040 goto out_commit;
3041 }
3042 3027
3043 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 3028 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3044 /* 3029 /*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3104 */ 3089 */
3105 dir->i_blocks = ocfs2_inode_sector_count(dir); 3090 dir->i_blocks = ocfs2_inode_sector_count(dir);
3106 3091
3107 ret = ocfs2_journal_dirty(handle, di_bh); 3092 ocfs2_journal_dirty(handle, di_bh);
3108 if (ret) {
3109 mlog_errno(ret);
3110 goto out_commit;
3111 }
3112 3093
3113 if (ocfs2_supports_indexed_dirs(osb)) { 3094 if (ocfs2_supports_indexed_dirs(osb)) {
3114 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 3095 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 * pass. Claim the 2nd cluster as a separate extent. 3119 * pass. Claim the 2nd cluster as a separate extent.
3139 */ 3120 */
3140 if (alloc > len) { 3121 if (alloc > len) {
3141 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 3122 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3142 &len); 3123 &len);
3143 if (ret) { 3124 if (ret) {
3144 mlog_errno(ret); 3125 mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3369 goto bail; 3350 goto bail;
3370 } 3351 }
3371 3352
3353 if (ocfs2_dir_resv_allowed(osb))
3354 data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
3355
3372 credits = ocfs2_calc_extend_credits(sb, el, 1); 3356 credits = ocfs2_calc_extend_credits(sb, el, 1);
3373 } else { 3357 } else {
3374 spin_unlock(&OCFS2_I(dir)->ip_lock); 3358 spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
3423 } else { 3407 } else {
3424 de->rec_len = cpu_to_le16(sb->s_blocksize); 3408 de->rec_len = cpu_to_le16(sb->s_blocksize);
3425 } 3409 }
3426 status = ocfs2_journal_dirty(handle, new_bh); 3410 ocfs2_journal_dirty(handle, new_bh);
3427 if (status < 0) {
3428 mlog_errno(status);
3429 goto bail;
3430 }
3431 3411
3432 dir_i_size += dir->i_sb->s_blocksize; 3412 dir_i_size += dir->i_sb->s_blocksize;
3433 i_size_write(dir, dir_i_size); 3413 i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3906 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, 3886 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3907 dx_leaf_sort_swap); 3887 dx_leaf_sort_swap);
3908 3888
3909 ret = ocfs2_journal_dirty(handle, dx_leaf_bh); 3889 ocfs2_journal_dirty(handle, dx_leaf_bh);
3910 if (ret) {
3911 mlog_errno(ret);
3912 goto out_commit;
3913 }
3914 3890
3915 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, 3891 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3916 &split_hash); 3892 &split_hash);
@@ -3955,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3955 goto out_commit; 3931 goto out_commit;
3956 } 3932 }
3957 3933
3934 cpos = split_hash;
3935 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3936 data_ac, meta_ac, new_dx_leaves,
3937 num_dx_leaves);
3938 if (ret) {
3939 mlog_errno(ret);
3940 goto out_commit;
3941 }
3942
3958 for (i = 0; i < num_dx_leaves; i++) { 3943 for (i = 0; i < num_dx_leaves; i++) {
3959 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), 3944 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3960 orig_dx_leaves[i], 3945 orig_dx_leaves[i],
@@ -3963,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3963 mlog_errno(ret); 3948 mlog_errno(ret);
3964 goto out_commit; 3949 goto out_commit;
3965 } 3950 }
3966 }
3967 3951
3968 cpos = split_hash; 3952 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3969 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, 3953 new_dx_leaves[i],
3970 data_ac, meta_ac, new_dx_leaves, 3954 OCFS2_JOURNAL_ACCESS_WRITE);
3971 num_dx_leaves); 3955 if (ret) {
3972 if (ret) { 3956 mlog_errno(ret);
3973 mlog_errno(ret); 3957 goto out_commit;
3974 goto out_commit; 3958 }
3975 } 3959 }
3976 3960
3977 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, 3961 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
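
This rebalance hunk reorders the leaf split: the new cluster is claimed before any leaf is journalled, and the original and new dx leaves are then journalled pairwise in a single loop instead of two passes. Reassembled from the right-hand column (a sketch; mlog_errno() calls omitted):

	cpos = split_hash;
	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
				       data_ac, meta_ac, new_dx_leaves,
				       num_dx_leaves);
	if (ret)
		goto out_commit;

	for (i = 0; i < num_dx_leaves; i++) {
		/* journal each old leaf together with its new twin */
		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
					      orig_dx_leaves[i],
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret)
			goto out_commit;

		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
					      new_dx_leaves[i],
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret)
			goto out_commit;
	}
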
@@ -4490,7 +4474,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4490 4474
4491 blk = le64_to_cpu(dx_root->dr_blkno); 4475 blk = le64_to_cpu(dx_root->dr_blkno);
4492 bit = le16_to_cpu(dx_root->dr_suballoc_bit); 4476 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4493 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 4477 if (dx_root->dr_suballoc_loc)
4478 bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4479 else
4480 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4494 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, 4481 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4495 bit, bg_blkno, 1); 4482 bit, bg_blkno, 1);
4496 if (ret) 4483 if (ret)
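
dx root blocks can now record the suballocator group they were carved from in dr_suballoc_loc, and the free path prefers that record over recomputing the group, which matters once block groups may be discontiguous. The lookup, mirroring the hunk above:

	blk = le64_to_cpu(dx_root->dr_blkno);
	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
	if (dx_root->dr_suballoc_loc)
		/* trust the recorded group block number */
		bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
	else
		/* legacy blocks: derive the group from blk and bit */
		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
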
@@ -4551,8 +4538,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4551 4538
4552 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); 4539 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4553 4540
4554 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 4541 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4555 &dealloc); 4542 &dealloc, 0);
4556 if (ret) { 4543 if (ret) {
4557 mlog_errno(ret); 4544 mlog_errno(ret);
4558 goto out; 4545 goto out;
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 12d5eb78a11a..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -88,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
88 return 0; 88 return 0;
89} 89}
90 90
91static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 mlog_entry_void();
94 94
@@ -145,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
145} 145}
146 146
147 147
148static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 149{
150 mlog_entry_void(); 150 mlog_entry_void();
151 151
@@ -451,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 452 lock->ml.node, &status);
453 if (ret < 0) 453 if (ret < 0)
454 mlog_errno(ret); 454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
456 lock->ml.node);
455 else { 457 else {
456 if (status == DLM_RECOVERING) { 458 if (status == DLM_RECOVERING) {
457 mlog(ML_ERROR, "sent AST to node %u, it thinks this " 459 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
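
This is the first instance of a pattern repeated across the dlm files below: a bare mlog_errno() after a failed o2net send is replaced by a message naming the errno, the message type, the domain key, and the target node, so a failing node pair can be identified from the log alone. The shape, as seen in this hunk:

	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec,
				     veclen, lock->ml.node, &status);
	if (ret < 0)
		/* say which message to which node failed; mlog_errno()
		 * reported only the errno */
		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x)"
		     " to node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
		     lock->ml.node);
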
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..765298908f1d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_SIZE_DEFAULT (1 << 14) 40#define DLM_HASH_SIZE_DEFAULT (1 << 17)
41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE 41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
42# define DLM_HASH_PAGES 1 42# define DLM_HASH_PAGES 1
43#else 43#else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
904 904
905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
908void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void dlm_do_local_ast(struct dlm_ctxt *dlm, 909void dlm_do_local_ast(struct dlm_ctxt *dlm,
908 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res,
909 struct dlm_lock *lock); 911 struct dlm_lock *lock);
@@ -1028,6 +1030,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
1028 struct dlm_lock_resource *res); 1030 struct dlm_lock_resource *res);
1029void dlm_clean_master_list(struct dlm_ctxt *dlm, 1031void dlm_clean_master_list(struct dlm_ctxt *dlm,
1030 u8 dead_node); 1032 u8 dead_node);
1033void dlm_force_free_mles(struct dlm_ctxt *dlm);
1031int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 1034int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
1032int __dlm_lockres_has_locks(struct dlm_lock_resource *res); 1035int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
1033int __dlm_lockres_unused(struct dlm_lock_resource *res); 1036int __dlm_lockres_unused(struct dlm_lock_resource *res);
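
Besides exporting __dlm_queue_ast()/__dlm_queue_bast() for the lock-shuffling rework in dlmthread.c and declaring dlm_force_free_mles() for domain teardown, this header grows the default lockres hash from 2^14 to 2^17 bytes. Assuming the existing DLM_HASH_* derivation that follows the changed line, the bucket count works out as:

	#define DLM_HASH_SIZE_DEFAULT	(1 << 17)	/* 128 KiB of heads */
	#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
	# define DLM_HASH_PAGES		1
	#else
	# define DLM_HASH_PAGES		(DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
	#endif
	#define DLM_BUCKETS_PER_PAGE	(PAGE_SIZE / sizeof(struct hlist_head))
	#define DLM_HASH_BUCKETS	(DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
	/* 4 KiB pages, 8-byte hlist_head on 64-bit:
	 * 32 pages * 512 = 16384 buckets */
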
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 90803b47cd8c..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -390,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) 390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
391 dlm_error(ret); 391 dlm_error(ret);
392 } else { 392 } else {
393 mlog_errno(tmpret); 393 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
394 "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
395 res->owner);
394 if (dlm_is_host_down(tmpret)) { 396 if (dlm_is_host_down(tmpret)) {
395 /* instead of logging the same network error over 397 /* instead of logging the same network error over
396 * and over, sleep here and wait for the heartbeat 398 * and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0cd24cf54396..901ca52bf86b 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -419,7 +419,7 @@ static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
419 419
420static int debug_buffer_release(struct inode *inode, struct file *file) 420static int debug_buffer_release(struct inode *inode, struct file *file)
421{ 421{
422 struct debug_buffer *db = (struct debug_buffer *)file->private_data; 422 struct debug_buffer *db = file->private_data;
423 423
424 if (db) 424 if (db)
425 kfree(db->buf); 425 kfree(db->buf);
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
636 spin_lock(&dlm->track_lock); 636 spin_lock(&dlm->track_lock);
637 if (oldres) 637 if (oldres)
638 track_list = &oldres->tracking; 638 track_list = &oldres->tracking;
639 else 639 else {
640 track_list = &dlm->tracking_list; 640 track_list = &dlm->tracking_list;
641 if (list_empty(track_list)) {
642 dl = NULL;
643 spin_unlock(&dlm->track_lock);
644 goto bail;
645 }
646 }
641 647
642 list_for_each_entry(res, track_list, tracking) { 648 list_for_each_entry(res, track_list, tracking) {
643 if (&res->tracking == &dlm->tracking_list) 649 if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
660 } else 666 } else
661 dl = NULL; 667 dl = NULL;
662 668
669bail:
663 /* passed to seq_show */ 670 /* passed to seq_show */
664 return dl; 671 return dl;
665} 672}
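
lockres_seq_start() previously ran list_for_each_entry() on a possibly empty tracking list and could end up treating the list head itself as a lockres; the new branch bails out with a NULL iterator while still under track_lock. Condensed from the hunk:

	spin_lock(&dlm->track_lock);
	if (oldres)
		track_list = &oldres->tracking;
	else {
		track_list = &dlm->tracking_list;
		if (list_empty(track_list)) {
			/* nothing tracked yet: unlock and hand
			 * seq_show a NULL entry */
			dl = NULL;
			spin_unlock(&dlm->track_lock);
			goto bail;
		}
	}
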
@@ -715,7 +722,7 @@ static int debug_lockres_open(struct inode *inode, struct file *file)
715 goto bail; 722 goto bail;
716 } 723 }
717 724
718 seq = (struct seq_file *) file->private_data; 725 seq = file->private_data;
719 seq->private = dl; 726 seq->private = dl;
720 727
721 dlm_grab(dlm); 728 dlm_grab(dlm);
@@ -731,7 +738,7 @@ bail:
731 738
732static int debug_lockres_release(struct inode *inode, struct file *file) 739static int debug_lockres_release(struct inode *inode, struct file *file)
733{ 740{
734 struct seq_file *seq = (struct seq_file *)file->private_data; 741 struct seq_file *seq = file->private_data;
735 struct debug_lockres *dl = (struct debug_lockres *)seq->private; 742 struct debug_lockres *dl = (struct debug_lockres *)seq->private;
736 743
737 if (dl->dl_res) 744 if (dl->dl_res)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4e..11a5c87fd7f7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
511 511
512 assert_spin_locked(&dlm->spinlock); 512 assert_spin_locked(&dlm->spinlock);
513 513
514 printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); 514 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
515 515
516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
517 node + 1)) < O2NM_MAX_NODES) { 517 node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
534 534
535 node = exit_msg->node_idx; 535 node = exit_msg->node_idx;
536 536
537 printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); 537 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
538 538
539 spin_lock(&dlm->spinlock); 539 spin_lock(&dlm->spinlock);
540 clear_bit(node, dlm->domain_map); 540 clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
566 &leave_msg, sizeof(leave_msg), node, 566 &leave_msg, sizeof(leave_msg), node,
567 NULL); 567 NULL);
568 568 if (status < 0)
569 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
570 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
569 mlog(0, "status return %d from o2net_send_message\n", status); 571 mlog(0, "status return %d from o2net_send_message\n", status);
570 572
571 return status; 573 return status;
@@ -691,6 +693,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
691 693
692 dlm_mark_domain_leaving(dlm); 694 dlm_mark_domain_leaving(dlm);
693 dlm_leave_domain(dlm); 695 dlm_leave_domain(dlm);
696 dlm_force_free_mles(dlm);
694 dlm_complete_dlm_shutdown(dlm); 697 dlm_complete_dlm_shutdown(dlm);
695 } 698 }
696 dlm_put(dlm); 699 dlm_put(dlm);
@@ -904,7 +907,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
904 set_bit(assert->node_idx, dlm->domain_map); 907 set_bit(assert->node_idx, dlm->domain_map);
905 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 908 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
906 909
907 printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", 910 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
908 assert->node_idx, dlm->name); 911 assert->node_idx, dlm->name);
909 __dlm_print_nodes(dlm); 912 __dlm_print_nodes(dlm);
910 913
@@ -962,7 +965,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
962 &cancel_msg, sizeof(cancel_msg), node, 965 &cancel_msg, sizeof(cancel_msg), node,
963 NULL); 966 NULL);
964 if (status < 0) { 967 if (status < 0) {
965 mlog_errno(status); 968 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
969 "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
970 node);
966 goto bail; 971 goto bail;
967 } 972 }
968 973
@@ -1029,10 +1034,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
1029 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); 1034 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
1030 1035
1031 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, 1036 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
1032 sizeof(join_msg), node, 1037 sizeof(join_msg), node, &join_resp);
1033 &join_resp);
1034 if (status < 0 && status != -ENOPROTOOPT) { 1038 if (status < 0 && status != -ENOPROTOOPT) {
1035 mlog_errno(status); 1039 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1040 "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1041 node);
1036 goto bail; 1042 goto bail;
1037 } 1043 }
1038 dlm_query_join_wire_to_packet(join_resp, &packet); 1044 dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1109,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
1103 &assert_msg, sizeof(assert_msg), node, 1109 &assert_msg, sizeof(assert_msg), node,
1104 NULL); 1110 NULL);
1105 if (status < 0) 1111 if (status < 0)
1106 mlog_errno(status); 1112 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1113 "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1114 node);
1107 1115
1108 return status; 1116 return status;
1109} 1117}
@@ -1516,7 +1524,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 goto leave; 1524 goto leave;
1517 } 1525 }
1518 1526
1519 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); 1527 dlm->name = kstrdup(domain, GFP_KERNEL);
1520 if (dlm->name == NULL) { 1528 if (dlm->name == NULL) {
1521 mlog_errno(-ENOMEM); 1529 mlog_errno(-ENOMEM);
1522 kfree(dlm); 1530 kfree(dlm);
@@ -1550,7 +1558,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 for (i = 0; i < DLM_HASH_BUCKETS; i++) 1558 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1551 INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); 1559 INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
1552 1560
1553 strcpy(dlm->name, domain);
1554 dlm->key = key; 1561 dlm->key = key;
1555 dlm->node_num = o2nm_this_node(); 1562 dlm->node_num = o2nm_this_node();
1556 1563
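
dlm->name switches from an open-coded kmalloc(strlen() + 1) followed, much later, by strcpy() to a single kstrdup(), removing the window in which dlm->name pointed at uninitialized memory. The idiom:

	/* allocate and copy in one step; NULL on allocation failure */
	dlm->name = kstrdup(domain, GFP_KERNEL);
	if (dlm->name == NULL) {
		mlog_errno(-ENOMEM);
		kfree(dlm);
		dlm = NULL;
		goto leave;
	}
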
@@ -1665,7 +1672,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
1665 struct dlm_ctxt *dlm = NULL; 1672 struct dlm_ctxt *dlm = NULL;
1666 struct dlm_ctxt *new_ctxt = NULL; 1673 struct dlm_ctxt *new_ctxt = NULL;
1667 1674
1668 if (strlen(domain) > O2NM_MAX_NAME_LEN) { 1675 if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
1669 ret = -ENAMETOOLONG; 1676 ret = -ENAMETOOLONG;
1670 mlog(ML_ERROR, "domain name length too long\n"); 1677 mlog(ML_ERROR, "domain name length too long\n");
1671 goto leave; 1678 goto leave;
@@ -1703,6 +1710,7 @@ retry:
1703 } 1710 }
1704 1711
1705 if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { 1712 if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
1713 spin_unlock(&dlm_domain_lock);
1706 mlog(ML_ERROR, 1714 mlog(ML_ERROR,
1707 "Requested locking protocol version is not " 1715 "Requested locking protocol version is not "
1708 "compatible with already registered domain " 1716 "compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 733337772671..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
329 BUG(); 329 BUG();
330 } 330 }
331 } else { 331 } else {
332 mlog_errno(tmpret); 332 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
333 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
334 res->owner);
333 if (dlm_is_host_down(tmpret)) { 335 if (dlm_is_host_down(tmpret)) {
334 ret = DLM_RECOVERING; 336 ret = DLM_RECOVERING;
335 mlog(0, "node %u died so returning DLM_RECOVERING " 337 mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
429 struct dlm_lock *lock; 431 struct dlm_lock *lock;
430 int kernel_allocated = 0; 432 int kernel_allocated = 0;
431 433
432 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); 434 lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
433 if (!lock) 435 if (!lock)
434 return NULL; 436 return NULL;
435 437
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9289b4357d27..f564b0e5f80d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
511 511
512 atomic_dec(&dlm->res_cur_count); 512 atomic_dec(&dlm->res_cur_count);
513 513
514 dlm_put(dlm);
515
516 if (!hlist_unhashed(&res->hash_node) || 514 if (!hlist_unhashed(&res->hash_node) ||
517 !list_empty(&res->granted) || 515 !list_empty(&res->granted) ||
518 !list_empty(&res->converting) || 516 !list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
585 res->migration_pending = 0; 583 res->migration_pending = 0;
586 res->inflight_locks = 0; 584 res->inflight_locks = 0;
587 585
588 /* put in dlm_lockres_release */
589 dlm_grab(dlm);
590 res->dlm = dlm; 586 res->dlm = dlm;
591 587
592 kref_init(&res->refs); 588 kref_init(&res->refs);
@@ -617,13 +613,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
617{ 613{
618 struct dlm_lock_resource *res = NULL; 614 struct dlm_lock_resource *res = NULL;
619 615
620 res = (struct dlm_lock_resource *) 616 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
621 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
622 if (!res) 617 if (!res)
623 goto error; 618 goto error;
624 619
625 res->lockname.name = (char *) 620 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
626 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
627 if (!res->lockname.name) 621 if (!res->lockname.name)
628 goto error; 622 goto error;
629 623
@@ -757,8 +751,7 @@ lookup:
757 spin_unlock(&dlm->spinlock); 751 spin_unlock(&dlm->spinlock);
758 mlog(0, "allocating a new resource\n"); 752 mlog(0, "allocating a new resource\n");
759 /* nothing found and we need to allocate one. */ 753 /* nothing found and we need to allocate one. */
760 alloc_mle = (struct dlm_master_list_entry *) 754 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
761 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
762 if (!alloc_mle) 755 if (!alloc_mle)
763 goto leave; 756 goto leave;
764 res = dlm_new_lockres(dlm, lockid, namelen); 757 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1535,7 @@ way_up_top:
1542 spin_unlock(&dlm->master_lock); 1535 spin_unlock(&dlm->master_lock);
1543 spin_unlock(&dlm->spinlock); 1536 spin_unlock(&dlm->spinlock);
1544 1537
1545 mle = (struct dlm_master_list_entry *) 1538 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1546 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1547 if (!mle) { 1539 if (!mle) {
1548 response = DLM_MASTER_RESP_ERROR; 1540 response = DLM_MASTER_RESP_ERROR;
1549 mlog_errno(-ENOMEM); 1541 mlog_errno(-ENOMEM);
@@ -1666,7 +1658,9 @@ again:
1666 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1658 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1667 &assert, sizeof(assert), to, &r); 1659 &assert, sizeof(assert), to, &r);
1668 if (tmpret < 0) { 1660 if (tmpret < 0) {
1669 mlog(0, "assert_master returned %d!\n", tmpret); 1661 mlog(ML_ERROR, "Error %d when sending message %u (key "
1662 "0x%x) to node %u\n", tmpret,
1663 DLM_ASSERT_MASTER_MSG, dlm->key, to);
1670 if (!dlm_is_host_down(tmpret)) { 1664 if (!dlm_is_host_down(tmpret)) {
1671 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 1665 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1672 BUG(); 1666 BUG();
@@ -2205,7 +2199,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2205 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2199 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2206 &deref, sizeof(deref), res->owner, &r); 2200 &deref, sizeof(deref), res->owner, &r);
2207 if (ret < 0) 2201 if (ret < 0)
2208 mlog_errno(ret); 2202 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
2203 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
2204 res->owner);
2209 else if (r < 0) { 2205 else if (r < 0) {
2210 /* BAD. other node says I did not have a ref. */ 2206 /* BAD. other node says I did not have a ref. */
2211 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2207 mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2452,8 +2448,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2452 goto leave; 2448 goto leave;
2453 } 2449 }
2454 2450
2455 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2451 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2456 GFP_NOFS);
2457 if (!mle) { 2452 if (!mle) {
2458 mlog_errno(ret); 2453 mlog_errno(ret);
2459 goto leave; 2454 goto leave;
@@ -2809,14 +2804,8 @@ again:
2809 mlog(0, "trying again...\n"); 2804 mlog(0, "trying again...\n");
2810 goto again; 2805 goto again;
2811 } 2806 }
2812 /* now that we are sure the MIGRATING state is there, drop
2813	 * the unneeded state which blocked threads trying to DIRTY */
2814 spin_lock(&res->spinlock);
2815 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2816 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2817 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2818 spin_unlock(&res->spinlock);
2819 2807
2808 ret = 0;
2820 /* did the target go down or die? */ 2809 /* did the target go down or die? */
2821 spin_lock(&dlm->spinlock); 2810 spin_lock(&dlm->spinlock);
2822 if (!test_bit(target, dlm->domain_map)) { 2811 if (!test_bit(target, dlm->domain_map)) {
@@ -2827,9 +2816,21 @@ again:
2827 spin_unlock(&dlm->spinlock); 2816 spin_unlock(&dlm->spinlock);
2828 2817
2829 /* 2818 /*
2819 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
2820 * another try; otherwise, we are sure the MIGRATING state is there,
 2821	 * drop the unneeded state which blocked threads trying to DIRTY
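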
2822 */
2823 spin_lock(&res->spinlock);
2824 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2825 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2826 if (!ret)
2827 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2828 spin_unlock(&res->spinlock);
2829
2830 /*
2830 * at this point: 2831 * at this point:
2831 * 2832 *
2832 * o the DLM_LOCK_RES_MIGRATING flag is set 2833 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
2833 * o there are no pending asts on this lockres 2834 * o there are no pending asts on this lockres
2834 * o all processes trying to reserve an ast on this 2835 * o all processes trying to reserve an ast on this
2835 * lockres must wait for the MIGRATING flag to clear 2836 * lockres must wait for the MIGRATING flag to clear
@@ -2975,7 +2976,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2975 &migrate, sizeof(migrate), nodenum, 2976 &migrate, sizeof(migrate), nodenum,
2976 &status); 2977 &status);
2977 if (ret < 0) { 2978 if (ret < 0) {
2978 mlog(0, "migrate_request returned %d!\n", ret); 2979 mlog(ML_ERROR, "Error %d when sending message %u (key "
2980 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
2981 dlm->key, nodenum);
2979 if (!dlm_is_host_down(ret)) { 2982 if (!dlm_is_host_down(ret)) {
2980 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2983 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2981 BUG(); 2984 BUG();
@@ -3033,8 +3036,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3033 hash = dlm_lockid_hash(name, namelen); 3036 hash = dlm_lockid_hash(name, namelen);
3034 3037
3035 /* preallocate.. if this fails, abort */ 3038 /* preallocate.. if this fails, abort */
3036 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 3039 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3037 GFP_NOFS);
3038 3040
3039 if (!mle) { 3041 if (!mle) {
3040 ret = -ENOMEM; 3042 ret = -ENOMEM;
@@ -3044,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3044 /* check for pre-existing lock */ 3046 /* check for pre-existing lock */
3045 spin_lock(&dlm->spinlock); 3047 spin_lock(&dlm->spinlock);
3046 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 3048 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3047 spin_lock(&dlm->master_lock);
3048
3049 if (res) { 3049 if (res) {
3050 spin_lock(&res->spinlock); 3050 spin_lock(&res->spinlock);
3051 if (res->state & DLM_LOCK_RES_RECOVERING) { 3051 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3063,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3063 spin_unlock(&res->spinlock); 3063 spin_unlock(&res->spinlock);
3064 } 3064 }
3065 3065
3066 spin_lock(&dlm->master_lock);
3066 /* ignore status. only nonzero status would BUG. */ 3067 /* ignore status. only nonzero status would BUG. */
3067 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, 3068 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3068 name, namelen, 3069 name, namelen,
3069 migrate->new_master, 3070 migrate->new_master,
3070 migrate->master); 3071 migrate->master);
3071 3072
3072unlock:
3073 spin_unlock(&dlm->master_lock); 3073 spin_unlock(&dlm->master_lock);
3074unlock:
3074 spin_unlock(&dlm->spinlock); 3075 spin_unlock(&dlm->spinlock);
3075 3076
3076 if (oldmle) { 3077 if (oldmle) {
@@ -3432,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3432 wake_up(&res->wq); 3433 wake_up(&res->wq);
3433 wake_up(&dlm->migration_wq); 3434 wake_up(&dlm->migration_wq);
3434} 3435}
3436
3437void dlm_force_free_mles(struct dlm_ctxt *dlm)
3438{
3439 int i;
3440 struct hlist_head *bucket;
3441 struct dlm_master_list_entry *mle;
3442 struct hlist_node *tmp, *list;
3443
3444 /*
3445 * We notified all other nodes that we are exiting the domain and
3446 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3447 * around we force free them and wake any processes that are waiting
3448 * on the mles
3449 */
3450 spin_lock(&dlm->spinlock);
3451 spin_lock(&dlm->master_lock);
3452
3453 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3454 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3455
3456 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3457 bucket = dlm_master_hash(dlm, i);
3458 hlist_for_each_safe(list, tmp, bucket) {
3459 mle = hlist_entry(list, struct dlm_master_list_entry,
3460 master_hash_node);
3461 if (mle->type != DLM_MLE_BLOCK) {
3462 mlog(ML_ERROR, "bad mle: %p\n", mle);
3463 dlm_print_one_mle(mle);
3464 }
3465 atomic_set(&mle->woken, 1);
3466 wake_up(&mle->wq);
3467
3468 __dlm_unlink_mle(dlm, mle);
3469 __dlm_mle_detach_hb_events(dlm, mle);
3470 __dlm_put_mle(mle);
3471 }
3472 }
3473 spin_unlock(&dlm->master_lock);
3474 spin_unlock(&dlm->spinlock);
3475}
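
dlm_force_free_mles() plugs a teardown leak: block MLEs created by lock requests that raced with the shutdown would otherwise linger with waiters parked on them. Its single call site is the dlmdomain.c hunk earlier in this diff:

	dlm_mark_domain_leaving(dlm);
	dlm_leave_domain(dlm);
	/* all remote nodes have acked our exit, so any MLE still
	 * hashed is stale: wake its waiters and drop it */
	dlm_force_free_mles(dlm);
	dlm_complete_dlm_shutdown(dlm);
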
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf3..aaaffbcbe916 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
463 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { 463 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
464 int bit; 464 int bit;
465 465
466 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); 466 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
467 if (bit >= O2NM_MAX_NODES || bit < 0) 467 if (bit >= O2NM_MAX_NODES || bit < 0)
468 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); 468 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
469 else 469 else
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
803 803
804 /* negative status is handled by caller */ 804 /* negative status is handled by caller */
805 if (ret < 0) 805 if (ret < 0)
806 mlog_errno(ret); 806 mlog(ML_ERROR, "Error %d when sending message %u (key "
807 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
808 dlm->key, request_from);
807 809
808 // return from here, then 810 // return from here, then
809 // sleep until all received or error 811 // sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
955 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 957 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
956 sizeof(done_msg), send_to, &tmpret); 958 sizeof(done_msg), send_to, &tmpret);
957 if (ret < 0) { 959 if (ret < 0) {
960 mlog(ML_ERROR, "Error %d when sending message %u (key "
961 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
962 dlm->key, send_to);
958 if (!dlm_is_host_down(ret)) { 963 if (!dlm_is_host_down(ret)) {
959 mlog_errno(ret);
960 mlog(ML_ERROR, "%s: unknown error sending data-done "
961 "to %u\n", dlm->name, send_to);
962 BUG(); 964 BUG();
963 } 965 }
964 } else 966 } else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1126 if (ret < 0) { 1128 if (ret < 0) {
1127 /* XXX: negative status is not handled. 1129 /* XXX: negative status is not handled.
1128 * this will end up killing this node. */ 1130 * this will end up killing this node. */
1129 mlog_errno(ret); 1131 mlog(ML_ERROR, "Error %d when sending message %u (key "
1132 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
1133 dlm->key, send_to);
1130 } else { 1134 } else {
1131 /* might get an -ENOMEM back here */ 1135 /* might get an -ENOMEM back here */
1132 ret = status; 1136 ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1642 &req, sizeof(req), nodenum, &status); 1646 &req, sizeof(req), nodenum, &status);
1643 /* XXX: negative status not handled properly here. */ 1647 /* XXX: negative status not handled properly here. */
1644 if (ret < 0) 1648 if (ret < 0)
1645 mlog_errno(ret); 1649 mlog(ML_ERROR, "Error %d when sending message %u (key "
1650 "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
1651 dlm->key, nodenum);
1646 else { 1652 else {
1647 BUG_ON(status < 0); 1653 BUG_ON(status < 0);
1648 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); 1654 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1991,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1991 struct list_head *queue; 1997 struct list_head *queue;
1992 struct dlm_lock *lock, *next; 1998 struct dlm_lock *lock, *next;
1993 1999
2000 assert_spin_locked(&dlm->spinlock);
2001 assert_spin_locked(&res->spinlock);
1994 res->state |= DLM_LOCK_RES_RECOVERING; 2002 res->state |= DLM_LOCK_RES_RECOVERING;
1995 if (!list_empty(&res->recovering)) { 2003 if (!list_empty(&res->recovering)) {
1996 mlog(0, 2004 mlog(0,
@@ -2320,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2320 /* zero the lvb if necessary */ 2328 /* zero the lvb if necessary */
2321 dlm_revalidate_lvb(dlm, res, dead_node); 2329 dlm_revalidate_lvb(dlm, res, dead_node);
2322 if (res->owner == dead_node) { 2330 if (res->owner == dead_node) {
2323 if (res->state & DLM_LOCK_RES_DROPPING_REF) 2331 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2324 mlog(0, "%s:%.*s: owned by " 2332 mlog(ML_NOTICE, "Ignore %.*s for "
2325 "dead node %u, this node was " 2333 "recovery as it is being freed\n",
2326 "dropping its ref when it died. " 2334 res->lockname.len,
2327 "continue, dropping the flag.\n", 2335 res->lockname.name);
2328 dlm->name, res->lockname.len, 2336 } else
2329 res->lockname.name, dead_node); 2337 dlm_move_lockres_to_recovery_list(dlm,
2330 2338 res);
2331 /* the wake_up for this will happen when the
2332 * RECOVERING flag is dropped later */
2333 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2334 2339
2335 dlm_move_lockres_to_recovery_list(dlm, res);
2336 } else if (res->owner == dlm->node_num) { 2340 } else if (res->owner == dlm->node_num) {
2337 dlm_free_dead_locks(dlm, res, dead_node); 2341 dlm_free_dead_locks(dlm, res, dead_node);
2338 __dlm_lockres_calc_usage(dlm, res); 2342 __dlm_lockres_calc_usage(dlm, res);
@@ -2640,7 +2644,7 @@ retry:
2640 if (dlm_is_host_down(ret)) { 2644 if (dlm_is_host_down(ret)) {
2641 /* node is down. not involved in recovery 2645 /* node is down. not involved in recovery
2642 * so just keep going */ 2646 * so just keep going */
2643 mlog(0, "%s: node %u was down when sending " 2647 mlog(ML_NOTICE, "%s: node %u was down when sending "
2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2648 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2645 ret = 0; 2649 ret = 0;
2646 } 2650 }
@@ -2660,11 +2664,12 @@ retry:
2660 } 2664 }
2661 if (ret < 0) { 2665 if (ret < 0) {
2662 struct dlm_lock_resource *res; 2666 struct dlm_lock_resource *res;
2667
2663 /* this is now a serious problem, possibly ENOMEM 2668 /* this is now a serious problem, possibly ENOMEM
2664 * in the network stack. must retry */ 2669 * in the network stack. must retry */
2665 mlog_errno(ret); 2670 mlog_errno(ret);
2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2671 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
2667 " returned %d\n", dlm->name, nodenum, ret); 2672 "returned %d\n", dlm->name, nodenum, ret);
2668 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, 2673 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2669 DLM_RECOVERY_LOCK_NAME_LEN); 2674 DLM_RECOVERY_LOCK_NAME_LEN);
2670 if (res) { 2675 if (res) {
@@ -2789,7 +2794,9 @@ stage2:
2789 if (ret >= 0) 2794 if (ret >= 0)
2790 ret = status; 2795 ret = status;
2791 if (ret < 0) { 2796 if (ret < 0) {
2792 mlog_errno(ret); 2797 mlog(ML_ERROR, "Error %d when sending message %u (key "
2798 "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
2799 dlm->key, nodenum);
2793 if (dlm_is_host_down(ret)) { 2800 if (dlm_is_host_down(ret)) {
2794 /* this has no effect on this recovery 2801 /* this has no effect on this recovery
2795 * session, so set the status to zero to 2802 * session, so set the status to zero to
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 11a6d1fd1d35..2211acf33d9b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
92 * truly ready to be freed. */ 92 * truly ready to be freed. */
93int __dlm_lockres_unused(struct dlm_lock_resource *res) 93int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 if (!__dlm_lockres_has_locks(res) && 95 int bit;
96 (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { 96
97 /* try not to scan the bitmap unless the first two 97 if (__dlm_lockres_has_locks(res))
98 * conditions are already true */ 98 return 0;
99 int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 99
100 if (bit >= O2NM_MAX_NODES) { 100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 /* since the bit for dlm->node_num is not 101 return 0;
102 * set, inflight_locks better be zero */ 102
103 BUG_ON(res->inflight_locks != 0); 103 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 1; 104 return 0;
105 } 105
106 } 106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 return 0; 107 if (bit < O2NM_MAX_NODES)
108 return 0;
109
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1;
108} 116}
109 117
110 118
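
__dlm_lockres_unused() is rewritten from one nested condition into a chain of early returns, and gains one genuinely new check: a lockres in recovery now counts as busy. Flattened, the new body reads:

	int __dlm_lockres_unused(struct dlm_lock_resource *res)
	{
		int bit;

		if (__dlm_lockres_has_locks(res))
			return 0;	/* locks still attached */

		if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
			return 0;	/* shuffle still pending */

		if (res->state & DLM_LOCK_RES_RECOVERING)
			return 0;	/* new: mid-recovery */

		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES)
			return 0;	/* remote references remain */

		/* our own refmap bit is clear, so nothing is in flight */
		BUG_ON(res->inflight_locks != 0);
		return 1;
	}
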
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
152 spin_unlock(&dlm->spinlock); 160 spin_unlock(&dlm->spinlock);
153} 161}
154 162
155static int dlm_purge_lockres(struct dlm_ctxt *dlm, 163static void dlm_purge_lockres(struct dlm_ctxt *dlm,
156 struct dlm_lock_resource *res) 164 struct dlm_lock_resource *res)
157{ 165{
158 int master; 166 int master;
159 int ret = 0; 167 int ret = 0;
160 168
161 spin_lock(&res->spinlock); 169 assert_spin_locked(&dlm->spinlock);
162 if (!__dlm_lockres_unused(res)) { 170 assert_spin_locked(&res->spinlock);
163 mlog(0, "%s:%.*s: tried to purge but not unused\n",
164 dlm->name, res->lockname.len, res->lockname.name);
165 __dlm_print_one_lock_resource(res);
166 spin_unlock(&res->spinlock);
167 BUG();
168 }
169
170 if (res->state & DLM_LOCK_RES_MIGRATING) {
171 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
172 "being remastered\n", dlm->name, res->lockname.len,
173 res->lockname.name);
174 /* Re-add the lockres to the end of the purge list */
175 if (!list_empty(&res->purge)) {
176 list_del_init(&res->purge);
177 list_add_tail(&res->purge, &dlm->purge_list);
178 }
179 spin_unlock(&res->spinlock);
180 return 0;
181 }
182 171
183 master = (res->owner == dlm->node_num); 172 master = (res->owner == dlm->node_num);
184 173
185 if (!master)
186 res->state |= DLM_LOCK_RES_DROPPING_REF;
187 spin_unlock(&res->spinlock);
188 174
189 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
190 res->lockname.name, master); 176 res->lockname.name, master);
191 177
192 if (!master) { 178 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF;
193 /* drop spinlock... retake below */ 180 /* drop spinlock... retake below */
181 spin_unlock(&res->spinlock);
194 spin_unlock(&dlm->spinlock); 182 spin_unlock(&dlm->spinlock);
195 183
196 spin_lock(&res->spinlock); 184 spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
208 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", 196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
209 dlm->name, res->lockname.len, res->lockname.name, ret); 197 dlm->name, res->lockname.len, res->lockname.name, ret);
210 spin_lock(&dlm->spinlock); 198 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock);
211 } 200 }
212 201
213 spin_lock(&res->spinlock);
214 if (!list_empty(&res->purge)) { 202 if (!list_empty(&res->purge)) {
215 mlog(0, "removing lockres %.*s:%p from purgelist, " 203 mlog(0, "removing lockres %.*s:%p from purgelist, "
216 "master = %d\n", res->lockname.len, res->lockname.name, 204 "master = %d\n", res->lockname.len, res->lockname.name,
217 res, master); 205 res, master);
218 list_del_init(&res->purge); 206 list_del_init(&res->purge);
219 spin_unlock(&res->spinlock);
220 dlm_lockres_put(res); 207 dlm_lockres_put(res);
221 dlm->purge_count--; 208 dlm->purge_count--;
222 } else 209 }
223 spin_unlock(&res->spinlock); 210
211 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res);
215 BUG();
216 }
224 217
225 __dlm_unhash_lockres(res); 218 __dlm_unhash_lockres(res);
226 219
227 /* lockres is not in the hash now. drop the flag and wake up 220 /* lockres is not in the hash now. drop the flag and wake up
228 * any processes waiting in dlm_get_lock_resource. */ 221 * any processes waiting in dlm_get_lock_resource. */
229 if (!master) { 222 if (!master) {
230 spin_lock(&res->spinlock);
231 res->state &= ~DLM_LOCK_RES_DROPPING_REF; 223 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
232 spin_unlock(&res->spinlock); 224 spin_unlock(&res->spinlock);
233 wake_up(&res->wq); 225 wake_up(&res->wq);
234 } 226 } else
235 return 0; 227 spin_unlock(&res->spinlock);
236} 228}
237 229
238static void dlm_run_purge_list(struct dlm_ctxt *dlm, 230static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
251 lockres = list_entry(dlm->purge_list.next, 243 lockres = list_entry(dlm->purge_list.next,
252 struct dlm_lock_resource, purge); 244 struct dlm_lock_resource, purge);
253 245
254 /* Status of the lockres *might* change so double
255 * check. If the lockres is unused, holding the dlm
256 * spinlock will prevent people from getting and more
257 * refs on it -- there's no need to keep the lockres
258 * spinlock. */
259 spin_lock(&lockres->spinlock); 246 spin_lock(&lockres->spinlock);
260 unused = __dlm_lockres_unused(lockres);
261 spin_unlock(&lockres->spinlock);
262
263 if (!unused)
264 continue;
265 247
266 purge_jiffies = lockres->last_used + 248 purge_jiffies = lockres->last_used +
267 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); 249 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
273 * in tail order, we can stop at the first 255 * in tail order, we can stop at the first
274 * unpurgable resource -- anyone added after 256 * unpurgable resource -- anyone added after
275 * him will have a greater last_used value */ 257 * him will have a greater last_used value */
258 spin_unlock(&lockres->spinlock);
276 break; 259 break;
277 } 260 }
278 261
262 /* Status of the lockres *might* change so double
263 * check. If the lockres is unused, holding the dlm
 264	 * spinlock will prevent people from getting any more
265 * refs on it. */
266 unused = __dlm_lockres_unused(lockres);
267 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or "
270 "being remastered, used %d, state %d\n",
271 dlm->name, lockres->lockname.len,
272 lockres->lockname.name, !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock);
275 continue;
276 }
277
279 dlm_lockres_get(lockres); 278 dlm_lockres_get(lockres);
280 279
281 /* This may drop and reacquire the dlm spinlock if it 280 dlm_purge_lockres(dlm, lockres);
282 * has to do migration. */
283 if (dlm_purge_lockres(dlm, lockres))
284 BUG();
285 281
286 dlm_lockres_put(lockres); 282 dlm_lockres_put(lockres);
287 283
@@ -309,6 +305,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
309 * spinlock, and because we know that it is not migrating/ 305 * spinlock, and because we know that it is not migrating/
310 * recovering/in-progress, it is fine to reserve asts and 306 * recovering/in-progress, it is fine to reserve asts and
311 * basts right before queueing them all throughout */ 307 * basts right before queueing them all throughout */
308 assert_spin_locked(&dlm->ast_lock);
312 assert_spin_locked(&res->spinlock); 309 assert_spin_locked(&res->spinlock);
313 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 310 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
314 DLM_LOCK_RES_RECOVERING| 311 DLM_LOCK_RES_RECOVERING|
@@ -337,7 +334,7 @@ converting:
337 /* queue the BAST if not already */ 334 /* queue the BAST if not already */
338 if (lock->ml.highest_blocked == LKM_IVMODE) { 335 if (lock->ml.highest_blocked == LKM_IVMODE) {
339 __dlm_lockres_reserve_ast(res); 336 __dlm_lockres_reserve_ast(res);
340 dlm_queue_bast(dlm, lock); 337 __dlm_queue_bast(dlm, lock);
341 } 338 }
342 /* update the highest_blocked if needed */ 339 /* update the highest_blocked if needed */
343 if (lock->ml.highest_blocked < target->ml.convert_type) 340 if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -355,7 +352,7 @@ converting:
355 can_grant = 0; 352 can_grant = 0;
356 if (lock->ml.highest_blocked == LKM_IVMODE) { 353 if (lock->ml.highest_blocked == LKM_IVMODE) {
357 __dlm_lockres_reserve_ast(res); 354 __dlm_lockres_reserve_ast(res);
358 dlm_queue_bast(dlm, lock); 355 __dlm_queue_bast(dlm, lock);
359 } 356 }
360 if (lock->ml.highest_blocked < target->ml.convert_type) 357 if (lock->ml.highest_blocked < target->ml.convert_type)
361 lock->ml.highest_blocked = 358 lock->ml.highest_blocked =
@@ -383,7 +380,7 @@ converting:
383 spin_unlock(&target->spinlock); 380 spin_unlock(&target->spinlock);
384 381
385 __dlm_lockres_reserve_ast(res); 382 __dlm_lockres_reserve_ast(res);
386 dlm_queue_ast(dlm, target); 383 __dlm_queue_ast(dlm, target);
387 /* go back and check for more */ 384 /* go back and check for more */
388 goto converting; 385 goto converting;
389 } 386 }
@@ -402,7 +399,7 @@ blocked:
402 can_grant = 0; 399 can_grant = 0;
403 if (lock->ml.highest_blocked == LKM_IVMODE) { 400 if (lock->ml.highest_blocked == LKM_IVMODE) {
404 __dlm_lockres_reserve_ast(res); 401 __dlm_lockres_reserve_ast(res);
405 dlm_queue_bast(dlm, lock); 402 __dlm_queue_bast(dlm, lock);
406 } 403 }
407 if (lock->ml.highest_blocked < target->ml.type) 404 if (lock->ml.highest_blocked < target->ml.type)
408 lock->ml.highest_blocked = target->ml.type; 405 lock->ml.highest_blocked = target->ml.type;
@@ -418,7 +415,7 @@ blocked:
418 can_grant = 0; 415 can_grant = 0;
419 if (lock->ml.highest_blocked == LKM_IVMODE) { 416 if (lock->ml.highest_blocked == LKM_IVMODE) {
420 __dlm_lockres_reserve_ast(res); 417 __dlm_lockres_reserve_ast(res);
421 dlm_queue_bast(dlm, lock); 418 __dlm_queue_bast(dlm, lock);
422 } 419 }
423 if (lock->ml.highest_blocked < target->ml.type) 420 if (lock->ml.highest_blocked < target->ml.type)
424 lock->ml.highest_blocked = target->ml.type; 421 lock->ml.highest_blocked = target->ml.type;
@@ -444,7 +441,7 @@ blocked:
444 spin_unlock(&target->spinlock); 441 spin_unlock(&target->spinlock);
445 442
446 __dlm_lockres_reserve_ast(res); 443 __dlm_lockres_reserve_ast(res);
447 dlm_queue_ast(dlm, target); 444 __dlm_queue_ast(dlm, target);
448 /* go back and check for more */ 445 /* go back and check for more */
449 goto converting; 446 goto converting;
450 } 447 }
@@ -674,6 +671,7 @@ static int dlm_thread(void *data)
674 /* lockres can be re-dirtied/re-added to the 671 /* lockres can be re-dirtied/re-added to the
675 * dirty_list in this gap, but that is ok */ 672 * dirty_list in this gap, but that is ok */
676 673
674 spin_lock(&dlm->ast_lock);
677 spin_lock(&res->spinlock); 675 spin_lock(&res->spinlock);
678 if (res->owner != dlm->node_num) { 676 if (res->owner != dlm->node_num) {
679 __dlm_print_one_lock_resource(res); 677 __dlm_print_one_lock_resource(res);
@@ -694,6 +692,7 @@ static int dlm_thread(void *data)
694 /* move it to the tail and keep going */ 692 /* move it to the tail and keep going */
695 res->state &= ~DLM_LOCK_RES_DIRTY; 693 res->state &= ~DLM_LOCK_RES_DIRTY;
696 spin_unlock(&res->spinlock); 694 spin_unlock(&res->spinlock);
695 spin_unlock(&dlm->ast_lock);
697 mlog(0, "delaying list shuffling for in-" 696 mlog(0, "delaying list shuffling for in-"
698 "progress lockres %.*s, state=%d\n", 697 "progress lockres %.*s, state=%d\n",
699 res->lockname.len, res->lockname.name, 698 res->lockname.len, res->lockname.name,
@@ -715,6 +714,7 @@ static int dlm_thread(void *data)
715 dlm_shuffle_lists(dlm, res); 714 dlm_shuffle_lists(dlm, res);
716 res->state &= ~DLM_LOCK_RES_DIRTY; 715 res->state &= ~DLM_LOCK_RES_DIRTY;
717 spin_unlock(&res->spinlock); 716 spin_unlock(&res->spinlock);
717 spin_unlock(&dlm->ast_lock);
718 718
719 dlm_lockres_calc_usage(dlm, res); 719 dlm_lockres_calc_usage(dlm, res);
720 720
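
The dlm_thread() hunks above establish the lock ordering that lets dlm_shuffle_lists() call the unlocked __dlm_queue_ast()/__dlm_queue_bast() variants: dlm->ast_lock is taken before res->spinlock and held across the whole shuffle, so reserving and queueing an AST is atomic with the list walk. The skeleton (a sketch; the body between the locks is unchanged):

	spin_lock(&dlm->ast_lock);	/* new outer lock */
	spin_lock(&res->spinlock);

	dlm_shuffle_lists(dlm, res);	/* asserts both locks held */
	res->state &= ~DLM_LOCK_RES_DIRTY;

	spin_unlock(&res->spinlock);
	spin_unlock(&dlm->ast_lock);	/* early exits unlock in the same order */
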
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index b47c1b92b82b..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -354,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
354 mlog(0, "master was in-progress. retry\n"); 354 mlog(0, "master was in-progress. retry\n");
355 ret = status; 355 ret = status;
356 } else { 356 } else {
357 mlog_errno(tmpret); 357 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
358 "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
358 if (dlm_is_host_down(tmpret)) { 359 if (dlm_is_host_down(tmpret)) {
359 /* NOTE: this seems strange, but it is what we want. 360 /* NOTE: this seems strange, but it is what we want.
360 * when the master goes down during a cancel or 361 * when the master goes down during a cancel or
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b83d6107a1f5..c2903b84bb7a 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -182,8 +182,7 @@ static int dlmfs_file_release(struct inode *inode,
182{ 182{
183 int level, status; 183 int level, status;
184 struct dlmfs_inode_private *ip = DLMFS_I(inode); 184 struct dlmfs_inode_private *ip = DLMFS_I(inode);
185 struct dlmfs_filp_private *fp = 185 struct dlmfs_filp_private *fp = file->private_data;
186 (struct dlmfs_filp_private *) file->private_data;
187 186
188 if (S_ISDIR(inode->i_mode)) 187 if (S_ISDIR(inode->i_mode))
189 BUG(); 188 BUG();
@@ -214,10 +213,12 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
214 213
215 attr->ia_valid &= ~ATTR_SIZE; 214 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr); 215 error = inode_change_ok(inode, attr);
217 if (!error) 216 if (error)
218 error = inode_setattr(inode, attr); 217 return error;
219 218
220 return error; 219 setattr_copy(inode, attr);
220 mark_inode_dirty(inode);
221 return 0;
221} 222}
222 223
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) 224static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
@@ -355,13 +356,12 @@ static void dlmfs_destroy_inode(struct inode *inode)
355 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); 356 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
356} 357}
357 358
358static void dlmfs_clear_inode(struct inode *inode) 359static void dlmfs_evict_inode(struct inode *inode)
359{ 360{
360 int status; 361 int status;
361 struct dlmfs_inode_private *ip; 362 struct dlmfs_inode_private *ip;
362 363
363 if (!inode) 364 end_writeback(inode);
364 return;
365 365
366 mlog(0, "inode %lu\n", inode->i_ino); 366 mlog(0, "inode %lu\n", inode->i_ino);
367 367
@@ -631,7 +631,7 @@ static const struct super_operations dlmfs_ops = {
631 .statfs = simple_statfs, 631 .statfs = simple_statfs,
632 .alloc_inode = dlmfs_alloc_inode, 632 .alloc_inode = dlmfs_alloc_inode,
633 .destroy_inode = dlmfs_destroy_inode, 633 .destroy_inode = dlmfs_destroy_inode,
634 .clear_inode = dlmfs_clear_inode, 634 .evict_inode = dlmfs_evict_inode,
635 .drop_inode = generic_delete_inode, 635 .drop_inode = generic_delete_inode,
636}; 636};
637 637
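
dlmfs follows the 2.6.36 VFS conversion: ->clear_inode becomes ->evict_inode, which must call end_writeback() itself and is never handed a NULL inode, and ->setattr replaces inode_setattr() with setattr_copy() plus mark_inode_dirty(). The resulting setattr, condensed from the hunk:

	static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		attr->ia_valid &= ~ATTR_SIZE;	/* size changes unsupported */
		error = inode_change_ok(inode, attr);
		if (error)
			return error;

		/* copy the validated attributes; writeback persists them */
		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}
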
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50c4ee805da4..5e02a893f46e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2966,7 +2966,7 @@ static const struct seq_operations ocfs2_dlm_seq_ops = {
2966 2966
2967static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) 2967static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
2968{ 2968{
2969 struct seq_file *seq = (struct seq_file *) file->private_data; 2969 struct seq_file *seq = file->private_data;
2970 struct ocfs2_dlm_seq_priv *priv = seq->private; 2970 struct ocfs2_dlm_seq_priv *priv = seq->private;
2971 struct ocfs2_lock_res *res = &priv->p_iter_res; 2971 struct ocfs2_lock_res *res = &priv->p_iter_res;
2972 2972
@@ -3000,7 +3000,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
3000 goto out; 3000 goto out;
3001 } 3001 }
3002 3002
3003 seq = (struct seq_file *) file->private_data; 3003 seq = file->private_data;
3004 seq->private = priv; 3004 seq->private = priv;
3005 3005
3006 ocfs2_add_lockres_tracking(&priv->p_iter_res, 3006 ocfs2_add_lockres_tracking(&priv->p_iter_res,
@@ -3897,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3897 oinfo->dqi_gi.dqi_free_entry = 3897 oinfo->dqi_gi.dqi_free_entry =
3898 be32_to_cpu(lvb->lvb_free_entry); 3898 be32_to_cpu(lvb->lvb_free_entry);
3899 } else { 3899 } else {
3900 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); 3900 status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
3901 oinfo->dqi_giblk, &bh);
3901 if (status) { 3902 if (status) {
3902 mlog_errno(status); 3903 mlog_errno(status);
3903 goto bail; 3904 goto bail;
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d6..1d596d8c4a4a 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
84 OI_LS_PARENT, 84 OI_LS_PARENT,
85 OI_LS_RENAME1, 85 OI_LS_RENAME1,
86 OI_LS_RENAME2, 86 OI_LS_RENAME2,
87 OI_LS_REFLINK_TARGET,
87}; 88};
88 89
89int ocfs2_dlm_init(struct ocfs2_super *osb); 90int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5fbd9cea968..9a03c151b5ce 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h> 38#include <linux/quotaops.h>
39#include <linux/blkdev.h>
39 40
40#define MLOG_MASK_PREFIX ML_INODE 41#define MLOG_MASK_PREFIX ML_INODE
41#include <cluster/masklog.h> 42#include <cluster/masklog.h>
@@ -175,13 +176,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
175 return 0; 176 return 0;
176} 177}
177 178
178static int ocfs2_sync_file(struct file *file, 179static int ocfs2_sync_file(struct file *file, int datasync)
179 struct dentry *dentry,
180 int datasync)
181{ 180{
182 int err = 0; 181 int err = 0;
183 journal_t *journal; 182 journal_t *journal;
184 struct inode *inode = dentry->d_inode; 183 struct dentry *dentry = file->f_path.dentry;
184 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 186
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -191,8 +191,16 @@ static int ocfs2_sync_file(struct file *file,
191 if (err) 191 if (err)
192 goto bail; 192 goto bail;
193 193
194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
195 /*
196 * We still have to flush drive's caches to get data to the
197 * platter
198 */
199 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
200 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
201 NULL, BLKDEV_IFL_WAIT);
195 goto bail; 202 goto bail;
203 }
196 204
197 journal = osb->journal->j_journal; 205 journal = osb->journal->j_journal;
198 err = jbd2_journal_force_commit(journal); 206 err = jbd2_journal_force_commit(journal);
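
ocfs2_sync_file() adapts to fsync's prototype losing its dentry argument and closes a datasync hole: when only data changed (no I_DIRTY_DATASYNC), no journal commit runs, so nothing flushed the drive's volatile cache. With barriers enabled the cache is now flushed explicitly, using the 2.6.36-era blkdev_issue_flush() signature visible above:

	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
		/* no metadata commit will happen, but the data may
		 * still sit in the drive's write cache -- flush it */
		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
					   NULL, BLKDEV_IFL_WAIT);
		goto bail;
	}
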
@@ -278,10 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
278 inode->i_atime = CURRENT_TIME; 286 inode->i_atime = CURRENT_TIME;
279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 287 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 288 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
281 289 ocfs2_journal_dirty(handle, bh);
282 ret = ocfs2_journal_dirty(handle, bh);
283 if (ret < 0)
284 mlog_errno(ret);
285 290
286out_commit: 291out_commit:
287 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 292 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +435,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
430 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 435 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
431 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 436 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
432 437
433 status = ocfs2_journal_dirty(handle, fe_bh); 438 ocfs2_journal_dirty(handle, fe_bh);
434 if (status < 0)
435 mlog_errno(status);
436 439
437out_commit: 440out_commit:
438 ocfs2_commit_trans(osb, handle); 441 ocfs2_commit_trans(osb, handle);
@@ -449,7 +452,6 @@ static int ocfs2_truncate_file(struct inode *inode,
449 int status = 0; 452 int status = 0;
450 struct ocfs2_dinode *fe = NULL; 453 struct ocfs2_dinode *fe = NULL;
451 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 454 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
452 struct ocfs2_truncate_context *tc = NULL;
453 455
454 mlog_entry("(inode = %llu, new_i_size = %llu\n", 456 mlog_entry("(inode = %llu, new_i_size = %llu\n",
455 (unsigned long long)OCFS2_I(inode)->ip_blkno, 457 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +490,9 @@ static int ocfs2_truncate_file(struct inode *inode,
488 490
489 down_write(&OCFS2_I(inode)->ip_alloc_sem); 491 down_write(&OCFS2_I(inode)->ip_alloc_sem);
490 492
493 ocfs2_resv_discard(&osb->osb_la_resmap,
494 &OCFS2_I(inode)->ip_la_data_resv);
495
491 /* 496 /*
492 * The inode lock forced other nodes to sync and drop their 497 * The inode lock forced other nodes to sync and drop their
493 * pages, which (correctly) happens even if we have a truncate 498 * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +522,7 @@ static int ocfs2_truncate_file(struct inode *inode,
517 goto bail_unlock_sem; 522 goto bail_unlock_sem;
518 } 523 }
519 524
520 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 525 status = ocfs2_commit_truncate(osb, inode, di_bh);
521 if (status < 0) {
522 mlog_errno(status);
523 goto bail_unlock_sem;
524 }
525
526 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
527 if (status < 0) { 526 if (status < 0) {
528 mlog_errno(status); 527 mlog_errno(status);
529 goto bail_unlock_sem; 528 goto bail_unlock_sem;
@@ -666,11 +665,7 @@ restarted_transaction:
666 goto leave; 665 goto leave;
667 } 666 }
668 667
669 status = ocfs2_journal_dirty(handle, bh); 668 ocfs2_journal_dirty(handle, bh);
670 if (status < 0) {
671 mlog_errno(status);
672 goto leave;
673 }
674 669
675 spin_lock(&OCFS2_I(inode)->ip_lock); 670 spin_lock(&OCFS2_I(inode)->ip_lock);
676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 671 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -738,61 +733,113 @@ leave:
738 return status; 733 return status;
739} 734}
740 735
736/*
737 * While a write will already be ordering the data, a truncate will not.
738 * Thus, we need to explicitly order the zeroed pages.
739 */
740static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
741{
742 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
743 handle_t *handle = NULL;
744 int ret = 0;
745
746 if (!ocfs2_should_order_data(inode))
747 goto out;
748
749 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
750 if (IS_ERR(handle)) {
751 ret = -ENOMEM;
752 mlog_errno(ret);
753 goto out;
754 }
755
756 ret = ocfs2_jbd2_file_inode(handle, inode);
757 if (ret < 0)
758 mlog_errno(ret);
759
760out:
761 if (ret) {
762 if (!IS_ERR(handle))
763 ocfs2_commit_trans(osb, handle);
764 handle = ERR_PTR(ret);
765 }
766 return handle;
767}
768
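
ocfs2_zero_start_ordered_transaction() exists because page zeroing during an extend writes file data with no surrounding write(2), so nothing has put those pages on the journal's ordered-data list; ocfs2_jbd2_file_inode() makes that attachment inside a small transaction. A hedged sketch of the intended call pattern, mirroring ocfs2_write_zero_page() below (error handling elided):

        handle_t *handle = NULL;

        /* lazily start the ordering transaction the first time a
         * block is actually zeroed; a NULL return (not an ERR_PTR)
         * means the mount does not use data=ordered */
        if (!handle) {
                handle = ocfs2_zero_start_ordered_transaction(inode);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
        }
        /* ... zero and commit the blocks of the page ... */
        if (handle)
                ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
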
741/* Some parts of this taken from generic_cont_expand, which turned out 769/* Some parts of this taken from generic_cont_expand, which turned out
742 * to be too fragile to do exactly what we need without us having to 770 * to be too fragile to do exactly what we need without us having to
743 * worry about recursive locking in ->write_begin() and ->write_end(). */ 771 * worry about recursive locking in ->write_begin() and ->write_end(). */
744static int ocfs2_write_zero_page(struct inode *inode, 772static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
745 u64 size) 773 u64 abs_to)
746{ 774{
747 struct address_space *mapping = inode->i_mapping; 775 struct address_space *mapping = inode->i_mapping;
748 struct page *page; 776 struct page *page;
749 unsigned long index; 777 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
750 unsigned int offset;
751 handle_t *handle = NULL; 778 handle_t *handle = NULL;
752 int ret; 779 int ret = 0;
780 unsigned zero_from, zero_to, block_start, block_end;
753 781
754 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 782 BUG_ON(abs_from >= abs_to);
755 /* ugh. in prepare/commit_write, if from==to==start of block, we 783 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
756 ** skip the prepare. make sure we never send an offset for the start 784 BUG_ON(abs_from & (inode->i_blkbits - 1));
757 ** of a block
758 */
759 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
760 offset++;
761 }
762 index = size >> PAGE_CACHE_SHIFT;
763 785
764 page = grab_cache_page(mapping, index); 786 page = find_or_create_page(mapping, index, GFP_NOFS);
765 if (!page) { 787 if (!page) {
766 ret = -ENOMEM; 788 ret = -ENOMEM;
767 mlog_errno(ret); 789 mlog_errno(ret);
768 goto out; 790 goto out;
769 } 791 }
770 792
771 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 793 /* Get the offsets within the page that we want to zero */
772 if (ret < 0) { 794 zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
773 mlog_errno(ret); 795 zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
774 goto out_unlock; 796 if (!zero_to)
775 } 797 zero_to = PAGE_CACHE_SIZE;
776 798
777 if (ocfs2_should_order_data(inode)) { 799 mlog(0,
778 handle = ocfs2_start_walk_page_trans(inode, page, offset, 800 "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
779 offset); 801 (unsigned long long)abs_from, (unsigned long long)abs_to,
780 if (IS_ERR(handle)) { 802 index, zero_from, zero_to);
781 ret = PTR_ERR(handle); 803
782 handle = NULL; 804 /* We know that zero_from is block aligned */
805 for (block_start = zero_from; block_start < zero_to;
806 block_start = block_end) {
807 block_end = block_start + (1 << inode->i_blkbits);
808
809 /*
810 * block_start is block-aligned. Bump it by one to
811 * force ocfs2_{prepare,commit}_write() to zero the
812 * whole block.
813 */
814 ret = ocfs2_prepare_write_nolock(inode, page,
815 block_start + 1,
816 block_start + 1);
817 if (ret < 0) {
818 mlog_errno(ret);
783 goto out_unlock; 819 goto out_unlock;
784 } 820 }
785 }
786 821
787 /* must not update i_size! */ 822 if (!handle) {
788 ret = block_commit_write(page, offset, offset); 823 handle = ocfs2_zero_start_ordered_transaction(inode);
789 if (ret < 0) 824 if (IS_ERR(handle)) {
790 mlog_errno(ret); 825 ret = PTR_ERR(handle);
791 else 826 handle = NULL;
792 ret = 0; 827 break;
828 }
829 }
830
831 /* must not update i_size! */
832 ret = block_commit_write(page, block_start + 1,
833 block_start + 1);
834 if (ret < 0)
835 mlog_errno(ret);
836 else
837 ret = 0;
838 }
793 839
794 if (handle) 840 if (handle)
795 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 841 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
842
796out_unlock: 843out_unlock:
797 unlock_page(page); 844 unlock_page(page);
798 page_cache_release(page); 845 page_cache_release(page);
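
The offset arithmetic in the rewritten ocfs2_write_zero_page() is compact; a standalone userspace replay makes it concrete. This is a demo under assumed parameters (4K pages, 512-byte blocks), not filesystem code:

        #include <stdio.h>
        #include <stdint.h>

        #define PAGE_CACHE_SIZE 4096UL  /* assumed 4K pages */

        int main(void)
        {
                uint64_t abs_from = 8192 + 1024, abs_to = 8192 + 3072;
                unsigned long index = abs_from / PAGE_CACHE_SIZE;
                unsigned zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
                unsigned zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
                unsigned blkbits = 9, bs, be;   /* 512-byte blocks */

                if (!zero_to)   /* range ends exactly on a page boundary */
                        zero_to = PAGE_CACHE_SIZE;

                printf("page %lu: zero [%u, %u)\n", index, zero_from, zero_to);
                for (bs = zero_from; bs < zero_to; bs = be) {
                        be = bs + (1 << blkbits);
                        /* bs + 1 is what forces prepare/commit_write to
                         * zero the whole block, as in the hunk above */
                        printf("  block [%u, %u) via offset %u\n",
                               bs, be, bs + 1);
                }
                return 0;
        }
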
@@ -800,22 +847,114 @@ out:
800 return ret; 847 return ret;
801} 848}
802 849
803static int ocfs2_zero_extend(struct inode *inode, 850/*
804 u64 zero_to_size) 851 * Find the next range to zero. We do this in terms of bytes because
852 * that's what ocfs2_zero_extend() wants, and it is dealing with the
853 * pagecache. We may return multiple extents.
854 *
 855 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
856 * needs to be zeroed. range_start and range_end return the next zeroing
857 * range. A subsequent call should pass the previous range_end as its
858 * zero_start. If range_end is 0, there's nothing to do.
859 *
 860 * Unwritten extents are skipped over. Refcounted extents are CoW'd.
861 */
862static int ocfs2_zero_extend_get_range(struct inode *inode,
863 struct buffer_head *di_bh,
864 u64 zero_start, u64 zero_end,
865 u64 *range_start, u64 *range_end)
805{ 866{
806 int ret = 0; 867 int rc = 0, needs_cow = 0;
807 u64 start_off; 868 u32 p_cpos, zero_clusters = 0;
808 struct super_block *sb = inode->i_sb; 869 u32 zero_cpos =
870 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
871 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
872 unsigned int num_clusters = 0;
873 unsigned int ext_flags = 0;
809 874
810 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 875 while (zero_cpos < last_cpos) {
811 while (start_off < zero_to_size) { 876 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
812 ret = ocfs2_write_zero_page(inode, start_off); 877 &num_clusters, &ext_flags);
813 if (ret < 0) { 878 if (rc) {
814 mlog_errno(ret); 879 mlog_errno(rc);
880 goto out;
881 }
882
883 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
884 zero_clusters = num_clusters;
885 if (ext_flags & OCFS2_EXT_REFCOUNTED)
886 needs_cow = 1;
887 break;
888 }
889
890 zero_cpos += num_clusters;
891 }
892 if (!zero_clusters) {
893 *range_end = 0;
894 goto out;
895 }
896
897 while ((zero_cpos + zero_clusters) < last_cpos) {
898 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
899 &p_cpos, &num_clusters,
900 &ext_flags);
901 if (rc) {
902 mlog_errno(rc);
815 goto out; 903 goto out;
816 } 904 }
817 905
818 start_off += sb->s_blocksize; 906 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
907 break;
908 if (ext_flags & OCFS2_EXT_REFCOUNTED)
909 needs_cow = 1;
910 zero_clusters += num_clusters;
911 }
912 if ((zero_cpos + zero_clusters) > last_cpos)
913 zero_clusters = last_cpos - zero_cpos;
914
915 if (needs_cow) {
916 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
917 UINT_MAX);
918 if (rc) {
919 mlog_errno(rc);
920 goto out;
921 }
922 }
923
924 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
925 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
926 zero_cpos + zero_clusters);
927
928out:
929 return rc;
930}
931
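
The comment above defines an iterator contract: the caller feeds the previous range_end back in as zero_start and stops once range_end comes back 0. That is exactly how ocfs2_zero_extend(), later in this hunk, drives it; condensed, with clamping and error logging elided:

        u64 zero_start = start, range_start = 0, range_end = 0;

        while (zero_start < zero_to_size) {
                ret = ocfs2_zero_extend_get_range(inode, di_bh,
                                                  zero_start, zero_to_size,
                                                  &range_start, &range_end);
                if (ret || !range_end)
                        break;  /* error, or nothing left to zero */
                ret = ocfs2_zero_extend_range(inode, range_start, range_end);
                if (ret)
                        break;
                zero_start = range_end;
        }
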
932/*
933 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
934 * has made sure that the entire range needs zeroing.
935 */
936static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
937 u64 range_end)
938{
939 int rc = 0;
940 u64 next_pos;
941 u64 zero_pos = range_start;
942
943 mlog(0, "range_start = %llu, range_end = %llu\n",
944 (unsigned long long)range_start,
945 (unsigned long long)range_end);
946 BUG_ON(range_start >= range_end);
947
948 while (zero_pos < range_end) {
949 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
950 if (next_pos > range_end)
951 next_pos = range_end;
952 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
953 if (rc < 0) {
954 mlog_errno(rc);
955 break;
956 }
957 zero_pos = next_pos;
819 958
820 /* 959 /*
821 * Very large extends have the potential to lock up 960 * Very large extends have the potential to lock up
@@ -824,16 +963,63 @@ static int ocfs2_zero_extend(struct inode *inode,
824 cond_resched(); 963 cond_resched();
825 } 964 }
826 965
827out: 966 return rc;
967}
968
969int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
970 loff_t zero_to_size)
971{
972 int ret = 0;
973 u64 zero_start, range_start = 0, range_end = 0;
974 struct super_block *sb = inode->i_sb;
975
976 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
977 mlog(0, "zero_start %llu for i_size %llu\n",
978 (unsigned long long)zero_start,
979 (unsigned long long)i_size_read(inode));
980 while (zero_start < zero_to_size) {
981 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
982 zero_to_size,
983 &range_start,
984 &range_end);
985 if (ret) {
986 mlog_errno(ret);
987 break;
988 }
989 if (!range_end)
990 break;
991 /* Trim the ends */
992 if (range_start < zero_start)
993 range_start = zero_start;
994 if (range_end > zero_to_size)
995 range_end = zero_to_size;
996
997 ret = ocfs2_zero_extend_range(inode, range_start,
998 range_end);
999 if (ret) {
1000 mlog_errno(ret);
1001 break;
1002 }
1003 zero_start = range_end;
1004 }
1005
828 return ret; 1006 return ret;
829} 1007}
830 1008
831int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 1009int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1010 u64 new_i_size, u64 zero_to)
832{ 1011{
833 int ret; 1012 int ret;
834 u32 clusters_to_add; 1013 u32 clusters_to_add;
835 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1014 struct ocfs2_inode_info *oi = OCFS2_I(inode);
836 1015
1016 /*
1017 * Only quota files call this without a bh, and they can't be
1018 * refcounted.
1019 */
1020 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
1021 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1022
837 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 1023 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
838 if (clusters_to_add < oi->ip_clusters) 1024 if (clusters_to_add < oi->ip_clusters)
839 clusters_to_add = 0; 1025 clusters_to_add = 0;
@@ -854,7 +1040,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
854 * still need to zero the area between the old i_size and the 1040 * still need to zero the area between the old i_size and the
855 * new i_size. 1041 * new i_size.
856 */ 1042 */
857 ret = ocfs2_zero_extend(inode, zero_to); 1043 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
858 if (ret < 0) 1044 if (ret < 0)
859 mlog_errno(ret); 1045 mlog_errno(ret);
860 1046
@@ -876,27 +1062,15 @@ static int ocfs2_extend_file(struct inode *inode,
876 goto out; 1062 goto out;
877 1063
878 if (i_size_read(inode) == new_i_size) 1064 if (i_size_read(inode) == new_i_size)
879 goto out; 1065 goto out;
880 BUG_ON(new_i_size < i_size_read(inode)); 1066 BUG_ON(new_i_size < i_size_read(inode));
881 1067
882 /* 1068 /*
883 * Fall through for converting inline data, even if the fs
884 * supports sparse files.
885 *
886 * The check for inline data here is legal - nobody can add
887 * the feature since we have i_mutex. We must check it again
888 * after acquiring ip_alloc_sem though, as paths like mmap
889 * might have raced us to converting the inode to extents.
890 */
891 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
892 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
893 goto out_update_size;
894
895 /*
896 * The alloc sem blocks people in read/write from reading our 1069 * The alloc sem blocks people in read/write from reading our
897 * allocation until we're done changing it. We depend on 1070 * allocation until we're done changing it. We depend on
898 * i_mutex to block other extend/truncate calls while we're 1071 * i_mutex to block other extend/truncate calls while we're
899 * here. 1072 * here. We even have to hold it for sparse files because there
1073 * might be some tail zeroing.
900 */ 1074 */
901 down_write(&oi->ip_alloc_sem); 1075 down_write(&oi->ip_alloc_sem);
902 1076
@@ -913,14 +1087,16 @@ static int ocfs2_extend_file(struct inode *inode,
913 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1087 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
914 if (ret) { 1088 if (ret) {
915 up_write(&oi->ip_alloc_sem); 1089 up_write(&oi->ip_alloc_sem);
916
917 mlog_errno(ret); 1090 mlog_errno(ret);
918 goto out; 1091 goto out;
919 } 1092 }
920 } 1093 }
921 1094
922 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1095 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
923 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); 1096 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1097 else
1098 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1099 new_i_size);
924 1100
925 up_write(&oi->ip_alloc_sem); 1101 up_write(&oi->ip_alloc_sem);
926 1102
@@ -946,9 +1122,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
946 struct ocfs2_super *osb = OCFS2_SB(sb); 1122 struct ocfs2_super *osb = OCFS2_SB(sb);
947 struct buffer_head *bh = NULL; 1123 struct buffer_head *bh = NULL;
948 handle_t *handle = NULL; 1124 handle_t *handle = NULL;
949 int qtype;
950 struct dquot *transfer_from[MAXQUOTAS] = { };
951 struct dquot *transfer_to[MAXQUOTAS] = { }; 1125 struct dquot *transfer_to[MAXQUOTAS] = { };
1126 int qtype;
952 1127
953 mlog_entry("(0x%p, '%.*s')\n", dentry, 1128 mlog_entry("(0x%p, '%.*s')\n", dentry,
954 dentry->d_name.len, dentry->d_name.name); 1129 dentry->d_name.len, dentry->d_name.name);
@@ -979,10 +1154,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
979 if (status) 1154 if (status)
980 return status; 1155 return status;
981 1156
1157 if (is_quota_modification(inode, attr))
1158 dquot_initialize(inode);
982 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 1159 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
983 if (size_change) { 1160 if (size_change) {
984 dquot_initialize(inode);
985
986 status = ocfs2_rw_lock(inode, 1); 1161 status = ocfs2_rw_lock(inode, 1);
987 if (status < 0) { 1162 if (status < 0) {
988 mlog_errno(status); 1163 mlog_errno(status);
@@ -1032,9 +1207,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1032 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1207 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1033 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1208 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1034 USRQUOTA); 1209 USRQUOTA);
1035 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, 1210 if (!transfer_to[USRQUOTA]) {
1036 USRQUOTA);
1037 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
1038 status = -ESRCH; 1211 status = -ESRCH;
1039 goto bail_unlock; 1212 goto bail_unlock;
1040 } 1213 }
@@ -1044,9 +1217,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1044 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1217 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1045 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1218 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1046 GRPQUOTA); 1219 GRPQUOTA);
1047 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, 1220 if (!transfer_to[GRPQUOTA]) {
1048 GRPQUOTA);
1049 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1050 status = -ESRCH; 1221 status = -ESRCH;
1051 goto bail_unlock; 1222 goto bail_unlock;
1052 } 1223 }
@@ -1058,7 +1229,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1058 mlog_errno(status); 1229 mlog_errno(status);
1059 goto bail_unlock; 1230 goto bail_unlock;
1060 } 1231 }
1061 status = dquot_transfer(inode, attr); 1232 status = __dquot_transfer(inode, transfer_to);
1062 if (status < 0) 1233 if (status < 0)
1063 goto bail_commit; 1234 goto bail_commit;
1064 } else { 1235 } else {
@@ -1071,18 +1242,26 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1071 } 1242 }
1072 1243
1073 /* 1244 /*
1074 * This will intentionally not wind up calling vmtruncate(), 1245 * This will intentionally not wind up calling truncate_setsize(),
1075 * since all the work for a size change has been done above. 1246 * since all the work for a size change has been done above.
1076 * Otherwise, we could get into problems with truncate as 1247 * Otherwise, we could get into problems with truncate as
1077 * ip_alloc_sem is used there to protect against i_size 1248 * ip_alloc_sem is used there to protect against i_size
1078 * changes. 1249 * changes.
1250 *
1251 * XXX: this means the conditional below can probably be removed.
1079 */ 1252 */
1080 status = inode_setattr(inode, attr); 1253 if ((attr->ia_valid & ATTR_SIZE) &&
1081 if (status < 0) { 1254 attr->ia_size != i_size_read(inode)) {
1082 mlog_errno(status); 1255 status = vmtruncate(inode, attr->ia_size);
1083 goto bail_commit; 1256 if (status) {
1257 mlog_errno(status);
1258 goto bail_commit;
1259 }
1084 } 1260 }
1085 1261
1262 setattr_copy(inode, attr);
1263 mark_inode_dirty(inode);
1264
1086 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1265 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1087 if (status < 0) 1266 if (status < 0)
1088 mlog_errno(status); 1267 mlog_errno(status);
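
The uid/gid branch above switches from dquot_transfer() to __dquot_transfer() with pre-acquired dquots: ocfs2 must dqget() the target structures before the transaction starts, while the cluster locks are held, and __dquot_transfer() derives the source quotas from the inode itself, which is why the old transfer_from array disappears. The resulting sequence, condensed from this hunk (error paths elided):

        struct dquot *transfer_to[MAXQUOTAS] = { };

        if (attr->ia_valid & ATTR_UID)
                transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, USRQUOTA);
        if (attr->ia_valid & ATTR_GID)
                transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, GRPQUOTA);

        /* inside the started transaction: */
        status = __dquot_transfer(inode, transfer_to);

        /* after the transaction, success or not (dqput(NULL) is safe): */
        for (qtype = 0; qtype < MAXQUOTAS; qtype++)
                dqput(transfer_to[qtype]);
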
@@ -1098,10 +1277,8 @@ bail:
1098 brelse(bh); 1277 brelse(bh);
1099 1278
1100 /* Release quota pointers in case we acquired them */ 1279 /* Release quota pointers in case we acquired them */
1101 for (qtype = 0; qtype < MAXQUOTAS; qtype++) { 1280 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1102 dqput(transfer_to[qtype]); 1281 dqput(transfer_to[qtype]);
1103 dqput(transfer_from[qtype]);
1104 }
1105 1282
1106 if (!status && attr->ia_valid & ATTR_MODE) { 1283 if (!status && attr->ia_valid & ATTR_MODE) {
1107 status = ocfs2_acl_chmod(inode); 1284 status = ocfs2_acl_chmod(inode);
@@ -1195,9 +1372,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1195 di = (struct ocfs2_dinode *) bh->b_data; 1372 di = (struct ocfs2_dinode *) bh->b_data;
1196 di->i_mode = cpu_to_le16(inode->i_mode); 1373 di->i_mode = cpu_to_le16(inode->i_mode);
1197 1374
1198 ret = ocfs2_journal_dirty(handle, bh); 1375 ocfs2_journal_dirty(handle, bh);
1199 if (ret < 0)
1200 mlog_errno(ret);
1201 1376
1202out_trans: 1377out_trans:
1203 ocfs2_commit_trans(osb, handle); 1378 ocfs2_commit_trans(osb, handle);
@@ -1434,16 +1609,90 @@ out:
1434 return ret; 1609 return ret;
1435} 1610}
1436 1611
1612static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1613{
1614 int i;
1615 struct ocfs2_extent_rec *rec = NULL;
1616
1617 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1618
1619 rec = &el->l_recs[i];
1620
1621 if (le32_to_cpu(rec->e_cpos) < pos)
1622 break;
1623 }
1624
1625 return i;
1626}
1627
1628/*
 1629 * Helper to calculate the punching pos and length in one run. We handle the
1630 * following three cases in order:
1631 *
1632 * - remove the entire record
1633 * - remove a partial record
1634 * - no record needs to be removed (hole-punching completed)
 1635 */
1636static void ocfs2_calc_trunc_pos(struct inode *inode,
1637 struct ocfs2_extent_list *el,
1638 struct ocfs2_extent_rec *rec,
1639 u32 trunc_start, u32 *trunc_cpos,
1640 u32 *trunc_len, u32 *trunc_end,
1641 u64 *blkno, int *done)
1642{
1643 int ret = 0;
1644 u32 coff, range;
1645
1646 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1647
1648 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1649 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1650 /*
1651 * Skip holes if any.
1652 */
1653 if (range < *trunc_end)
1654 *trunc_end = range;
1655 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1656 *blkno = le64_to_cpu(rec->e_blkno);
1657 *trunc_end = le32_to_cpu(rec->e_cpos);
1658 } else if (range > trunc_start) {
1659 *trunc_cpos = trunc_start;
1660 *trunc_len = *trunc_end - trunc_start;
1661 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1662 *blkno = le64_to_cpu(rec->e_blkno) +
1663 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1664 *trunc_end = trunc_start;
1665 } else {
1666 /*
 1667 * There are two possibilities here:
1668 *
1669 * - last record has been removed
1670 * - trunc_start was within a hole
1671 *
 1672 * Both cases mean that hole punching is complete.
1673 */
1674 ret = 1;
1675 }
1676
1677 *done = ret;
1678}
1679
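
The three cases read more easily with numbers. The standalone demo below replays the middle case, a partial record removal, for a record covering clusters [100, 150) with trunc_start = 120 and trunc_end = 150; plain integers stand in for the le32/le64 fields and a constant replaces ocfs2_clusters_to_blocks():

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                /* extent record covering clusters [100, 150) */
                uint32_t e_cpos = 100, clusters = 50;
                uint64_t e_blkno = 4096;
                uint32_t trunc_start = 120, trunc_end = 150;
                uint32_t range = e_cpos + clusters;
                uint32_t trunc_cpos, trunc_len, coff;
                uint64_t blkno = 0;
                unsigned bpc = 8;       /* blocks per cluster, stand-in */

                if (e_cpos >= trunc_start) {
                        /* case 1: the whole record lies past trunc_start */
                        trunc_cpos = e_cpos;
                        if (range < trunc_end)
                                trunc_end = range;      /* skip holes */
                        trunc_len = trunc_end - e_cpos;
                        blkno = e_blkno;
                        trunc_end = e_cpos;
                } else if (range > trunc_start) {
                        /* case 2: only the record's tail is removed */
                        trunc_cpos = trunc_start;
                        trunc_len = trunc_end - trunc_start;
                        coff = trunc_start - e_cpos;
                        blkno = e_blkno + (uint64_t)coff * bpc;
                        trunc_end = trunc_start;
                } else {
                        /* case 3: nothing left, hole punch is complete */
                        printf("done\n");
                        return 0;
                }
                printf("punch [%u, +%u) from blkno %llu, new trunc_end %u\n",
                       trunc_cpos, trunc_len, (unsigned long long)blkno,
                       trunc_end);
                return 0;
        }
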
1437static int ocfs2_remove_inode_range(struct inode *inode, 1680static int ocfs2_remove_inode_range(struct inode *inode,
1438 struct buffer_head *di_bh, u64 byte_start, 1681 struct buffer_head *di_bh, u64 byte_start,
1439 u64 byte_len) 1682 u64 byte_len)
1440{ 1683{
1441 int ret = 0; 1684 int ret = 0, flags = 0, done = 0, i;
1442 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1685 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1686 u32 cluster_in_el;
1443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1687 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1444 struct ocfs2_cached_dealloc_ctxt dealloc; 1688 struct ocfs2_cached_dealloc_ctxt dealloc;
1445 struct address_space *mapping = inode->i_mapping; 1689 struct address_space *mapping = inode->i_mapping;
1446 struct ocfs2_extent_tree et; 1690 struct ocfs2_extent_tree et;
1691 struct ocfs2_path *path = NULL;
1692 struct ocfs2_extent_list *el = NULL;
1693 struct ocfs2_extent_rec *rec = NULL;
1694 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1695 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1447 1696
1448 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1697 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1449 ocfs2_init_dealloc_ctxt(&dealloc); 1698 ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1469,17 +1718,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1469 goto out; 1718 goto out;
1470 } 1719 }
1471 1720
1721 /*
 1722 * For reflinks, we may need to CoW two clusters which might be
 1723 * partially zeroed later, if the hole's start and end offsets fall
 1724 * within one cluster (i.e. are not exactly aligned to the clustersize).
1725 */
1726
1727 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1728
1729 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1730 if (ret) {
1731 mlog_errno(ret);
1732 goto out;
1733 }
1734
1735 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1736 if (ret) {
1737 mlog_errno(ret);
1738 goto out;
1739 }
1740 }
1741
1472 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1742 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1473 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1743 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1474 if (trunc_len >= trunc_start) 1744 cluster_in_el = trunc_end;
1475 trunc_len -= trunc_start;
1476 else
1477 trunc_len = 0;
1478 1745
1479 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", 1746 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1480 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1747 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1481 (unsigned long long)byte_start, 1748 (unsigned long long)byte_start,
1482 (unsigned long long)byte_len, trunc_start, trunc_len); 1749 (unsigned long long)byte_len, trunc_start, trunc_end);
1483 1750
1484 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1751 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1485 if (ret) { 1752 if (ret) {
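
The reflink guard above CoWs just the two boundary clusters, so that the partial-cluster zeroing done next by ocfs2_zero_partial_clusters() writes into the file's private allocation rather than a shared extent. A trivial demo of which clusters those are, with a 64K cluster size assumed:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                unsigned clustersize_bits = 16;     /* 64K clusters */
                uint64_t byte_start = 10000, byte_len = 200000;

                printf("CoW cluster %llu (hole start) and %llu (hole end)\n",
                       (unsigned long long)(byte_start >> clustersize_bits),
                       (unsigned long long)((byte_start + byte_len)
                                            >> clustersize_bits));
                return 0;
        }
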
@@ -1487,31 +1754,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1487 goto out; 1754 goto out;
1488 } 1755 }
1489 1756
1490 cpos = trunc_start; 1757 path = ocfs2_new_path_from_et(&et);
1491 while (trunc_len) { 1758 if (!path) {
1492 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1759 ret = -ENOMEM;
1493 &alloc_size, NULL); 1760 mlog_errno(ret);
1761 goto out;
1762 }
1763
1764 while (trunc_end > trunc_start) {
1765
1766 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1767 cluster_in_el);
1494 if (ret) { 1768 if (ret) {
1495 mlog_errno(ret); 1769 mlog_errno(ret);
1496 goto out; 1770 goto out;
1497 } 1771 }
1498 1772
1499 if (alloc_size > trunc_len) 1773 el = path_leaf_el(path);
1500 alloc_size = trunc_len; 1774
1775 i = ocfs2_find_rec(el, trunc_end);
1776 /*
1777 * Need to go to previous extent block.
1778 */
1779 if (i < 0) {
1780 if (path->p_tree_depth == 0)
1781 break;
1501 1782
1502 /* Only do work for non-holes */ 1783 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1503 if (phys_cpos != 0) { 1784 path,
1504 ret = ocfs2_remove_btree_range(inode, &et, cpos, 1785 &cluster_in_el);
1505 phys_cpos, alloc_size,
1506 &dealloc);
1507 if (ret) { 1786 if (ret) {
1508 mlog_errno(ret); 1787 mlog_errno(ret);
1509 goto out; 1788 goto out;
1510 } 1789 }
1790
1791 /*
1792 * We've reached the leftmost extent block,
1793 * it's safe to leave.
1794 */
1795 if (cluster_in_el == 0)
1796 break;
1797
1798 /*
 1799 * The cpos we search with for the previous extent block is
 1800 * always one cluster less than the actual trunc_end.
1801 */
1802 trunc_end = cluster_in_el + 1;
1803
1804 ocfs2_reinit_path(path, 1);
1805
1806 continue;
1807
1808 } else
1809 rec = &el->l_recs[i];
1810
1811 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1812 &trunc_len, &trunc_end, &blkno, &done);
1813 if (done)
1814 break;
1815
1816 flags = rec->e_flags;
1817 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1818
1819 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1820 phys_cpos, trunc_len, flags,
1821 &dealloc, refcount_loc);
1822 if (ret < 0) {
1823 mlog_errno(ret);
1824 goto out;
1511 } 1825 }
1512 1826
1513 cpos += alloc_size; 1827 cluster_in_el = trunc_end;
1514 trunc_len -= alloc_size; 1828
1829 ocfs2_reinit_path(path, 1);
1515 } 1830 }
1516 1831
1517 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1832 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
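
Compared with the old forward scan over ocfs2_get_clusters(), the new loop walks the extent tree right to left: find the leaf holding cluster_in_el, pick the last record starting below trunc_end, remove all or part of it, and when ocfs2_find_rec() returns -1, hop to the previous leaf. A control-flow skeleton of the loop above, error handling elided:

        while (trunc_end > trunc_start) {
                ocfs2_find_path(INODE_CACHE(inode), path, cluster_in_el);
                el = path_leaf_el(path);

                i = ocfs2_find_rec(el, trunc_end);
                if (i < 0) {
                        /* leaf exhausted: step to the previous leaf,
                         * or stop at the leftmost one */
                        if (path->p_tree_depth == 0)
                                break;
                        ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
                                                      &cluster_in_el);
                        if (cluster_in_el == 0)
                                break;
                        trunc_end = cluster_in_el + 1;
                } else {
                        rec = &el->l_recs[i];
                        ocfs2_calc_trunc_pos(inode, el, rec, trunc_start,
                                             &trunc_cpos, &trunc_len,
                                             &trunc_end, &blkno, &done);
                        if (done)
                                break;
                        flags = rec->e_flags;
                        phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
                                                             blkno);
                        ocfs2_remove_btree_range(inode, &et, trunc_cpos,
                                                 phys_cpos, trunc_len,
                                                 flags, &dealloc,
                                                 refcount_loc);
                        cluster_in_el = trunc_end;
                }
                ocfs2_reinit_path(path, 1);
        }
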
@@ -2001,9 +2316,13 @@ relock:
2001 * direct write may have instantiated a few 2316 * direct write may have instantiated a few
2002 * blocks outside i_size. Trim these off again. 2317 * blocks outside i_size. Trim these off again.
2003 * Don't need i_size_read because we hold i_mutex. 2318 * Don't need i_size_read because we hold i_mutex.
2319 *
2320 * XXX(truncate): this looks buggy because ocfs2 did not
2321 * actually implement ->truncate. Take a look at
2322 * the new truncate sequence and update this accordingly
2004 */ 2323 */
2005 if (*ppos + count > inode->i_size) 2324 if (*ppos + count > inode->i_size)
2006 vmtruncate(inode, inode->i_size); 2325 truncate_setsize(inode, inode->i_size);
2007 ret = written; 2326 ret = written;
2008 goto out_dio; 2327 goto out_dio;
2009 } 2328 }
@@ -2019,7 +2338,7 @@ out_dio:
2019 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2338 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2020 2339
2021 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2340 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2022 ((file->f_flags & O_DIRECT) && has_refcount)) { 2341 ((file->f_flags & O_DIRECT) && !direct_io)) {
2023 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2342 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2024 pos + count - 1); 2343 pos + count - 1);
2025 if (ret < 0) 2344 if (ret < 0)
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
54int ocfs2_simple_size_update(struct inode *inode, 54int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh, 55 struct buffer_head *di_bh,
56 u64 new_i_size); 56 u64 new_i_size);
57int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 57int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
58 u64 zero_to); 58 u64 new_i_size, u64 zero_to);
59int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
60 loff_t zero_to);
59int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
60int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
61 struct kstat *stat); 63 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index af189887201c..eece3e05d9d0 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -376,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
376 376
377 OCFS2_I(inode)->ip_last_used_slot = 0; 377 OCFS2_I(inode)->ip_last_used_slot = 0;
378 OCFS2_I(inode)->ip_last_used_group = 0; 378 OCFS2_I(inode)->ip_last_used_group = 0;
379
380 if (S_ISDIR(inode->i_mode))
381 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
382 OCFS2_RESV_FLAG_DIR);
379 mlog_exit_void(); 383 mlog_exit_void();
380} 384}
381 385
@@ -484,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
484 OCFS2_BH_IGNORE_CACHE); 488 OCFS2_BH_IGNORE_CACHE);
485 } else { 489 } else {
486 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 490 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
487 if (!status) 491 /*
492 * If buffer is in jbd, then its checksum may not have been
493 * computed as yet.
494 */
495 if (!status && !buffer_jbd(bh))
488 status = ocfs2_validate_inode_block(osb->sb, bh); 496 status = ocfs2_validate_inode_block(osb->sb, bh);
489 } 497 }
490 if (status < 0) { 498 if (status < 0) {
@@ -539,7 +547,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
539 struct buffer_head *fe_bh) 547 struct buffer_head *fe_bh)
540{ 548{
541 int status = 0; 549 int status = 0;
542 struct ocfs2_truncate_context *tc = NULL;
543 struct ocfs2_dinode *fe; 550 struct ocfs2_dinode *fe;
544 handle_t *handle = NULL; 551 handle_t *handle = NULL;
545 552
@@ -582,13 +589,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
582 ocfs2_commit_trans(osb, handle); 589 ocfs2_commit_trans(osb, handle);
583 handle = NULL; 590 handle = NULL;
584 591
585 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 592 status = ocfs2_commit_truncate(osb, inode, fe_bh);
586 if (status < 0) {
587 mlog_errno(status);
588 goto out;
589 }
590
591 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
592 if (status < 0) { 593 if (status < 0) {
593 mlog_errno(status); 594 mlog_errno(status);
594 goto out; 595 goto out;
@@ -659,12 +660,7 @@ static int ocfs2_remove_inode(struct inode *inode,
659 660
660 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 661 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
661 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 662 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
662 663 ocfs2_journal_dirty(handle, di_bh);
663 status = ocfs2_journal_dirty(handle, di_bh);
664 if (status < 0) {
665 mlog_errno(status);
666 goto bail_commit;
667 }
668 664
669 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 665 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
670 dquot_free_inode(inode); 666 dquot_free_inode(inode);
@@ -977,10 +973,10 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
977 truncate_inode_pages(&inode->i_data, 0); 973 truncate_inode_pages(&inode->i_data, 0);
978} 974}
979 975
980void ocfs2_delete_inode(struct inode *inode) 976static void ocfs2_delete_inode(struct inode *inode)
981{ 977{
982 int wipe, status; 978 int wipe, status;
983 sigset_t blocked, oldset; 979 sigset_t oldset;
984 struct buffer_head *di_bh = NULL; 980 struct buffer_head *di_bh = NULL;
985 981
986 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 982 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -1007,13 +1003,7 @@ void ocfs2_delete_inode(struct inode *inode)
1007 * messaging paths may return us -ERESTARTSYS. Which would 1003 * messaging paths may return us -ERESTARTSYS. Which would
1008 * cause us to exit early, resulting in inodes being orphaned 1004 * cause us to exit early, resulting in inodes being orphaned
1009 * forever. */ 1005 * forever. */
1010 sigfillset(&blocked); 1006 ocfs2_block_signals(&oldset);
1011 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
1012 if (status < 0) {
1013 mlog_errno(status);
1014 ocfs2_cleanup_delete_inode(inode, 1);
1015 goto bail;
1016 }
1017 1007
1018 /* 1008 /*
1019 * Synchronize us against ocfs2_get_dentry. We take this in 1009 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1087,24 +1077,19 @@ bail_unlock_nfs_sync:
1087 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); 1077 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1088 1078
1089bail_unblock: 1079bail_unblock:
1090 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1080 ocfs2_unblock_signals(&oldset);
1091 if (status < 0)
1092 mlog_errno(status);
1093bail: 1081bail:
1094 clear_inode(inode);
1095 mlog_exit_void(); 1082 mlog_exit_void();
1096} 1083}
1097 1084
1098void ocfs2_clear_inode(struct inode *inode) 1085static void ocfs2_clear_inode(struct inode *inode)
1099{ 1086{
1100 int status; 1087 int status;
1101 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1088 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1102 1089
1103 mlog_entry_void(); 1090 mlog_entry_void();
1104 1091
1105 if (!inode) 1092 end_writeback(inode);
1106 goto bail;
1107
1108 mlog(0, "Clearing inode: %llu, nlink = %u\n", 1093 mlog(0, "Clearing inode: %llu, nlink = %u\n",
1109 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink); 1094 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
1110 1095
@@ -1123,6 +1108,10 @@ void ocfs2_clear_inode(struct inode *inode)
1123 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); 1108 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1124 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1109 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1125 1110
1111 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
1112 &oi->ip_la_data_resv);
1113 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1114
1126 /* We very well may get a clear_inode before all an inodes 1115 /* We very well may get a clear_inode before all an inodes
1127 * metadata has hit disk. Of course, we can't drop any cluster 1116 * metadata has hit disk. Of course, we can't drop any cluster
1128 * locks until the journal has finished with it. The only 1117 * locks until the journal has finished with it. The only
@@ -1192,16 +1181,27 @@ void ocfs2_clear_inode(struct inode *inode)
1192 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, 1181 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
1193 &oi->ip_jinode); 1182 &oi->ip_jinode);
1194 1183
1195bail:
1196 mlog_exit_void(); 1184 mlog_exit_void();
1197} 1185}
1198 1186
1187void ocfs2_evict_inode(struct inode *inode)
1188{
1189 if (!inode->i_nlink ||
1190 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
1191 ocfs2_delete_inode(inode);
1192 } else {
1193 truncate_inode_pages(&inode->i_data, 0);
1194 }
1195 ocfs2_clear_inode(inode);
1196}
1197
1199/* Called under inode_lock, with no more references on the 1198/* Called under inode_lock, with no more references on the
1200 * struct inode, so it's safe here to check the flags field 1199 * struct inode, so it's safe here to check the flags field
1201 * and to manipulate i_nlink without any other locks. */ 1200 * and to manipulate i_nlink without any other locks. */
1202void ocfs2_drop_inode(struct inode *inode) 1201int ocfs2_drop_inode(struct inode *inode)
1203{ 1202{
1204 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1203 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1204 int res;
1205 1205
1206 mlog_entry_void(); 1206 mlog_entry_void();
1207 1207
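
In 2.6.36 the VFS folded ->delete_inode and ->clear_inode into a single ->evict_inode, called once the inode has no users; ocfs2_evict_inode() above now makes the orphan/delete decision itself. Presumably the matching super_operations update lives in fs/ocfs2/super.c, outside this excerpt; the sketch below shows only the shape, and the other entries are pre-existing ocfs2 functions:

        /* assumed wiring in fs/ocfs2/super.c, not shown in this diff */
        static const struct super_operations ocfs2_sops = {
                .statfs         = ocfs2_statfs,
                .evict_inode    = ocfs2_evict_inode, /* replaces delete/clear */
                .drop_inode     = ocfs2_drop_inode,  /* now returns int */
                .sync_fs        = ocfs2_sync_fs,
        };
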
@@ -1209,11 +1209,12 @@ void ocfs2_drop_inode(struct inode *inode)
1209 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); 1209 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1210 1210
1211 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1211 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1212 generic_delete_inode(inode); 1212 res = 1;
1213 else 1213 else
1214 generic_drop_inode(inode); 1214 res = generic_drop_inode(inode);
1215 1215
1216 mlog_exit_void(); 1216 mlog_exit_void();
1217 return res;
1217} 1218}
1218 1219
1219/* 1220/*
@@ -1298,13 +1299,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1298 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 1299 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1299 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1300 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1300 1301
1301 status = ocfs2_journal_dirty(handle, bh); 1302 ocfs2_journal_dirty(handle, bh);
1302 if (status < 0)
1303 mlog_errno(status);
1304
1305 status = 0;
1306leave: 1303leave:
1307
1308 mlog_exit(status); 1304 mlog_exit(status);
1309 return status; 1305 return status;
1310} 1306}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 0b28e1921a39..6de5a869db30 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
70 /* Only valid if the inode is the dir. */ 70 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 71 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 72 u64 ip_last_used_group;
73
74 struct ocfs2_alloc_reservation ip_la_data_resv;
73}; 75};
74 76
75/* 77/*
@@ -121,9 +123,8 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
121 return &OCFS2_I(inode)->ip_metadata_cache; 123 return &OCFS2_I(inode)->ip_metadata_cache;
122} 124}
123 125
124void ocfs2_clear_inode(struct inode *inode); 126void ocfs2_evict_inode(struct inode *inode);
125void ocfs2_delete_inode(struct inode *inode); 127int ocfs2_drop_inode(struct inode *inode);
126void ocfs2_drop_inode(struct inode *inode);
127 128
128/* Flags for ocfs2_iget() */ 129/* Flags for ocfs2_iget() */
129#define OCFS2_FI_FLAG_SYSFILE 0x1 130#define OCFS2_FI_FLAG_SYSFILE 0x1
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a36..9b57c0350ff9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
402} 402}
403 403
404/* 404/*
405 * 'nblocks' is what you want to add to the current 405 * 'nblocks' is what you want to add to the current transaction.
406 * transaction. extend_trans will either extend the current handle by
407 * nblocks, or commit it and start a new one with nblocks credits.
408 * 406 *
409 * This might call jbd2_journal_restart() which will commit dirty buffers 407 * This might call jbd2_journal_restart() which will commit dirty buffers
410 * and then restart the transaction. Before calling 408 * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
422 */ 420 */
423int ocfs2_extend_trans(handle_t *handle, int nblocks) 421int ocfs2_extend_trans(handle_t *handle, int nblocks)
424{ 422{
425 int status; 423 int status, old_nblocks;
426 424
427 BUG_ON(!handle); 425 BUG_ON(!handle);
428 BUG_ON(!nblocks); 426 BUG_ON(nblocks < 0);
427
428 if (!nblocks)
429 return 0;
429 430
431 old_nblocks = handle->h_buffer_credits;
430 mlog_entry_void(); 432 mlog_entry_void();
431 433
432 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 434 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
445 mlog(0, 447 mlog(0,
446 "jbd2_journal_extend failed, trying " 448 "jbd2_journal_extend failed, trying "
447 "jbd2_journal_restart\n"); 449 "jbd2_journal_restart\n");
448 status = jbd2_journal_restart(handle, nblocks); 450 status = jbd2_journal_restart(handle,
451 old_nblocks + nblocks);
449 if (status < 0) { 452 if (status < 0) {
450 mlog_errno(status); 453 mlog_errno(status);
451 goto bail; 454 goto bail;
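
The functional change here is subtle: jbd2_journal_restart() starts a fresh transaction, so the handle must be re-provisioned with the credits it already held plus the new ones, hence the old_nblocks snapshot of handle->h_buffer_credits. Condensed from the hunk above:

        int status, old_nblocks = handle->h_buffer_credits;

        if (!nblocks)
                return 0;

        status = jbd2_journal_extend(handle, nblocks);
        if (status > 0)
                /* the running transaction is too full to extend in
                 * place: commit it and restart the handle with the
                 * combined credit count */
                status = jbd2_journal_restart(handle,
                                              old_nblocks + nblocks);
        return status;
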
@@ -469,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
469 return container_of(triggers, struct ocfs2_triggers, ot_triggers); 472 return container_of(triggers, struct ocfs2_triggers, ot_triggers);
470} 473}
471 474
472static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 475static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
473 struct buffer_head *bh, 476 struct buffer_head *bh,
474 void *data, size_t size) 477 void *data, size_t size)
475{ 478{
@@ -488,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
488 * Quota blocks have their own trigger because the struct ocfs2_block_check 491 * Quota blocks have their own trigger because the struct ocfs2_block_check
489 * offset depends on the blocksize. 492 * offset depends on the blocksize.
490 */ 493 */
491static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 494static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
492 struct buffer_head *bh, 495 struct buffer_head *bh,
493 void *data, size_t size) 496 void *data, size_t size)
494{ 497{
@@ -508,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
508 * Directory blocks also have their own trigger because the 511 * Directory blocks also have their own trigger because the
509 * struct ocfs2_block_check offset depends on the blocksize. 512 * struct ocfs2_block_check offset depends on the blocksize.
510 */ 513 */
511static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 514static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
512 struct buffer_head *bh, 515 struct buffer_head *bh,
513 void *data, size_t size) 516 void *data, size_t size)
514{ 517{
@@ -541,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
541 544
542static struct ocfs2_triggers di_triggers = { 545static struct ocfs2_triggers di_triggers = {
543 .ot_triggers = { 546 .ot_triggers = {
544 .t_commit = ocfs2_commit_trigger, 547 .t_frozen = ocfs2_frozen_trigger,
545 .t_abort = ocfs2_abort_trigger, 548 .t_abort = ocfs2_abort_trigger,
546 }, 549 },
547 .ot_offset = offsetof(struct ocfs2_dinode, i_check), 550 .ot_offset = offsetof(struct ocfs2_dinode, i_check),
@@ -549,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
549 552
550static struct ocfs2_triggers eb_triggers = { 553static struct ocfs2_triggers eb_triggers = {
551 .ot_triggers = { 554 .ot_triggers = {
552 .t_commit = ocfs2_commit_trigger, 555 .t_frozen = ocfs2_frozen_trigger,
553 .t_abort = ocfs2_abort_trigger, 556 .t_abort = ocfs2_abort_trigger,
554 }, 557 },
555 .ot_offset = offsetof(struct ocfs2_extent_block, h_check), 558 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
@@ -557,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
557 560
558static struct ocfs2_triggers rb_triggers = { 561static struct ocfs2_triggers rb_triggers = {
559 .ot_triggers = { 562 .ot_triggers = {
560 .t_commit = ocfs2_commit_trigger, 563 .t_frozen = ocfs2_frozen_trigger,
561 .t_abort = ocfs2_abort_trigger, 564 .t_abort = ocfs2_abort_trigger,
562 }, 565 },
563 .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), 566 .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
@@ -565,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
565 568
566static struct ocfs2_triggers gd_triggers = { 569static struct ocfs2_triggers gd_triggers = {
567 .ot_triggers = { 570 .ot_triggers = {
568 .t_commit = ocfs2_commit_trigger, 571 .t_frozen = ocfs2_frozen_trigger,
569 .t_abort = ocfs2_abort_trigger, 572 .t_abort = ocfs2_abort_trigger,
570 }, 573 },
571 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), 574 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
@@ -573,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
573 576
574static struct ocfs2_triggers db_triggers = { 577static struct ocfs2_triggers db_triggers = {
575 .ot_triggers = { 578 .ot_triggers = {
576 .t_commit = ocfs2_db_commit_trigger, 579 .t_frozen = ocfs2_db_frozen_trigger,
577 .t_abort = ocfs2_abort_trigger, 580 .t_abort = ocfs2_abort_trigger,
578 }, 581 },
579}; 582};
580 583
581static struct ocfs2_triggers xb_triggers = { 584static struct ocfs2_triggers xb_triggers = {
582 .ot_triggers = { 585 .ot_triggers = {
583 .t_commit = ocfs2_commit_trigger, 586 .t_frozen = ocfs2_frozen_trigger,
584 .t_abort = ocfs2_abort_trigger, 587 .t_abort = ocfs2_abort_trigger,
585 }, 588 },
586 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), 589 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -588,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
588 591
589static struct ocfs2_triggers dq_triggers = { 592static struct ocfs2_triggers dq_triggers = {
590 .ot_triggers = { 593 .ot_triggers = {
591 .t_commit = ocfs2_dq_commit_trigger, 594 .t_frozen = ocfs2_dq_frozen_trigger,
592 .t_abort = ocfs2_abort_trigger, 595 .t_abort = ocfs2_abort_trigger,
593 }, 596 },
594}; 597};
595 598
596static struct ocfs2_triggers dr_triggers = { 599static struct ocfs2_triggers dr_triggers = {
597 .ot_triggers = { 600 .ot_triggers = {
598 .t_commit = ocfs2_commit_trigger, 601 .t_frozen = ocfs2_frozen_trigger,
599 .t_abort = ocfs2_abort_trigger, 602 .t_abort = ocfs2_abort_trigger,
600 }, 603 },
601 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), 604 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -603,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
603 606
604static struct ocfs2_triggers dl_triggers = { 607static struct ocfs2_triggers dl_triggers = {
605 .ot_triggers = { 608 .ot_triggers = {
606 .t_commit = ocfs2_commit_trigger, 609 .t_frozen = ocfs2_frozen_trigger,
607 .t_abort = ocfs2_abort_trigger, 610 .t_abort = ocfs2_abort_trigger,
608 }, 611 },
609 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), 612 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
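
The s/t_commit/t_frozen/ rename across these trigger structures tracks a jbd2 change in 2.6.36: the callback now runs on the frozen copy of the buffer that jbd2 is about to write, so ocfs2's block checksum is computed over exactly the bytes that reach the journal. The trigger bodies keep the same shape; presumably each generic one reduces to something like:

        static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
                                         struct buffer_head *bh,
                                         void *data, size_t size)
        {
                struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);

                /* 'data' is the frozen copy, not bh->b_data: stamp the
                 * block check at this structure's offset within it */
                ocfs2_block_check_compute(data, size, data + ot->ot_offset);
        }
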
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
734 return __ocfs2_journal_access(handle, ci, bh, NULL, type); 737 return __ocfs2_journal_access(handle, ci, bh, NULL, type);
735} 738}
736 739
737int ocfs2_journal_dirty(handle_t *handle, 740void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
738 struct buffer_head *bh)
739{ 741{
740 int status; 742 int status;
741 743
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
743 (unsigned long long)bh->b_blocknr); 745 (unsigned long long)bh->b_blocknr);
744 746
745 status = jbd2_journal_dirty_metadata(handle, bh); 747 status = jbd2_journal_dirty_metadata(handle, bh);
746 if (status < 0) 748 BUG_ON(status);
747 mlog(ML_ERROR, "Could not dirty metadata buffer. "
748 "(bh->b_blocknr=%llu)\n",
749 (unsigned long long)bh->b_blocknr);
750 749
751 mlog_exit(status); 750 mlog_exit_void();
752 return status;
753} 751}
754 752
755#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 753#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
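
With the jbd2 conversion long complete, jbd2_journal_dirty_metadata() can only fail when the buffer was never passed through a journal-access call on this handle, i.e. a programming error, so ocfs2_journal_dirty() now BUG()s instead of returning a status. That is what lets all the callers earlier in this patch drop their error paths. The calling convention (also documented in fs/ocfs2/journal.h below) becomes:

        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);        /* access can still fail */
                goto out;
        }

        /* ... modify the struct ocfs2_dinode in bh ... */

        ocfs2_journal_dirty(handle, bh);        /* void: cannot fail */
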
@@ -762,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
762 if (osb->osb_commit_interval) 760 if (osb->osb_commit_interval)
763 commit_interval = osb->osb_commit_interval; 761 commit_interval = osb->osb_commit_interval;
764 762
765 spin_lock(&journal->j_state_lock); 763 write_lock(&journal->j_state_lock);
766 journal->j_commit_interval = commit_interval; 764 journal->j_commit_interval = commit_interval;
767 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 765 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
768 journal->j_flags |= JBD2_BARRIER; 766 journal->j_flags |= JBD2_BARRIER;
769 else 767 else
770 journal->j_flags &= ~JBD2_BARRIER; 768 journal->j_flags &= ~JBD2_BARRIER;
771 spin_unlock(&journal->j_state_lock); 769 write_unlock(&journal->j_state_lock);
772} 770}
773 771
774int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) 772int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
@@ -1938,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
1938 mutex_lock(&os->os_lock); 1936 mutex_lock(&os->os_lock);
1939 ocfs2_queue_orphan_scan(osb); 1937 ocfs2_queue_orphan_scan(osb);
1940 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) 1938 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1941 schedule_delayed_work(&os->os_orphan_scan_work, 1939 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
1942 ocfs2_orphan_scan_timeout()); 1940 ocfs2_orphan_scan_timeout());
1943 mutex_unlock(&os->os_lock); 1941 mutex_unlock(&os->os_lock);
1944} 1942}
@@ -1978,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
1978 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 1976 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
1979 else { 1977 else {
1980 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); 1978 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
1981 schedule_delayed_work(&os->os_orphan_scan_work, 1979 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
1982 ocfs2_orphan_scan_timeout()); 1980 ocfs2_orphan_scan_timeout());
1983 } 1981 }
1984} 1982}
1985 1983
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
325 * <modify the bh> 325 * <modify the bh>
326 * ocfs2_journal_dirty(handle, bh); 326 * ocfs2_journal_dirty(handle, bh);
327 */ 327 */
328int ocfs2_journal_dirty(handle_t *handle, 328void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
329 struct buffer_head *bh);
330 329
331/* 330/*
332 * Credit Macros: 331 * Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
562 return blocks; 561 return blocks;
563} 562}
564 563
564/*
565 * Allocating a discontiguous block group requires the credits from
566 * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
567 * the group descriptor's extent list. The caller already has started
568 * the transaction with ocfs2_calc_group_alloc_credits(). They extend
569 * it with these credits.
570 */
571static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
572{
573 return ocfs2_extent_recs_per_gd(sb);
574}
575
565static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, 576static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
566 unsigned int clusters_to_del, 577 unsigned int clusters_to_del,
567 struct ocfs2_dinode *fe, 578 struct ocfs2_dinode *fe,
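
A usage sketch for the new helper: a caller that started its transaction from ocfs2_calc_group_alloc_credits() and then finds the block group must be allocated discontiguously extends the handle before filling the descriptor's extent list. The real caller lives in suballoc.c, outside this excerpt, so the shape below, including the alloc_is_discontig flag, is assumed:

        handle = ocfs2_start_trans(osb,
                        ocfs2_calc_group_alloc_credits(osb->sb,
                                                       osb->bitmap_cpg));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        if (alloc_is_discontig) {       /* hypothetical flag */
                status = ocfs2_extend_trans(handle,
                                ocfs2_calc_bg_discontig_credits(osb->sb));
                if (status < 0)
                        goto out_commit;
        }
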
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index c983715d8d8c..ec6adbf8f551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
52 52
53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
54 struct ocfs2_dinode *alloc, 54 struct ocfs2_dinode *alloc,
55 u32 numbits); 55 u32 *numbits,
56 struct ocfs2_alloc_reservation *resv);
56 57
57static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); 58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
58 59
@@ -74,6 +75,151 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
75 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
76 77
78/*
 79 * ocfs2_la_default_mb() - determine a default size, in megabytes, of
80 * the local alloc.
81 *
82 * Generally, we'd like to pick as large a local alloc as
83 * possible. Performance on large workloads tends to scale
84 * proportionally to la size. In addition to that, the reservations
85 * code functions more efficiently as it can reserve more windows for
86 * write.
87 *
88 * Some things work against us when trying to choose a large local alloc:
89 *
90 * - We need to ensure our sizing is picked to leave enough space in
91 * group descriptors for other allocations (such as block groups,
92 * etc). Picking default sizes which are a multiple of 4 could help
 93 * - block groups are allocated in 2MB and 4MB chunks.
94 *
95 * - Likewise, we don't want to starve other nodes of bits on small
96 * file systems. This can easily be taken care of by limiting our
97 * default to a reasonable size (256M) on larger cluster sizes.
98 *
 99 * - Some file systems can't support very large sizes - 4K and 8K cluster
 100 * sizes in particular are limited to less than 128 and 256 megabytes.
101 *
102 * The following reference table shows group descriptor and local
103 * alloc maximums at various cluster sizes (4k blocksize)
104 *
105 * csize: 4K group: 126M la: 121M
106 * csize: 8K group: 252M la: 243M
107 * csize: 16K group: 504M la: 486M
108 * csize: 32K group: 1008M la: 972M
109 * csize: 64K group: 2016M la: 1944M
110 * csize: 128K group: 4032M la: 3888M
111 * csize: 256K group: 8064M la: 7776M
112 * csize: 512K group: 16128M la: 15552M
113 * csize: 1024K group: 32256M la: 31104M
114 */
115#define OCFS2_LA_MAX_DEFAULT_MB 256
116#define OCFS2_LA_OLD_DEFAULT 8
117unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
118{
119 unsigned int la_mb;
120 unsigned int gd_mb;
121 unsigned int la_max_mb;
122 unsigned int megs_per_slot;
123 struct super_block *sb = osb->sb;
124
125 gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
126 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
127
128 /*
 129 * This takes care of file systems with very small group
130 * descriptors - 512 byte blocksize at cluster sizes lower
131 * than 16K and also 1k blocksize with 4k cluster size.
132 */
133 if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
134 || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
135 return OCFS2_LA_OLD_DEFAULT;
136
137 /*
138 * Leave enough room for some block groups and make the final
139 * value we work from a multiple of 4.
140 */
141 gd_mb -= 16;
 142 gd_mb &= 0xFFFFFFFC;
143
144 la_mb = gd_mb;
145
146 /*
147 * Keep window sizes down to a reasonable default
148 */
149 if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
150 /*
151 * Some clustersize / blocksize combinations will have
152 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
153 * default size, but get poor distribution when
154 * limited to exactly 256 megabytes.
155 *
156 * As an example, 16K clustersize at 4K blocksize
157 * gives us a cluster group size of 504M. Paring the
158 * local alloc size down to 256 however, would give us
159 * only one window and around 200MB left in the
160 * cluster group. Instead, find the first size below
161 * 256 which would give us an even distribution.
162 *
163 * Larger cluster group sizes actually work out pretty
164 * well when pared to 256, so we don't have to do this
165 * for any group that fits more than two
166 * OCFS2_LA_MAX_DEFAULT_MB windows.
167 */
168 if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
 169 la_mb = OCFS2_LA_MAX_DEFAULT_MB;
 170 else {
 171 unsigned int gd_mult = gd_mb;
 172
 173 while (gd_mult > OCFS2_LA_MAX_DEFAULT_MB)
174 gd_mult = gd_mult >> 1;
175
176 la_mb = gd_mult;
177 }
178 }
179
180 megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
181 megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
182 /* Too many nodes, too few disk clusters. */
183 if (megs_per_slot < la_mb)
184 la_mb = megs_per_slot;
185
186 /* We can't store more bits than we can in a block. */
187 la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
188 ocfs2_local_alloc_size(sb) * 8);
189 if (la_mb > la_max_mb)
190 la_mb = la_max_mb;
191
192 return la_mb;
193}
194
195void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
196{
197 struct super_block *sb = osb->sb;
198 unsigned int la_default_mb = ocfs2_la_default_mb(osb);
199 unsigned int la_max_mb;
200
201 la_max_mb = ocfs2_clusters_to_megabytes(sb,
202 ocfs2_local_alloc_size(sb) * 8);
203
204 mlog(0, "requested: %dM, max: %uM, default: %uM\n",
205 requested_mb, la_max_mb, la_default_mb);
206
207 if (requested_mb == -1) {
208 /* No user request - use defaults */
209 osb->local_alloc_default_bits =
210 ocfs2_megabytes_to_clusters(sb, la_default_mb);
211 } else if (requested_mb > la_max_mb) {
212 /* Request is too big, we give the maximum available */
213 osb->local_alloc_default_bits =
214 ocfs2_megabytes_to_clusters(sb, la_max_mb);
215 } else {
216 osb->local_alloc_default_bits =
217 ocfs2_megabytes_to_clusters(sb, requested_mb);
218 }
219
220 osb->local_alloc_bits = osb->local_alloc_default_bits;
221}
222
77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 223static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
78{ 224{
79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 225 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
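For reference, the "csize / group / la" table in the comment above can be reproduced with a small user-space sketch. The two header sizes below (a 64-byte group-descriptor header before bg_bitmap, and a 208-byte dinode offset before the local alloc bitmap) are assumptions chosen to match the table, not values read from the on-disk headers:

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;                  /* 4K blocks, as in the table */
	unsigned int group_bits = 8 * (blocksize - 64); /* clusters per cluster group */
	unsigned int la_bits = 8 * (blocksize - 208);   /* max bits a local alloc can hold */
	unsigned int bits;                              /* log2(cluster size) */

	for (bits = 12; bits <= 20; bits++)
		printf("csize: %4uK group: %5uM la: %5uM\n",
		       1u << (bits - 10),
		       group_bits >> (20 - bits),  /* clusters -> megabytes */
		       la_bits >> (20 - bits));
	return 0;
}

At 4K clusters this prints "group: 126M la: 121M", and the remaining rows follow by doubling, matching the table.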
@@ -156,7 +302,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
156 osb->local_alloc_bits, (osb->bitmap_cpg - 1)); 302 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
157 osb->local_alloc_bits = 303 osb->local_alloc_bits =
158 ocfs2_megabytes_to_clusters(osb->sb, 304 ocfs2_megabytes_to_clusters(osb->sb,
159 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); 305 ocfs2_la_default_mb(osb));
160 } 306 }
161 307
162 /* read the alloc off disk */ 308 /* read the alloc off disk */
@@ -262,6 +408,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
262 408
263 osb->local_alloc_state = OCFS2_LA_DISABLED; 409 osb->local_alloc_state = OCFS2_LA_DISABLED;
264 410
411 ocfs2_resmap_uninit(&osb->osb_la_resmap);
412
265 main_bm_inode = ocfs2_get_system_file_inode(osb, 413 main_bm_inode = ocfs2_get_system_file_inode(osb,
266 GLOBAL_BITMAP_SYSTEM_INODE, 414 GLOBAL_BITMAP_SYSTEM_INODE,
267 OCFS2_INVALID_SLOT); 415 OCFS2_INVALID_SLOT);
@@ -305,12 +453,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
305 } 453 }
306 454
307 ocfs2_clear_local_alloc(alloc); 455 ocfs2_clear_local_alloc(alloc);
308 456 ocfs2_journal_dirty(handle, bh);
309 status = ocfs2_journal_dirty(handle, bh);
310 if (status < 0) {
311 mlog_errno(status);
312 goto out_commit;
313 }
314 457
315 brelse(bh); 458 brelse(bh);
316 osb->local_alloc_bh = NULL; 459 osb->local_alloc_bh = NULL;
@@ -481,46 +624,6 @@ out:
481 return status; 624 return status;
482} 625}
483 626
484/* Check to see if the local alloc window is within ac->ac_max_block */
485static int ocfs2_local_alloc_in_range(struct inode *inode,
486 struct ocfs2_alloc_context *ac,
487 u32 bits_wanted)
488{
489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
490 struct ocfs2_dinode *alloc;
491 struct ocfs2_local_alloc *la;
492 int start;
493 u64 block_off;
494
495 if (!ac->ac_max_block)
496 return 1;
497
498 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
499 la = OCFS2_LOCAL_ALLOC(alloc);
500
501 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
502 if (start == -1) {
503 mlog_errno(-ENOSPC);
504 return 0;
505 }
506
507 /*
508 * Converting (bm_off + start + bits_wanted) to blocks gives us
509 * the blkno just past our actual allocation. This is perfect
510 * to compare with ac_max_block.
511 */
512 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
513 le32_to_cpu(la->la_bm_off) +
514 start + bits_wanted);
515 mlog(0, "Checking %llu against %llu\n",
516 (unsigned long long)block_off,
517 (unsigned long long)ac->ac_max_block);
518 if (block_off > ac->ac_max_block)
519 return 0;
520
521 return 1;
522}
523
524/* 627/*
525 * make sure we've got at least bits_wanted contiguous bits in the 628 * make sure we've got at least bits_wanted contiguous bits in the
526 * local alloc. You lose them when you drop i_mutex. 629 * local alloc. You lose them when you drop i_mutex.
@@ -613,17 +716,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
613 mlog(0, "Calling in_range for max block %llu\n", 716 mlog(0, "Calling in_range for max block %llu\n",
614 (unsigned long long)ac->ac_max_block); 717 (unsigned long long)ac->ac_max_block);
615 718
616 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
617 bits_wanted)) {
618 /*
619 * The window is outside ac->ac_max_block.
620 * This errno tells the caller to keep localalloc enabled
621 * but to get the allocation from the main bitmap.
622 */
623 status = -EFBIG;
624 goto bail;
625 }
626
627 ac->ac_inode = local_alloc_inode; 719 ac->ac_inode = local_alloc_inode;
628 /* We should never use localalloc from another slot */ 720 /* We should never use localalloc from another slot */
629 ac->ac_alloc_slot = osb->slot_num; 721 ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +756,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
664 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 756 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
665 la = OCFS2_LOCAL_ALLOC(alloc); 757 la = OCFS2_LOCAL_ALLOC(alloc);
666 758
667 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); 759 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
760 ac->ac_resv);
668 if (start == -1) { 761 if (start == -1) {
669 /* TODO: Shouldn't we just BUG here? */ 762 /* TODO: Shouldn't we just BUG here? */
670 status = -ENOSPC; 763 status = -ENOSPC;
@@ -674,8 +767,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
674 767
675 bitmap = la->la_bitmap; 768 bitmap = la->la_bitmap;
676 *bit_off = le32_to_cpu(la->la_bm_off) + start; 769 *bit_off = le32_to_cpu(la->la_bm_off) + start;
677 /* local alloc is always contiguous by nature -- we never
678 * delete bits from it! */
679 *num_bits = bits_wanted; 770 *num_bits = bits_wanted;
680 771
681 status = ocfs2_journal_access_di(handle, 772 status = ocfs2_journal_access_di(handle,
@@ -687,18 +778,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
687 goto bail; 778 goto bail;
688 } 779 }
689 780
781 ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
782 bits_wanted);
783
690 while(bits_wanted--) 784 while(bits_wanted--)
691 ocfs2_set_bit(start++, bitmap); 785 ocfs2_set_bit(start++, bitmap);
692 786
693 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); 787 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
788 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
694 789
695 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
696 if (status < 0) {
697 mlog_errno(status);
698 goto bail;
699 }
700
701 status = 0;
702bail: 790bail:
703 mlog_exit(status); 791 mlog_exit(status);
704 return status; 792 return status;
@@ -722,13 +810,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
722} 810}
723 811
724static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 812static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
725 struct ocfs2_dinode *alloc, 813 struct ocfs2_dinode *alloc,
726 u32 numbits) 814 u32 *numbits,
815 struct ocfs2_alloc_reservation *resv)
727{ 816{
728 int numfound, bitoff, left, startoff, lastzero; 817 int numfound, bitoff, left, startoff, lastzero;
818 int local_resv = 0;
819 struct ocfs2_alloc_reservation r;
729 void *bitmap = NULL; 820 void *bitmap = NULL;
821 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
730 822
731 mlog_entry("(numbits wanted = %u)\n", numbits); 823 mlog_entry("(numbits wanted = %u)\n", *numbits);
732 824
733 if (!alloc->id1.bitmap1.i_total) { 825 if (!alloc->id1.bitmap1.i_total) {
734 mlog(0, "No bits in my window!\n"); 826 mlog(0, "No bits in my window!\n");
@@ -736,6 +828,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
736 goto bail; 828 goto bail;
737 } 829 }
738 830
831 if (!resv) {
832 local_resv = 1;
833 ocfs2_resv_init_once(&r);
834 ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
835 resv = &r;
836 }
837
838 numfound = *numbits;
839 if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
840 if (numfound < *numbits)
841 *numbits = numfound;
842 goto bail;
843 }
844
845 /*
846 * Code error. While reservations are enabled, local
847 * allocation should _always_ go through them.
848 */
849 BUG_ON(osb->osb_resv_level != 0);
850
851 /*
852 * Reservations are disabled. Handle this the old way.
853 */
854
739 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; 855 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
740 856
741 numfound = bitoff = startoff = 0; 857 numfound = bitoff = startoff = 0;
@@ -761,7 +877,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
761 startoff = bitoff+1; 877 startoff = bitoff+1;
762 } 878 }
763 /* we got everything we needed */ 879 /* we got everything we needed */
764 if (numfound == numbits) { 880 if (numfound == *numbits) {
765 /* mlog(0, "Found it all!\n"); */ 881 /* mlog(0, "Found it all!\n"); */
766 break; 882 break;
767 } 883 }
@@ -770,12 +886,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
770 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 886 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
771 numfound); 887 numfound);
772 888
773 if (numfound == numbits) 889 if (numfound == *numbits)
774 bitoff = startoff - numfound; 890 bitoff = startoff - numfound;
775 else 891 else
776 bitoff = -1; 892 bitoff = -1;
777 893
778bail: 894bail:
895 if (local_resv)
896 ocfs2_resv_discard(resmap, resv);
897
779 mlog_exit(bitoff); 898 mlog_exit(bitoff);
780 return bitoff; 899 return bitoff;
781} 900}
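The NULL-resv fallback added above follows a common pattern: build a throwaway reservation so that even one-off callers go through the reservation map, then tear it down before returning. A minimal user-space sketch of the shape, with assumed helper names modelled loosely on ocfs2_resv_init_once() and ocfs2_resv_discard():

#include <stdio.h>

struct resv { int tmp; int active; };

/* assumed stand-ins for the real reservation helpers */
static void resv_init(struct resv *r) { r->tmp = 0; r->active = 0; }
static void resv_set_tmp(struct resv *r) { r->tmp = 1; }
static void resv_discard(struct resv *r) { r->active = 0; }

/* Callers with no long-lived reservation pass NULL; a temporary one is
 * built so the search still goes through the reservation code, and it
 * must not outlive the call. */
static int find_bits(struct resv *resv)
{
	struct resv r;
	int local_resv = 0;

	if (!resv) {
		local_resv = 1;
		resv_init(&r);
		resv_set_tmp(&r);
		resv = &r;
	}

	resv->active = 1;	/* stand-in for the actual bitmap search */

	if (local_resv)
		resv_discard(resv);
	return 0;
}

int main(void)
{
	find_bits(NULL);	/* exercises the temporary-reservation path */
	puts("ok");
	return 0;
}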
@@ -1049,7 +1168,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1049 /* we used the generic suballoc reserve function, but we set 1168 /* we used the generic suballoc reserve function, but we set
1050 * everything up nicely, so there's no reason why we can't use 1169 * everything up nicely, so there's no reason why we can't use
1051 * the more specific cluster api to claim bits. */ 1170 * the more specific cluster api to claim bits. */
1052 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, 1171 status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
1053 &cluster_off, &cluster_count); 1172 &cluster_off, &cluster_count);
1054 if (status == -ENOSPC) { 1173 if (status == -ENOSPC) {
1055retry_enospc: 1174retry_enospc:
@@ -1063,7 +1182,7 @@ retry_enospc:
1063 goto bail; 1182 goto bail;
1064 1183
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits; 1184 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1066 status = ocfs2_claim_clusters(osb, handle, ac, 1185 status = ocfs2_claim_clusters(handle, ac,
1067 osb->local_alloc_bits, 1186 osb->local_alloc_bits,
1068 &cluster_off, 1187 &cluster_off,
1069 &cluster_count); 1188 &cluster_count);
@@ -1098,6 +1217,9 @@ retry_enospc:
1098 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, 1217 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
1099 le16_to_cpu(la->la_size)); 1218 le16_to_cpu(la->la_size));
1100 1219
1220 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
1221 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
1222
1101 mlog(0, "New window allocated:\n"); 1223 mlog(0, "New window allocated:\n");
1102 mlog(0, "window la_bm_off = %u\n", 1224 mlog(0, "window la_bm_off = %u\n",
1103 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); 1225 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1169,12 +1291,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1169 } 1291 }
1170 1292
1171 ocfs2_clear_local_alloc(alloc); 1293 ocfs2_clear_local_alloc(alloc);
1172 1294 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1173 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1174 if (status < 0) {
1175 mlog_errno(status);
1176 goto bail;
1177 }
1178 1295
1179 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, 1296 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
1180 main_bm_inode, main_bm_bh); 1297 main_bm_inode, main_bm_bh);
@@ -1192,7 +1309,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1192 1309
1193 atomic_inc(&osb->alloc_stats.moves); 1310 atomic_inc(&osb->alloc_stats.moves);
1194 1311
1195 status = 0;
1196bail: 1312bail:
1197 if (handle) 1313 if (handle)
1198 ocfs2_commit_trans(osb, handle); 1314 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30 30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); 31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32 32
33void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
34unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
35
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, 36int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num, 37 int node_num,
35 struct ocfs2_dinode **alloc_copy); 38 struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7898bd3a99f5..4c18f4ad93b4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -41,44 +41,20 @@
41#include "file.h" 41#include "file.h"
42#include "inode.h" 42#include "inode.h"
43#include "mmap.h" 43#include "mmap.h"
44#include "super.h"
44 45
45static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
46{
47 /* The best way to deal with signals in the vm path is
48 * to block them upfront, rather than allowing the
49 * locking paths to return -ERESTARTSYS. */
50 sigfillset(blocked);
51
52 /* We should technically never get a bad return value
53 * from sigprocmask */
54 return sigprocmask(SIG_BLOCK, blocked, oldset);
55}
56
57static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
58{
59 return sigprocmask(SIG_SETMASK, oldset, NULL);
60}
61 46
62static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) 47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
63{ 48{
64 sigset_t blocked, oldset; 49 sigset_t oldset;
65 int error, ret; 50 int ret;
66 51
67 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); 52 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
68 53
69 error = ocfs2_vm_op_block_sigs(&blocked, &oldset); 54 ocfs2_block_signals(&oldset);
70 if (error < 0) {
71 mlog_errno(error);
72 ret = VM_FAULT_SIGBUS;
73 goto out;
74 }
75
76 ret = filemap_fault(area, vmf); 55 ret = filemap_fault(area, vmf);
56 ocfs2_unblock_signals(&oldset);
77 57
78 error = ocfs2_vm_op_unblock_sigs(&oldset);
79 if (error < 0)
80 mlog_errno(error);
81out:
82 mlog_exit_ptr(vmf->page); 58 mlog_exit_ptr(vmf->page);
83 return ret; 59 return ret;
84} 60}
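The deleted open-coded helpers show what the new ocfs2_block_signals()/ocfs2_unblock_signals() pair is expected to do: block every signal up front so the cluster-locking paths never see -ERESTARTSYS, then restore the old mask. A user-space analogue built on the POSIX calls the old code wrapped (the kernel versions use in-kernel primitives, so this is only an illustration):

#include <signal.h>
#include <stdio.h>

static void block_all_signals(sigset_t *oldset)
{
	sigset_t blocked;

	sigfillset(&blocked);                       /* block everything */
	sigprocmask(SIG_BLOCK, &blocked, oldset);   /* save the old mask */
}

static void restore_signals(const sigset_t *oldset)
{
	sigprocmask(SIG_SETMASK, oldset, NULL);     /* put it back exactly */
}

int main(void)
{
	sigset_t oldset;

	block_all_signals(&oldset);
	/* ... work that must not be interrupted by signal delivery ... */
	restore_signals(&oldset);
	puts("signals restored");
	return 0;
}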
@@ -98,9 +74,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
98 /* 74 /*
99 * Another node might have truncated while we were waiting on 75 * Another node might have truncated while we were waiting on
100 * cluster locks. 76 * cluster locks.
77 * We don't check size == 0 before the shift. This is borrowed
78 * from do_generic_file_read.
101 */ 79 */
102 last_index = size >> PAGE_CACHE_SHIFT; 80 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
103 if (page->index > last_index) { 81 if (unlikely(!size || page->index > last_index)) {
104 ret = -EINVAL; 82 ret = -EINVAL;
105 goto out; 83 goto out;
106 } 84 }
@@ -131,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
131 * because the "write" would invalidate their data. 109 * because the "write" would invalidate their data.
132 */ 110 */
133 if (page->index == last_index) 111 if (page->index == last_index)
134 len = size & ~PAGE_CACHE_MASK; 112 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
135 113
136 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 114 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
137 &fsdata, di_bh, page); 115 &fsdata, di_bh, page);
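The last_index and len changes are easiest to see with a worked example at size == PAGE_SIZE, where the old expressions let a page entirely beyond EOF through the bounds check and computed a zero-length write for the last page:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long size = PAGE_SIZE;	/* i_size of exactly one page */

	/* old: last_index = 1, so page index 1 (fully past EOF) passes the
	 * "page->index > last_index" check, and the last-page trim is 0 */
	printf("old: last_index=%lu len_on_last=%lu\n",
	       size >> PAGE_SHIFT, size & ~PAGE_MASK);

	/* new: the last valid page is 0 and it gets a full-page length */
	printf("new: last_index=%lu len_on_last=%lu\n",
	       (size - 1) >> PAGE_SHIFT, ((size - 1) & ~PAGE_MASK) + 1);
	return 0;
}

This prints "old: last_index=1 len_on_last=0" versus "new: last_index=0 len_on_last=4096"; the unlikely(!size || ...) test covers the size == 0 case that the (size - 1) shift would otherwise mishandle.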
@@ -158,14 +136,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158 struct page *page = vmf->page; 136 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 137 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 138 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 139 sigset_t oldset;
162 int ret, ret2; 140 int ret;
163 141
164 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); 142 ocfs2_block_signals(&oldset);
165 if (ret < 0) {
166 mlog_errno(ret);
167 return ret;
168 }
169 143
170 /* 144 /*
171 * The cluster locks taken will block a truncate from another 145 * The cluster locks taken will block a truncate from another
@@ -193,9 +167,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
193 ocfs2_inode_unlock(inode, 1); 167 ocfs2_inode_unlock(inode, 1);
194 168
195out: 169out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 170 ocfs2_unblock_signals(&oldset);
197 if (ret2 < 0)
198 mlog_errno(ret2);
199 if (ret) 171 if (ret)
200 ret = VM_FAULT_SIGBUS; 172 ret = VM_FAULT_SIGBUS;
201 return ret; 173 return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4cbb18f26c5f..a00dda2e4f16 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
204 inode->i_nlink = 2; 204 inode->i_nlink = 2;
205 else 205 else
206 inode->i_nlink = 1; 206 inode->i_nlink = 1;
207 inode->i_uid = current_fsuid(); 207 inode_init_owner(inode, dir, mode);
208 if (dir->i_mode & S_ISGID) {
209 inode->i_gid = dir->i_gid;
210 if (S_ISDIR(mode))
211 mode |= S_ISGID;
212 } else
213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode;
215 dquot_initialize(inode); 208 dquot_initialize(inode);
216 return inode; 209 return inode;
217} 210}
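inode_init_owner() centralizes exactly the ownership logic deleted above: uid from the caller, gid inherited from a setgid parent directory (with directories also inheriting the setgid bit), otherwise gid from the caller. A standalone sketch of that logic with simplified types (the octal constants stand in for S_ISGID and S_IFDIR):

#include <stdio.h>

#define MODE_ISGID 02000	/* S_ISGID */
#define MODE_ISDIR 040000	/* S_IFDIR */

static void init_owner(unsigned *uid, unsigned *gid, unsigned *out_mode,
		       unsigned dir_mode, unsigned dir_gid,
		       unsigned fsuid, unsigned fsgid, unsigned mode)
{
	*uid = fsuid;
	if (dir_mode & MODE_ISGID) {
		*gid = dir_gid;			/* inherit group from parent */
		if (mode & MODE_ISDIR)
			mode |= MODE_ISGID;	/* subdirs inherit setgid too */
	} else {
		*gid = fsgid;
	}
	*out_mode = mode;
}

int main(void)
{
	unsigned uid, gid, mode;

	/* setgid parent (mode 02775, gid 100); new dir ends up 042755, gid 100 */
	init_owner(&uid, &gid, &mode, 02775, 100, 1000, 1000, 040755);
	printf("uid=%u gid=%u mode=%o\n", uid, gid, mode);
	return 0;
}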
@@ -239,6 +232,8 @@ static int ocfs2_mknod(struct inode *dir,
239 }; 232 };
240 int did_quota_inode = 0; 233 int did_quota_inode = 0;
241 struct ocfs2_dir_lookup_result lookup = { NULL, }; 234 struct ocfs2_dir_lookup_result lookup = { NULL, };
235 sigset_t oldset;
236 int did_block_signals = 0;
242 237
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 238 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 239 (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +345,10 @@ static int ocfs2_mknod(struct inode *dir,
350 goto leave; 345 goto leave;
351 } 346 }
352 347
348 /* Starting to change things, restart is no longer possible. */
349 ocfs2_block_signals(&oldset);
350 did_block_signals = 1;
351
353 status = dquot_alloc_inode(inode); 352 status = dquot_alloc_inode(inode);
354 if (status) 353 if (status)
355 goto leave; 354 goto leave;
@@ -384,11 +383,7 @@ static int ocfs2_mknod(struct inode *dir,
384 goto leave; 383 goto leave;
385 } 384 }
386 ocfs2_add_links_count(dirfe, 1); 385 ocfs2_add_links_count(dirfe, 1);
387 status = ocfs2_journal_dirty(handle, parent_fe_bh); 386 ocfs2_journal_dirty(handle, parent_fe_bh);
388 if (status < 0) {
389 mlog_errno(status);
390 goto leave;
391 }
392 inc_nlink(dir); 387 inc_nlink(dir);
393 } 388 }
394 389
@@ -439,6 +434,8 @@ leave:
439 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
440 435
441 ocfs2_inode_unlock(dir, 1); 436 ocfs2_inode_unlock(dir, 1);
437 if (did_block_signals)
438 ocfs2_unblock_signals(&oldset);
442 439
443 if (status == -ENOSPC) 440 if (status == -ENOSPC)
444 mlog(0, "Disk is full\n"); 441 mlog(0, "Disk is full\n");
@@ -475,31 +472,23 @@ leave:
475 return status; 472 return status;
476} 473}
477 474
478static int ocfs2_mknod_locked(struct ocfs2_super *osb, 475static int __ocfs2_mknod_locked(struct inode *dir,
479 struct inode *dir, 476 struct inode *inode,
480 struct inode *inode, 477 dev_t dev,
481 dev_t dev, 478 struct buffer_head **new_fe_bh,
482 struct buffer_head **new_fe_bh, 479 struct buffer_head *parent_fe_bh,
483 struct buffer_head *parent_fe_bh, 480 handle_t *handle,
484 handle_t *handle, 481 struct ocfs2_alloc_context *inode_ac,
485 struct ocfs2_alloc_context *inode_ac) 482 u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
486{ 483{
487 int status = 0; 484 int status = 0;
485 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
488 struct ocfs2_dinode *fe = NULL; 486 struct ocfs2_dinode *fe = NULL;
489 struct ocfs2_extent_list *fel; 487 struct ocfs2_extent_list *fel;
490 u64 fe_blkno = 0;
491 u16 suballoc_bit;
492 u16 feat; 488 u16 feat;
493 489
494 *new_fe_bh = NULL; 490 *new_fe_bh = NULL;
495 491
496 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
497 inode_ac, &suballoc_bit, &fe_blkno);
498 if (status < 0) {
499 mlog_errno(status);
500 goto leave;
501 }
502
503 /* populate as many fields early on as possible - many of 492 /* populate as many fields early on as possible - many of
504 * these are used by the support functions here and in 493 * these are used by the support functions here and in
505 * callers. */ 494 * callers. */
@@ -531,6 +520,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
531 fe->i_generation = cpu_to_le32(inode->i_generation); 520 fe->i_generation = cpu_to_le32(inode->i_generation);
532 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 521 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
533 fe->i_blkno = cpu_to_le64(fe_blkno); 522 fe->i_blkno = cpu_to_le64(fe_blkno);
523 fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
534 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 524 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
535 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 525 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
536 fe->i_uid = cpu_to_le32(inode->i_uid); 526 fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -567,11 +557,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
567 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 557 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
568 } 558 }
569 559
570 status = ocfs2_journal_dirty(handle, *new_fe_bh); 560 ocfs2_journal_dirty(handle, *new_fe_bh);
571 if (status < 0) {
572 mlog_errno(status);
573 goto leave;
574 }
575 561
576 ocfs2_populate_inode(inode, fe, 1); 562 ocfs2_populate_inode(inode, fe, 1);
577 ocfs2_ci_set_new(osb, INODE_CACHE(inode)); 563 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -596,6 +582,34 @@ leave:
596 return status; 582 return status;
597} 583}
598 584
585static int ocfs2_mknod_locked(struct ocfs2_super *osb,
586 struct inode *dir,
587 struct inode *inode,
588 dev_t dev,
589 struct buffer_head **new_fe_bh,
590 struct buffer_head *parent_fe_bh,
591 handle_t *handle,
592 struct ocfs2_alloc_context *inode_ac)
593{
594 int status = 0;
595 u64 suballoc_loc, fe_blkno = 0;
596 u16 suballoc_bit;
597
598 *new_fe_bh = NULL;
599
600 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
601 inode_ac, &suballoc_loc,
602 &suballoc_bit, &fe_blkno);
603 if (status < 0) {
604 mlog_errno(status);
605 return status;
606 }
607
608 return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
609 parent_fe_bh, handle, inode_ac,
610 fe_blkno, suballoc_loc, suballoc_bit);
611}
612
599static int ocfs2_mkdir(struct inode *dir, 613static int ocfs2_mkdir(struct inode *dir,
600 struct dentry *dentry, 614 struct dentry *dentry,
601 int mode) 615 int mode)
@@ -637,6 +651,7 @@ static int ocfs2_link(struct dentry *old_dentry,
637 struct ocfs2_dinode *fe = NULL; 651 struct ocfs2_dinode *fe = NULL;
638 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 652 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
639 struct ocfs2_dir_lookup_result lookup = { NULL, }; 653 struct ocfs2_dir_lookup_result lookup = { NULL, };
654 sigset_t oldset;
640 655
641 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 656 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
642 old_dentry->d_name.len, old_dentry->d_name.name, 657 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -693,6 +708,9 @@ static int ocfs2_link(struct dentry *old_dentry,
693 goto out_unlock_inode; 708 goto out_unlock_inode;
694 } 709 }
695 710
711 /* Starting to change things, restart is no longer possible. */
712 ocfs2_block_signals(&oldset);
713
696 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 714 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
697 OCFS2_JOURNAL_ACCESS_WRITE); 715 OCFS2_JOURNAL_ACCESS_WRITE);
698 if (err < 0) { 716 if (err < 0) {
@@ -705,14 +723,7 @@ static int ocfs2_link(struct dentry *old_dentry,
705 ocfs2_set_links_count(fe, inode->i_nlink); 723 ocfs2_set_links_count(fe, inode->i_nlink);
706 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 724 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
707 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 725 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
708 726 ocfs2_journal_dirty(handle, fe_bh);
709 err = ocfs2_journal_dirty(handle, fe_bh);
710 if (err < 0) {
711 ocfs2_add_links_count(fe, -1);
712 drop_nlink(inode);
713 mlog_errno(err);
714 goto out_commit;
715 }
716 727
717 err = ocfs2_add_entry(handle, dentry, inode, 728 err = ocfs2_add_entry(handle, dentry, inode,
718 OCFS2_I(inode)->ip_blkno, 729 OCFS2_I(inode)->ip_blkno,
@@ -736,6 +747,7 @@ static int ocfs2_link(struct dentry *old_dentry,
736 747
737out_commit: 748out_commit:
738 ocfs2_commit_trans(osb, handle); 749 ocfs2_commit_trans(osb, handle);
750 ocfs2_unblock_signals(&oldset);
739out_unlock_inode: 751out_unlock_inode:
740 ocfs2_inode_unlock(inode, 1); 752 ocfs2_inode_unlock(inode, 1);
741 753
@@ -909,12 +921,7 @@ static int ocfs2_unlink(struct inode *dir,
909 drop_nlink(inode); 921 drop_nlink(inode);
910 drop_nlink(inode); 922 drop_nlink(inode);
911 ocfs2_set_links_count(fe, inode->i_nlink); 923 ocfs2_set_links_count(fe, inode->i_nlink);
912 924 ocfs2_journal_dirty(handle, fe_bh);
913 status = ocfs2_journal_dirty(handle, fe_bh);
914 if (status < 0) {
915 mlog_errno(status);
916 goto leave;
917 }
918 925
919 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 926 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
920 if (S_ISDIR(inode->i_mode)) 927 if (S_ISDIR(inode->i_mode))
@@ -1332,12 +1339,7 @@ static int ocfs2_rename(struct inode *old_dir,
1332 ocfs2_set_links_count(newfe, 0); 1339 ocfs2_set_links_count(newfe, 0);
1333 else 1340 else
1334 ocfs2_add_links_count(newfe, -1); 1341 ocfs2_add_links_count(newfe, -1);
1335 1342 ocfs2_journal_dirty(handle, newfe_bh);
1336 status = ocfs2_journal_dirty(handle, newfe_bh);
1337 if (status < 0) {
1338 mlog_errno(status);
1339 goto bail;
1340 }
1341 } else { 1343 } else {
1342 /* if the name was not found in new_dir, add it now */ 1344 /* if the name was not found in new_dir, add it now */
1343 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1345 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1356,10 +1358,7 @@ static int ocfs2_rename(struct inode *old_dir,
1356 1358
1357 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); 1359 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
1358 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); 1360 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
1359 1361 ocfs2_journal_dirty(handle, old_inode_bh);
1360 status = ocfs2_journal_dirty(handle, old_inode_bh);
1361 if (status < 0)
1362 mlog_errno(status);
1363 } else 1362 } else
1364 mlog_errno(status); 1363 mlog_errno(status);
1365 1364
@@ -1431,7 +1430,7 @@ static int ocfs2_rename(struct inode *old_dir,
1431 OCFS2_JOURNAL_ACCESS_WRITE); 1430 OCFS2_JOURNAL_ACCESS_WRITE);
1432 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1431 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1433 ocfs2_set_links_count(fe, old_dir->i_nlink); 1432 ocfs2_set_links_count(fe, old_dir->i_nlink);
1434 status = ocfs2_journal_dirty(handle, old_dir_bh); 1433 ocfs2_journal_dirty(handle, old_dir_bh);
1435 } 1434 }
1436 } 1435 }
1437 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1436 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1563,11 +1562,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1563 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1562 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1564 bytes_left); 1563 bytes_left);
1565 1564
1566 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1565 ocfs2_journal_dirty(handle, bhs[virtual]);
1567 if (status < 0) {
1568 mlog_errno(status);
1569 goto bail;
1570 }
1571 1566
1572 virtual++; 1567 virtual++;
1573 p_blkno++; 1568 p_blkno++;
@@ -1611,6 +1606,8 @@ static int ocfs2_symlink(struct inode *dir,
1611 }; 1606 };
1612 int did_quota = 0, did_quota_inode = 0; 1607 int did_quota = 0, did_quota_inode = 0;
1613 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1608 struct ocfs2_dir_lookup_result lookup = { NULL, };
1609 sigset_t oldset;
1610 int did_block_signals = 0;
1614 1611
1615 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1612 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1616 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1613 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1706,6 +1703,10 @@ static int ocfs2_symlink(struct inode *dir,
1706 goto bail; 1703 goto bail;
1707 } 1704 }
1708 1705
1706 /* Starting to change things, restart is no longer possible. */
1707 ocfs2_block_signals(&oldset);
1708 did_block_signals = 1;
1709
1709 status = dquot_alloc_inode(inode); 1710 status = dquot_alloc_inode(inode);
1710 if (status) 1711 if (status)
1711 goto bail; 1712 goto bail;
@@ -1814,6 +1815,8 @@ bail:
1814 ocfs2_commit_trans(osb, handle); 1815 ocfs2_commit_trans(osb, handle);
1815 1816
1816 ocfs2_inode_unlock(dir, 1); 1817 ocfs2_inode_unlock(dir, 1);
1818 if (did_block_signals)
1819 ocfs2_unblock_signals(&oldset);
1817 1820
1818 brelse(new_fe_bh); 1821 brelse(new_fe_bh);
1819 brelse(parent_fe_bh); 1822 brelse(parent_fe_bh);
@@ -1868,61 +1871,117 @@ bail:
1868 return status; 1871 return status;
1869} 1872}
1870 1873
1871static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 1874static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
1872 struct inode **ret_orphan_dir, 1875 struct inode **ret_orphan_dir,
1873 u64 blkno, 1876 struct buffer_head **ret_orphan_dir_bh)
1874 char *name,
1875 struct ocfs2_dir_lookup_result *lookup)
1876{ 1877{
1877 struct inode *orphan_dir_inode; 1878 struct inode *orphan_dir_inode;
1878 struct buffer_head *orphan_dir_bh = NULL; 1879 struct buffer_head *orphan_dir_bh = NULL;
1879 int status = 0; 1880 int ret = 0;
1880
1881 status = ocfs2_blkno_stringify(blkno, name);
1882 if (status < 0) {
1883 mlog_errno(status);
1884 return status;
1885 }
1886 1881
1887 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1882 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1888 ORPHAN_DIR_SYSTEM_INODE, 1883 ORPHAN_DIR_SYSTEM_INODE,
1889 osb->slot_num); 1884 osb->slot_num);
1890 if (!orphan_dir_inode) { 1885 if (!orphan_dir_inode) {
1891 status = -ENOENT; 1886 ret = -ENOENT;
1892 mlog_errno(status); 1887 mlog_errno(ret);
1893 return status; 1888 return ret;
1894 } 1889 }
1895 1890
1896 mutex_lock(&orphan_dir_inode->i_mutex); 1891 mutex_lock(&orphan_dir_inode->i_mutex);
1897 1892
1898 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1893 ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1899 if (status < 0) { 1894 if (ret < 0) {
1900 mlog_errno(status); 1895 mutex_unlock(&orphan_dir_inode->i_mutex);
1901 goto leave; 1896 iput(orphan_dir_inode);
1897
1898 mlog_errno(ret);
1899 return ret;
1902 } 1900 }
1903 1901
1904 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1902 *ret_orphan_dir = orphan_dir_inode;
1905 orphan_dir_bh, name, 1903 *ret_orphan_dir_bh = orphan_dir_bh;
1906 OCFS2_ORPHAN_NAMELEN, lookup);
1907 if (status < 0) {
1908 ocfs2_inode_unlock(orphan_dir_inode, 1);
1909 1904
1910 mlog_errno(status); 1905 return 0;
1911 goto leave; 1906}
1907
1908static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
1909 struct buffer_head *orphan_dir_bh,
1910 u64 blkno,
1911 char *name,
1912 struct ocfs2_dir_lookup_result *lookup)
1913{
1914 int ret;
1915 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
1916
1917 ret = ocfs2_blkno_stringify(blkno, name);
1918 if (ret < 0) {
1919 mlog_errno(ret);
1920 return ret;
1921 }
1922
1923 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1924 orphan_dir_bh, name,
1925 OCFS2_ORPHAN_NAMELEN, lookup);
1926 if (ret < 0) {
1927 mlog_errno(ret);
1928 return ret;
1929 }
1930
1931 return 0;
1932}
1933
1934/**
1935 * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
1936 * insertion of an orphan.
1937 * @osb: ocfs2 file system
1938 * @ret_orphan_dir: Orphan dir inode - returned locked!
1939 * @blkno: Actual block number of the inode to be inserted into orphan dir.
1940 * @lookup: dir lookup result, to be passed back into functions like
1941 * ocfs2_orphan_add
1942 *
1943 * Returns zero on success and the ret_orphan_dir, name and lookup
1944 * fields will be populated.
1945 *
1946 * Returns non-zero on failure.
1947 */
1948static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1949 struct inode **ret_orphan_dir,
1950 u64 blkno,
1951 char *name,
1952 struct ocfs2_dir_lookup_result *lookup)
1953{
1954 struct inode *orphan_dir_inode = NULL;
1955 struct buffer_head *orphan_dir_bh = NULL;
1956 int ret = 0;
1957
1958 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
1959 &orphan_dir_bh);
1960 if (ret < 0) {
1961 mlog_errno(ret);
1962 return ret;
1963 }
1964
1965 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
1966 blkno, name, lookup);
1967 if (ret < 0) {
1968 mlog_errno(ret);
1969 goto out;
1912 } 1970 }
1913 1971
1914 *ret_orphan_dir = orphan_dir_inode; 1972 *ret_orphan_dir = orphan_dir_inode;
1915 1973
1916leave: 1974out:
1917 if (status) { 1975 brelse(orphan_dir_bh);
1976
1977 if (ret) {
1978 ocfs2_inode_unlock(orphan_dir_inode, 1);
1918 mutex_unlock(&orphan_dir_inode->i_mutex); 1979 mutex_unlock(&orphan_dir_inode->i_mutex);
1919 iput(orphan_dir_inode); 1980 iput(orphan_dir_inode);
1920 } 1981 }
1921 1982
1922 brelse(orphan_dir_bh); 1983 mlog_exit(ret);
1923 1984 return ret;
1924 mlog_exit(status);
1925 return status;
1926} 1985}
1927 1986
1928static int ocfs2_orphan_add(struct ocfs2_super *osb, 1987static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -1961,12 +2020,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1961 if (S_ISDIR(inode->i_mode)) 2020 if (S_ISDIR(inode->i_mode))
1962 ocfs2_add_links_count(orphan_fe, 1); 2021 ocfs2_add_links_count(orphan_fe, 1);
1963 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2022 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1964 2023 ocfs2_journal_dirty(handle, orphan_dir_bh);
1965 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1966 if (status < 0) {
1967 mlog_errno(status);
1968 goto leave;
1969 }
1970 2024
1971 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2025 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1972 OCFS2_ORPHAN_NAMELEN, inode, 2026 OCFS2_ORPHAN_NAMELEN, inode,
@@ -2065,12 +2119,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2065 if (S_ISDIR(inode->i_mode)) 2119 if (S_ISDIR(inode->i_mode))
2066 ocfs2_add_links_count(orphan_fe, -1); 2120 ocfs2_add_links_count(orphan_fe, -1);
2067 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2121 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2068 2122 ocfs2_journal_dirty(handle, orphan_dir_bh);
2069 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2070 if (status < 0) {
2071 mlog_errno(status);
2072 goto leave;
2073 }
2074 2123
2075leave: 2124leave:
2076 ocfs2_free_dir_lookup_result(&lookup); 2125 ocfs2_free_dir_lookup_result(&lookup);
@@ -2079,6 +2128,99 @@ leave:
2079 return status; 2128 return status;
2080} 2129}
2081 2130
2131/**
 2132 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
 2133 * allocated file. This is different from the typical 'add to orphan dir'
 2134 * operation in that the inode does not yet exist. This is a problem because
 2135 * the orphan dir stringifies the inode block number to come up with its
 2136 * dirent. Obviously if the inode does not yet exist we have a chicken and egg
 2137 * problem. This function works around it by calling deeper into the orphan
 2138 * and suballoc code than other callers. Use this only when necessary.
 2139 * @dir: The directory which this inode will ultimately wind up under - not the
 2140 * orphan dir!
 2141 * @dir_bh: buffer_head holding the @dir inode block
 2142 * @orphan_name: string of length (OCFS2_ORPHAN_NAMELEN + 1). Will be filled
 2143 * with the string to be used for the orphan dirent. Pass back to the orphan dir
2144 * code.
2145 * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
2146 * dir code.
2147 * @ret_di_blkno: block number where the new inode will be allocated.
2148 * @orphan_insert: Dir insert context to be passed back into orphan dir code.
2149 * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
2150 *
 2151 * Returns zero on success, with ret_orphan_dir, ret_di_blkno,
 2152 * orphan_name, orphan_insert and ret_inode_ac populated.
2153 *
2154 * Returns non-zero on failure.
2155 */
2156static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2157 struct buffer_head *dir_bh,
2158 char *orphan_name,
2159 struct inode **ret_orphan_dir,
2160 u64 *ret_di_blkno,
2161 struct ocfs2_dir_lookup_result *orphan_insert,
2162 struct ocfs2_alloc_context **ret_inode_ac)
2163{
2164 int ret;
2165 u64 di_blkno;
2166 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2167 struct inode *orphan_dir = NULL;
2168 struct buffer_head *orphan_dir_bh = NULL;
2169 struct ocfs2_alloc_context *inode_ac = NULL;
2170
2171 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
2172 if (ret < 0) {
2173 mlog_errno(ret);
2174 return ret;
2175 }
2176
2177 /* reserve an inode spot */
2178 ret = ocfs2_reserve_new_inode(osb, &inode_ac);
2179 if (ret < 0) {
2180 if (ret != -ENOSPC)
2181 mlog_errno(ret);
2182 goto out;
2183 }
2184
2185 ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
2186 &di_blkno);
2187 if (ret) {
2188 mlog_errno(ret);
2189 goto out;
2190 }
2191
2192 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2193 di_blkno, orphan_name, orphan_insert);
2194 if (ret < 0) {
2195 mlog_errno(ret);
2196 goto out;
2197 }
2198
2199out:
2200 if (ret == 0) {
2201 *ret_orphan_dir = orphan_dir;
2202 *ret_di_blkno = di_blkno;
2203 *ret_inode_ac = inode_ac;
2204 /*
2205 * orphan_name and orphan_insert are already up to
2206 * date via prepare_orphan_dir
2207 */
2208 } else {
2209 /* Unroll reserve_new_inode* */
2210 if (inode_ac)
2211 ocfs2_free_alloc_context(inode_ac);
2212
2213 /* Unroll orphan dir locking */
2214 mutex_unlock(&orphan_dir->i_mutex);
2215 ocfs2_inode_unlock(orphan_dir, 1);
2216 iput(orphan_dir);
2217 }
2218
2219 brelse(orphan_dir_bh);
2220
 2221 return ret;
2222}
2223
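Note the error contract here (with the return fixed to propagate ret): on success the caller inherits a locked, referenced orphan dir plus the inode allocation context; on failure everything is unrolled before returning, and the error must reach the caller so it never touches the uninitialized outputs. A generic sketch of that acquire/unroll shape, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

struct thing { int locked; };

static int acquire(struct thing **out, int fail)
{
	struct thing *t = malloc(sizeof(*t));

	if (!t)
		return -1;
	t->locked = 1;			/* lock + take a reference */
	if (fail) {
		free(t);
		return -1;
	}
	*out = t;
	return 0;
}

static void release(struct thing *t)
{
	t->locked = 0;			/* unlock + drop the reference */
	free(t);
}

static int prep(struct thing **out, int fail_step)
{
	struct thing *t = NULL;
	int ret = acquire(&t, fail_step == 1);

	if (ret < 0)
		return ret;

	ret = (fail_step == 2) ? -1 : 0;	/* stand-in for further setup */
	if (ret == 0)
		*out = t;		/* caller now owns the locked object */
	else
		release(t);		/* unroll before reporting failure */
	return ret;			/* never return 0 unconditionally */
}

int main(void)
{
	struct thing *t = NULL;

	printf("ok path: %d\n", prep(&t, 0));
	if (t)
		release(t);
	printf("fail path: %d\n", prep(&t, 2));
	return 0;
}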
2082int ocfs2_create_inode_in_orphan(struct inode *dir, 2224int ocfs2_create_inode_in_orphan(struct inode *dir,
2083 int mode, 2225 int mode,
2084 struct inode **new_inode) 2226 struct inode **new_inode)
@@ -2094,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2094 struct buffer_head *new_di_bh = NULL; 2236 struct buffer_head *new_di_bh = NULL;
2095 struct ocfs2_alloc_context *inode_ac = NULL; 2237 struct ocfs2_alloc_context *inode_ac = NULL;
2096 struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; 2238 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2239 u64 uninitialized_var(di_blkno), suballoc_loc;
2240 u16 suballoc_bit;
2097 2241
2098 status = ocfs2_inode_lock(dir, &parent_di_bh, 1); 2242 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2099 if (status < 0) { 2243 if (status < 0) {
@@ -2102,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2102 return status; 2246 return status;
2103 } 2247 }
2104 2248
2105 /* 2249 status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
2106 * We give the orphan dir the root blkno to fake an orphan name, 2250 orphan_name, &orphan_dir,
2107 * and allocate enough space for our insertion. 2251 &di_blkno, &orphan_insert, &inode_ac);
2108 */
2109 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
2110 osb->root_blkno,
2111 orphan_name, &orphan_insert);
2112 if (status < 0) {
2113 mlog_errno(status);
2114 goto leave;
2115 }
2116
2117 /* reserve an inode spot */
2118 status = ocfs2_reserve_new_inode(osb, &inode_ac);
2119 if (status < 0) { 2252 if (status < 0) {
2120 if (status != -ENOSPC) 2253 if (status != -ENOSPC)
2121 mlog_errno(status); 2254 mlog_errno(status);
@@ -2142,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2142 goto leave; 2275 goto leave;
2143 did_quota_inode = 1; 2276 did_quota_inode = 1;
2144 2277
2145 inode->i_nlink = 0; 2278 status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
2146 /* do the real work now. */ 2279 &suballoc_loc,
2147 status = ocfs2_mknod_locked(osb, dir, inode, 2280 &suballoc_bit, di_blkno);
2148 0, &new_di_bh, parent_di_bh, handle,
2149 inode_ac);
2150 if (status < 0) { 2281 if (status < 0) {
2151 mlog_errno(status); 2282 mlog_errno(status);
2152 goto leave; 2283 goto leave;
2153 } 2284 }
2154 2285
2155 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); 2286 inode->i_nlink = 0;
2287 /* do the real work now. */
2288 status = __ocfs2_mknod_locked(dir, inode,
2289 0, &new_di_bh, parent_di_bh, handle,
2290 inode_ac, di_blkno, suballoc_loc,
2291 suballoc_bit);
2156 if (status < 0) { 2292 if (status < 0) {
2157 mlog_errno(status); 2293 mlog_errno(status);
2158 goto leave; 2294 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index adf5e2ebc2c4..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
47/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
48#include "blockcheck.h" 48#include "blockcheck.h"
49 49
50#include "reservations.h"
50 51
51/* Caching of metadata buffers */ 52/* Caching of metadata buffers */
52 53
@@ -341,6 +342,9 @@ struct ocfs2_super
341 */ 342 */
342 unsigned int local_alloc_bits; 343 unsigned int local_alloc_bits;
343 unsigned int local_alloc_default_bits; 344 unsigned int local_alloc_default_bits;
345 /* osb_clusters_at_boot can become stale! Do not trust it to
346 * be up to date. */
347 unsigned int osb_clusters_at_boot;
344 348
345 enum ocfs2_local_alloc_state local_alloc_state; /* protected 349 enum ocfs2_local_alloc_state local_alloc_state; /* protected
346 * by osb_lock */ 350 * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
349 353
350 u64 la_last_gd; 354 u64 la_last_gd;
351 355
356 struct ocfs2_reservation_map osb_la_resmap;
357
358 unsigned int osb_resv_level;
359 unsigned int osb_dir_resv_level;
360
352 /* Next three fields are for local node slot recovery during 361 /* Next three fields are for local node slot recovery during
353 * mount. */ 362 * mount. */
354 int dirty; 363 int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
482 return 0; 491 return 0;
483} 492}
484 493
494static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
495{
496 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
497 return 1;
498 return 0;
499}
500
485static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) 501static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
486{ 502{
487 if (ocfs2_supports_indexed_dirs(osb)) 503 if (ocfs2_supports_indexed_dirs(osb))
@@ -763,6 +779,12 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 779 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 780}
765 781
782static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
783 unsigned int clusters)
784{
785 return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
786}
787
766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 788static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
767{ 789{
768 ext2_set_bit(bit, bitmap); 790 ext2_set_bit(bit, bitmap);
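The new ocfs2_clusters_to_megabytes() is the inverse of ocfs2_megabytes_to_clusters(), modulo truncation in the cluster-to-megabyte direction. A quick demonstration of the shift arithmetic (bits below stands in for s_clustersize_bits):

#include <stdio.h>

static unsigned int mb_to_clusters(unsigned int megs, unsigned int bits)
{
	return megs << (20 - bits);
}

static unsigned int clusters_to_mb(unsigned int clusters, unsigned int bits)
{
	return clusters >> (20 - bits);
}

int main(void)
{
	unsigned int bits = 12;		/* 4K clusters */
	unsigned int c = mb_to_clusters(8, bits);

	printf("8MB = %u clusters, back to %uMB\n", c, clusters_to_mb(c, bits));
	/* the reverse direction truncates: a partial megabyte is dropped */
	printf("255 clusters -> %uMB\n", clusters_to_mb(255, bits));
	return 0;
}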
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a7978..fa31d05e41b7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
100 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
165/* Refcount tree support */ 166/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 167#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167 168
 169/* Discontiguous block groups */
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171
168/* 172/*
169 * backup superblock flag is used to indicate that this volume 173 * backup superblock flag is used to indicate that this volume
170 * has backup superblocks. 174 * has backup superblocks.
@@ -231,18 +235,31 @@
231#define OCFS2_HAS_REFCOUNT_FL (0x0010) 235#define OCFS2_HAS_REFCOUNT_FL (0x0010)
232 236
233/* Inode attributes, keep in sync with EXT2 */ 237/* Inode attributes, keep in sync with EXT2 */
234#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 238#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
235#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ 239#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
236#define OCFS2_COMPR_FL (0x00000004) /* Compress file */ 240#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
237#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */ 241#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
238#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */ 242#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
239#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */ 243#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
240#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ 244#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
241#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ 245#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
242#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ 246/* Reserved for compression usage... */
243 247#define OCFS2_DIRTY_FL FS_DIRTY_FL
244#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ 248#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
245#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 249#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
250#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
251/* End compression flags --- maybe not all used */
252#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
253#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
254#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
255#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
256#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
257#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
258#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
259#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
260
261#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
262#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
246 263
247/* 264/*
248 * Extent record flags (e_node.leaf.flags) 265 * Extent record flags (e_node.leaf.flags)
@@ -283,14 +300,6 @@
283#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 300#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
284 301
285/* 302/*
286 * Default local alloc size (in megabytes)
287 *
288 * The value chosen should be such that most allocations, including new
289 * block groups, use local alloc.
290 */
291#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
292
293/*
294 * Inline extended attribute size (in bytes) 303 * Inline extended attribute size (in bytes)
295 * The value chosen should be aligned to 16 byte boundaries. 304 * The value chosen should be aligned to 16 byte boundaries.
296 */ 305 */
@@ -512,7 +521,10 @@ struct ocfs2_extent_block
512 block group */ 521 block group */
513 __le32 h_fs_generation; /* Must match super block */ 522 __le32 h_fs_generation; /* Must match super block */
514 __le64 h_blkno; /* Offset on disk, in blocks */ 523 __le64 h_blkno; /* Offset on disk, in blocks */
515/*20*/ __le64 h_reserved3; 524/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
525 eb belongs to. Only valid
526 if allocated from a
527 discontiguous block group */
516 __le64 h_next_leaf_blk; /* Offset on disk, in blocks, 528 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
517 of next leaf header pointing 529 of next leaf header pointing
518 to data */ 530 to data */
@@ -679,7 +691,11 @@ struct ocfs2_dinode {
679/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 691/*80*/ struct ocfs2_block_check i_check; /* Error checking */
680/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 692/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
681/*90*/ __le64 i_refcount_loc; 693/*90*/ __le64 i_refcount_loc;
682 __le64 i_reserved2[4]; 694 __le64 i_suballoc_loc; /* Suballocator block group this
695 inode belongs to. Only valid
696 if allocated from a
697 discontiguous block group */
698/*A0*/ __le64 i_reserved2[3];
683/*B8*/ union { 699/*B8*/ union {
684 __le64 i_pad1; /* Generic way to refer to this 700 __le64 i_pad1; /* Generic way to refer to this
685 64bit union */ 701 64bit union */
@@ -814,7 +830,12 @@ struct ocfs2_dx_root_block {
814 __le32 dr_reserved2; 830 __le32 dr_reserved2;
815 __le64 dr_free_blk; /* Pointer to head of free 831 __le64 dr_free_blk; /* Pointer to head of free
816 * unindexed block list. */ 832 * unindexed block list. */
817 __le64 dr_reserved3[15]; 833 __le64 dr_suballoc_loc; /* Suballocator block group
834 this root belongs to.
835 Only valid if allocated
836 from a discontiguous
837 block group */
838 __le64 dr_reserved3[14];
818 union { 839 union {
819 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 840 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
820 * bits for maximum space 841 * bits for maximum space
@@ -840,6 +861,13 @@ struct ocfs2_dx_leaf {
840}; 861};
841 862
842/* 863/*
864 * Largest bitmap for a block (suballocator) group in bytes. This limit
865 * does not affect cluster groups (global allocator). Cluster group
866 * bitmaps run to the end of the block.
867 */
868#define OCFS2_MAX_BG_BITMAP_SIZE 256
869
870/*
843 * On disk allocator group structure for OCFS2 871 * On disk allocator group structure for OCFS2
844 */ 872 */
845struct ocfs2_group_desc 873struct ocfs2_group_desc
@@ -860,7 +888,29 @@ struct ocfs2_group_desc
860 __le64 bg_blkno; /* Offset on disk, in blocks */ 888 __le64 bg_blkno; /* Offset on disk, in blocks */
861/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ 889/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
862 __le64 bg_reserved2; 890 __le64 bg_reserved2;
863/*40*/ __u8 bg_bitmap[0]; 891/*40*/ union {
892 __u8 bg_bitmap[0];
893 struct {
894 /*
895 * Block groups may be discontiguous when
896 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
 897 * The extents of a discontiguous block group are
 898 * stored in bg_list. It is a flat list.
 899 * l_tree_depth must always be zero. A
 900 * discontiguous group is signified by a non-zero
 901 * bg_list->l_next_free_rec. Only block groups
 902 * can be discontiguous; cluster groups cannot.
903 * We've never made a block group with more than
904 * 2048 blocks (256 bytes of bg_bitmap). This
905 * codifies that limit so that we can fit bg_list.
906 * bg_size of a discontiguous block group will
907 * be 256 to match bg_bitmap_filler.
908 */
909 __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
910/*140*/ struct ocfs2_extent_list bg_list;
911 };
912 };
913/* Actual on-disk size is one block */
864}; 914};
865 915
866struct ocfs2_refcount_rec { 916struct ocfs2_refcount_rec {
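The offset comments in the union above can be sanity-checked with a mock layout: bg_bitmap begins at 0x40, so a 256-byte filler places bg_list at 0x140. A sketch using stand-in field types (this is not the real struct, only the arithmetic):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct mock_group_desc {
	uint8_t header[0x40];			/* fields before the union */
	union {
		uint8_t bg_bitmap[1];
		struct {
			uint8_t bg_bitmap_filler[256];
			uint8_t bg_list[1];	/* stand-in for the extent list */
		} discontig;
	} u;
};

int main(void)
{
	printf("bg_bitmap at 0x%zx\n",
	       offsetof(struct mock_group_desc, u.bg_bitmap));		/* 0x40 */
	printf("bg_list   at 0x%zx\n",
	       offsetof(struct mock_group_desc, u.discontig.bg_list));	/* 0x140 */
	return 0;
}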
@@ -905,7 +955,11 @@ struct ocfs2_refcount_block {
905/*40*/ __le32 rf_generation; /* generation number. all be the same 955/*40*/ __le32 rf_generation; /* generation number. all be the same
906 * for the same refcount tree. */ 956 * for the same refcount tree. */
907 __le32 rf_reserved0; 957 __le32 rf_reserved0;
908 __le64 rf_reserved1[7]; 958 __le64 rf_suballoc_loc; /* Suballocator block group this
959 refcount block belongs to. Only
960 valid if allocated from a
961 discontiguous block group */
962/*50*/ __le64 rf_reserved1[6];
909/*80*/ union { 963/*80*/ union {
910 struct ocfs2_refcount_list rf_records; /* List of refcount 964 struct ocfs2_refcount_list rf_records; /* List of refcount
911 records */ 965 records */
@@ -1017,7 +1071,10 @@ struct ocfs2_xattr_block {
1017 real xattr or a xattr tree. */ 1071 real xattr or a xattr tree. */
1018 __le16 xb_reserved0; 1072 __le16 xb_reserved0;
1019 __le32 xb_reserved1; 1073 __le32 xb_reserved1;
1020 __le64 xb_reserved2; 1074 __le64 xb_suballoc_loc; /* Suballocator block group this
1075 xattr block belongs to. Only
1076 valid if allocated from a
1077 discontiguous block group */
1021/*30*/ union { 1078/*30*/ union {
1022 struct ocfs2_xattr_header xb_header; /* xattr header if this 1079 struct ocfs2_xattr_header xb_header; /* xattr header if this
1023 block contains xattr */ 1080 block contains xattr */
@@ -1254,6 +1311,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1254 return size / sizeof(struct ocfs2_extent_rec); 1311 return size / sizeof(struct ocfs2_extent_rec);
1255} 1312}
1256 1313
1314static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
1315{
1316 int size;
1317
1318 size = sb->s_blocksize -
1319 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1320
1321 return size / sizeof(struct ocfs2_extent_rec);
1322}
1323
1257static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) 1324static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1258{ 1325{
1259 int size; 1326 int size;
@@ -1284,13 +1351,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1284 return size; 1351 return size;
1285} 1352}
1286 1353
1287static inline int ocfs2_group_bitmap_size(struct super_block *sb) 1354static inline int ocfs2_group_bitmap_size(struct super_block *sb,
1355 int suballocator,
1356 u32 feature_incompat)
1288{ 1357{
1289 int size; 1358 int size = sb->s_blocksize -
1290
1291 size = sb->s_blocksize -
1292 offsetof(struct ocfs2_group_desc, bg_bitmap); 1359 offsetof(struct ocfs2_group_desc, bg_bitmap);
1293 1360
1361 /*
1362 * The cluster allocator uses the entire block. Suballocators have
1363 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1364 * code expects bg_size set to the maximum. Thus we must keep
1365 * bg_size as-is unless discontig_bg is enabled.
1366 */
1367 if (suballocator &&
1368 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1369 size = OCFS2_MAX_BG_BITMAP_SIZE;
1370
1294 return size; 1371 return size;
1295} 1372}
1296 1373
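
A quick sanity check on the cap above: with a 4KB block, a cluster group keeps the whole remainder of the block for its bitmap, while a suballocator group under discontig_bg is limited to 256 bytes, which is exactly the 2048-block limit the group descriptor comment codifies. A standalone sketch; the 64-byte header is an assumed figure standing in for offsetof(struct ocfs2_group_desc, bg_bitmap):

#include <stdio.h>

int main(void)
{
	int blocksize = 4096;
	int header = 64;	/* assumed; really offsetof(struct ocfs2_group_desc, bg_bitmap) */
	int cluster_bits = (blocksize - header) * 8;	/* cluster allocator: whole block */
	int suballoc_bits = 256 * 8;			/* OCFS2_MAX_BG_BITMAP_SIZE cap */

	printf("cluster group bitmap: %d bits\n", cluster_bits);	/* 32256 */
	printf("capped suballocator bitmap: %d bits\n", suballoc_bits);	/* 2048 */
	return 0;
}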
@@ -1402,23 +1479,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
1402 return size / sizeof(struct ocfs2_extent_rec); 1479 return size / sizeof(struct ocfs2_extent_rec);
1403} 1480}
1404 1481
1405static inline int ocfs2_local_alloc_size(int blocksize) 1482static inline int ocfs2_extent_recs_per_gd(int blocksize)
1406{ 1483{
1407 int size; 1484 int size;
1408 1485
1409 size = blocksize - 1486 size = blocksize -
1410 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); 1487 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1411 1488
1412 return size; 1489 return size / sizeof(struct ocfs2_extent_rec);
1413} 1490}
1414 1491
1415static inline int ocfs2_group_bitmap_size(int blocksize) 1492static inline int ocfs2_local_alloc_size(int blocksize)
1416{ 1493{
1417 int size; 1494 int size;
1418 1495
1419 size = blocksize - 1496 size = blocksize -
1497 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
1498
1499 return size;
1500}
1501
1502static inline int ocfs2_group_bitmap_size(int blocksize,
1503 int suballocator,
1504 uint32_t feature_incompat)
1505{
1506 int size = blocksize -
1420 offsetof(struct ocfs2_group_desc, bg_bitmap); 1507 offsetof(struct ocfs2_group_desc, bg_bitmap);
1421 1508
1509 /*
1510 * The cluster allocator uses the entire block. Suballocators have
1511 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1512 * code expects bg_size set to the maximum. Thus we must keep
1513 * bg_size as-is unless discontig_bg is enabled.
1514 */
1515 if (suballocator &&
1516 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1517 size = OCFS2_MAX_BG_BITMAP_SIZE;
1518
1422 return size; 1519 return size;
1423} 1520}
1424 1521
@@ -1491,5 +1588,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
1491 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1588 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1492} 1589}
1493 1590
1591static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
1592{
1593 if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
1594 le16_to_cpu(gd->bg_size)) !=
1595 offsetof(struct ocfs2_group_desc, bg_list))
1596 return 0;
1597 /*
1598 * Only valid to check l_next_free_rec if
1599 * bg_bitmap + bg_size == bg_list.
1600 */
1601 if (!gd->bg_list.l_next_free_rec)
1602 return 0;
1603 return 1;
1604}
1494#endif /* _OCFS2_FS_H */ 1605#endif /* _OCFS2_FS_H */
1495 1606
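
The test in ocfs2_gd_is_discontig() above encodes two on-disk rules: bg_list is only valid when the bitmap is capped so that bg_bitmap + bg_size lands exactly on bg_list, and a discontiguous group is then marked by a non-zero l_next_free_rec. A minimal userspace model of the same check; the struct below is a simplified stand-in, not the real ocfs2_group_desc layout:

#include <stddef.h>
#include <stdint.h>

#define MAX_BG_BITMAP_SIZE 256	/* stand-in for the real constant */

struct gd_sketch {
	uint16_t bg_size;			/* bytes of bitmap in use */
	uint8_t  bg_bitmap[MAX_BG_BITMAP_SIZE];
	uint16_t l_next_free_rec;		/* models bg_list.l_next_free_rec */
};

static int gd_is_discontig(const struct gd_sketch *gd)
{
	/* bg_list is only meaningful if the bitmap ends exactly where
	 * the embedded extent list begins */
	if (offsetof(struct gd_sketch, bg_bitmap) + gd->bg_size !=
	    offsetof(struct gd_sketch, l_next_free_rec))
		return 0;
	/* a non-zero record count then marks the group discontiguous */
	return gd->l_next_free_rec != 0;
}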
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a83..5d241505690b 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
23/* 23/*
24 * ioctl commands 24 * ioctl commands
25 */ 25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 26#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) 27#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) 28#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 29#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
30 30
31/* 31/*
32 * Space reservation / allocation / free ioctls and argument structure 32 * Space reservation / allocation / free ioctls and argument structure
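
With the ocfs2 flag ioctls now aliased to the generic FS_IOC_* codes, a generic caller works unchanged on ocfs2 files. A hypothetical userspace usage sketch; note the kernel transfers an int here even though the macro is declared with long:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int flags = 0;
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0)
		return 1;
	/* read the generic inode flags word (FS_IMMUTABLE_FL etc.) */
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)
		printf("inode flags: %#x\n", flags);
	close(fd);
	return 0;
}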
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c0..196fcb52d95d 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,6 +23,7 @@
23struct ocfs2_dquot { 23struct ocfs2_dquot {
24 struct dquot dq_dquot; /* Generic VFS dquot */ 24 struct dquot dq_dquot; /* Generic VFS dquot */
25 loff_t dq_local_off; /* Offset in the local quota file */ 25 loff_t dq_local_off; /* Offset in the local quota file */
26 u64 dq_local_phys_blk; /* Physical block carrying quota structure */
26 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ 27 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
27 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
28 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
@@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo {
51 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 52 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
52 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ 53 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
53 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ 54 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
55 u64 dqi_giblk; /* Number of block with global information header */
54 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ 56 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
55 struct buffer_head *dqi_ibh; /* Buffer with information header */ 57 struct buffer_head *dqi_libh; /* Buffer with local information header */
56 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ 58 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
57 struct delayed_work dqi_sync_work; /* Work for syncing dquots */ 59 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
58 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery 60 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
102 104
103int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 105int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
104void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 106void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
105int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 107int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
106 struct buffer_head **bh); 108int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
109 struct buffer_head **bh);
110int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot);
107 113
108extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
109extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index ab42a74c7539..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -25,8 +25,44 @@
25#include "dlmglue.h" 25#include "dlmglue.h"
26#include "uptodate.h" 26#include "uptodate.h"
27#include "super.h" 27#include "super.h"
28#include "buffer_head_io.h"
28#include "quota.h" 29#include "quota.h"
29 30
31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
33 * should be obeyed by all the functions:
34 * - any write of quota structure (either to local or global file) is protected
35 * by dqio_mutex or dquot->dq_lock.
36 * - any modification of global quota file holds inode cluster lock, i_mutex,
37 * and ip_alloc_sem of the global quota file (achieved by
38 * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
39 * - an allocation of new blocks for local quota file is protected by
40 * its ip_alloc_sem
41 *
42 * A rough sketch of locking dependencies (lf = local file, gf = global file):
43 * Normal filesystem operation:
44 * start_trans -> dqio_mutex -> write to lf
45 * Syncing of local and global file:
46 * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
47 * write to gf
48 * -> write to lf
49 * Acquire dquot for the first time:
50 * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
51 * -> alloc space for gf
52 * -> start_trans -> qinfo_lock -> write to gf
53 * -> ip_alloc_sem of lf -> alloc space for lf
54 * -> write to lf
55 * Release last reference to dquot:
56 * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
57 * -> write to lf
58 * Note that all the above operations also hold the inode cluster lock of lf.
59 * Recovery:
60 * inode cluster lock of recovered lf
61 * -> read bitmaps -> ip_alloc_sem of lf
62 * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
63 * write to gf
64 */
65
30static struct workqueue_struct *ocfs2_quota_wq = NULL; 66static struct workqueue_struct *ocfs2_quota_wq = NULL;
31 67
32static void qsync_work_fn(struct work_struct *work); 68static void qsync_work_fn(struct work_struct *work);
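
What makes the rules above deadlock-free is that every path acquires its locks in one global order: cluster lock on the global quota file, then i_mutex/ip_alloc_sem, then the transaction, then dqio_mutex, then qinfo_lock. A toy pthread model of the sync path (all names are stand-ins; only the ordering is the point):

#include <pthread.h>

static pthread_mutex_t gf_cluster_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dqio_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t qinfo_lock = PTHREAD_MUTEX_INITIALIZER;

static void sync_local_to_global(void)
{
	pthread_mutex_lock(&gf_cluster_lock);	/* ocfs2_lock_global_qf */
	/* a transaction would be started here */
	pthread_mutex_lock(&dqio_mutex);
	pthread_mutex_lock(&qinfo_lock);
	/* write to the global file, then to the local file */
	pthread_mutex_unlock(&qinfo_lock);
	pthread_mutex_unlock(&dqio_mutex);
	/* commit the transaction */
	pthread_mutex_unlock(&gf_cluster_lock);
}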
@@ -91,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = {
91 .is_id = ocfs2_global_is_id, 127 .is_id = ocfs2_global_is_id,
92}; 128};
93 129
94static int ocfs2_validate_quota_block(struct super_block *sb, 130int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
95 struct buffer_head *bh)
96{ 131{
97 struct ocfs2_disk_dqtrailer *dqt = 132 struct ocfs2_disk_dqtrailer *dqt =
98 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
@@ -110,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
110 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); 145 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
111} 146}
112 147
113int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 148int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
114 struct buffer_head **bh) 149 struct buffer_head **bhp)
115{ 150{
116 int rc = 0; 151 int rc;
117 struct buffer_head *tmp = *bh; 152
118 153 *bhp = NULL;
119 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 154 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
120 ocfs2_error(inode->i_sb, 155 ocfs2_validate_quota_block);
121 "Quota file %llu is probably corrupted! Requested "
122 "to read block %Lu but file has size only %Lu\n",
123 (unsigned long long)OCFS2_I(inode)->ip_blkno,
124 (unsigned long long)v_block,
125 (unsigned long long)i_size_read(inode));
126 return -EIO;
127 }
128 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
129 ocfs2_validate_quota_block);
130 if (rc) 156 if (rc)
131 mlog_errno(rc); 157 mlog_errno(rc);
132
133 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
134 if (!rc && !*bh)
135 *bh = tmp;
136
137 return rc; 158 return rc;
138} 159}
139 160
140static int ocfs2_get_quota_block(struct inode *inode, int block,
141 struct buffer_head **bh)
142{
143 u64 pblock, pcount;
144 int err;
145
146 down_read(&OCFS2_I(inode)->ip_alloc_sem);
147 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
148 up_read(&OCFS2_I(inode)->ip_alloc_sem);
149 if (err) {
150 mlog_errno(err);
151 return err;
152 }
153 *bh = sb_getblk(inode->i_sb, pblock);
154 if (!*bh) {
155 err = -EIO;
156 mlog_errno(err);
157 }
158 return err;
159}
160
161/* Read data from global quotafile - avoid pagecache and such because we cannot 161/* Read data from global quotafile - avoid pagecache and such because we cannot
162 * afford acquiring the locks... We use quota cluster lock to serialize 162 * afford acquiring the locks... We use quota cluster lock to serialize
163 * operations. Caller is responsible for acquiring it. */ 163 * operations. Caller is responsible for acquiring it. */
@@ -172,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
172 int err = 0; 172 int err = 0;
173 struct buffer_head *bh; 173 struct buffer_head *bh;
174 size_t toread, tocopy; 174 size_t toread, tocopy;
175 u64 pblock = 0, pcount = 0;
175 176
176 if (off > i_size) 177 if (off > i_size)
177 return 0; 178 return 0;
@@ -180,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
180 toread = len; 181 toread = len;
181 while (toread > 0) { 182 while (toread > 0) {
182 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); 183 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
184 if (!pcount) {
185 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
186 &pcount, NULL);
187 if (err) {
188 mlog_errno(err);
189 return err;
190 }
191 } else {
192 pcount--;
193 pblock++;
194 }
183 bh = NULL; 195 bh = NULL;
184 err = ocfs2_read_quota_block(gqinode, blk, &bh); 196 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
185 if (err) { 197 if (err) {
186 mlog_errno(err); 198 mlog_errno(err);
187 return err; 199 return err;
@@ -209,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
209 int err = 0, new = 0, ja_type; 221 int err = 0, new = 0, ja_type;
210 struct buffer_head *bh = NULL; 222 struct buffer_head *bh = NULL;
211 handle_t *handle = journal_current_handle(); 223 handle_t *handle = journal_current_handle();
224 u64 pblock, pcount;
212 225
213 if (!handle) { 226 if (!handle) {
214 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " 227 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -221,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
221 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
222 } 235 }
223 236
224 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
225 if (gqinode->i_size < off + len) { 237 if (gqinode->i_size < off + len) {
226 loff_t rounded_end = 238 loff_t rounded_end =
227 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
228 240
229 /* Space is already allocated in ocfs2_global_read_dquot() */ 241 /* Space is already allocated in ocfs2_acquire_dquot() */
230 err = ocfs2_simple_size_update(gqinode, 242 err = ocfs2_simple_size_update(gqinode,
231 oinfo->dqi_gqi_bh, 243 oinfo->dqi_gqi_bh,
232 rounded_end); 244 rounded_end);
@@ -234,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 goto out; 246 goto out;
235 new = 1; 247 new = 1;
236 } 248 }
249 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
250 if (err) {
251 mlog_errno(err);
252 goto out;
253 }
237 /* Not rewriting whole block? */ 254 /* Not rewriting whole block? */
238 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && 255 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
239 !new) { 256 !new) {
240 err = ocfs2_read_quota_block(gqinode, blk, &bh); 257 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
241 ja_type = OCFS2_JOURNAL_ACCESS_WRITE; 258 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
242 } else { 259 } else {
243 err = ocfs2_get_quota_block(gqinode, blk, &bh); 260 bh = sb_getblk(sb, pblock);
261 if (!bh)
262 err = -ENOMEM;
244 ja_type = OCFS2_JOURNAL_ACCESS_CREATE; 263 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
245 } 264 }
246 if (err) { 265 if (err) {
@@ -261,19 +280,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
261 brelse(bh); 280 brelse(bh);
262 goto out; 281 goto out;
263 } 282 }
264 err = ocfs2_journal_dirty(handle, bh); 283 ocfs2_journal_dirty(handle, bh);
265 brelse(bh); 284 brelse(bh);
266 if (err < 0)
267 goto out;
268out: 285out:
269 if (err) { 286 if (err) {
270 mutex_unlock(&gqinode->i_mutex);
271 mlog_errno(err); 287 mlog_errno(err);
272 return err; 288 return err;
273 } 289 }
274 gqinode->i_version++; 290 gqinode->i_version++;
275 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); 291 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
276 mutex_unlock(&gqinode->i_mutex);
277 return len; 292 return len;
278} 293}
279 294
@@ -291,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
291 else 306 else
292 WARN_ON(bh != oinfo->dqi_gqi_bh); 307 WARN_ON(bh != oinfo->dqi_gqi_bh);
293 spin_unlock(&dq_data_lock); 308 spin_unlock(&dq_data_lock);
309 if (ex) {
310 mutex_lock(&oinfo->dqi_gqinode->i_mutex);
311 down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
312 } else {
313 down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
314 }
294 return 0; 315 return 0;
295} 316}
296 317
297void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) 318void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
298{ 319{
320 if (ex) {
321 up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
322 mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
323 } else {
324 up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
325 }
299 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); 326 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
300 brelse(oinfo->dqi_gqi_bh); 327 brelse(oinfo->dqi_gqi_bh);
301 spin_lock(&dq_data_lock); 328 spin_lock(&dq_data_lock);
@@ -313,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
313 struct ocfs2_global_disk_dqinfo dinfo; 340 struct ocfs2_global_disk_dqinfo dinfo;
314 struct mem_dqinfo *info = sb_dqinfo(sb, type); 341 struct mem_dqinfo *info = sb_dqinfo(sb, type);
315 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 342 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
343 u64 pcount;
316 int status; 344 int status;
317 345
318 mlog_entry_void(); 346 mlog_entry_void();
@@ -339,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
339 mlog_errno(status); 367 mlog_errno(status);
340 goto out_err; 368 goto out_err;
341 } 369 }
370
371 status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
372 &pcount, NULL);
373 if (status < 0)
374 goto out_unlock;
375
376 status = ocfs2_qinfo_lock(oinfo, 0);
377 if (status < 0)
378 goto out_unlock;
342 status = sb->s_op->quota_read(sb, type, (char *)&dinfo, 379 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
343 sizeof(struct ocfs2_global_disk_dqinfo), 380 sizeof(struct ocfs2_global_disk_dqinfo),
344 OCFS2_GLOBAL_INFO_OFF); 381 OCFS2_GLOBAL_INFO_OFF);
382 ocfs2_qinfo_unlock(oinfo, 0);
345 ocfs2_unlock_global_qf(oinfo, 0); 383 ocfs2_unlock_global_qf(oinfo, 0);
346 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { 384 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
347 mlog(ML_ERROR, "Cannot read global quota info (%d).\n", 385 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -368,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
368out_err: 406out_err:
369 mlog_exit(status); 407 mlog_exit(status);
370 return status; 408 return status;
409out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0);
411 mlog_errno(status);
412 goto out_err;
371} 413}
372 414
373/* Write information to global quota file. Expects exclusive lock on quota 415
@@ -426,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
426 468
427static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) 469static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
428{ 470{
429 /* We modify all the allocated blocks, tree root, and info block */ 471 /* We modify all the allocated blocks, tree root, info block and
472 * the inode */
430 return (ocfs2_global_qinit_alloc(sb, type) + 2) * 473 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
431 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; 474 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
432}
433
434/* Read in information from global quota file and acquire a reference to it.
435 * dquot_acquire() has already started the transaction and locked quota file */
436int ocfs2_global_read_dquot(struct dquot *dquot)
437{
438 int err, err2, ex = 0;
439 struct super_block *sb = dquot->dq_sb;
440 int type = dquot->dq_type;
441 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
442 struct ocfs2_super *osb = OCFS2_SB(sb);
443 struct inode *gqinode = info->dqi_gqinode;
444 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
445 handle_t *handle = NULL;
446
447 err = ocfs2_qinfo_lock(info, 0);
448 if (err < 0)
449 goto out;
450 err = qtree_read_dquot(&info->dqi_gi, dquot);
451 if (err < 0)
452 goto out_qlock;
453 OCFS2_DQUOT(dquot)->dq_use_count++;
454 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
455 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
456 ocfs2_qinfo_unlock(info, 0);
457
458 if (!dquot->dq_off) { /* No real quota entry? */
459 ex = 1;
460 /*
461 * Add blocks to quota file before we start a transaction since
462 * locking allocators ranks above a transaction start
463 */
464 WARN_ON(journal_current_handle());
465 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
466 err = ocfs2_extend_no_holes(gqinode,
467 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
468 gqinode->i_size);
469 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
470 if (err < 0)
471 goto out;
472 }
473
474 handle = ocfs2_start_trans(osb,
475 ocfs2_calc_global_qinit_credits(sb, type));
476 if (IS_ERR(handle)) {
477 err = PTR_ERR(handle);
478 goto out;
479 }
480 err = ocfs2_qinfo_lock(info, ex);
481 if (err < 0)
482 goto out_trans;
483 err = qtree_write_dquot(&info->dqi_gi, dquot);
484 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
485 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
486 if (!err)
487 err = err2;
488 }
489out_qlock:
490 if (ex)
491 ocfs2_qinfo_unlock(info, 1);
492 else
493 ocfs2_qinfo_unlock(info, 0);
494out_trans:
495 if (handle)
496 ocfs2_commit_trans(osb, handle);
497out:
498 if (err < 0)
499 mlog_errno(err);
500 return err;
501} 475}
502 476
503/* Sync local information about quota modifications with global quota file. 477/* Sync local information about quota modifications with global quota file.
@@ -638,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
638 } 612 }
639 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 613 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
640 status = ocfs2_sync_dquot(dquot); 614 status = ocfs2_sync_dquot(dquot);
641 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
642 if (status < 0) 615 if (status < 0)
643 mlog_errno(status); 616 mlog_errno(status);
644 /* We have to write local structure as well... */ 617 /* We have to write local structure as well... */
645 dquot_mark_dquot_dirty(dquot); 618 status = ocfs2_local_write_dquot(dquot);
646 status = dquot_commit(dquot);
647 if (status < 0) 619 if (status < 0)
648 mlog_errno(status); 620 mlog_errno(status);
621 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
649 ocfs2_commit_trans(osb, handle); 622 ocfs2_commit_trans(osb, handle);
650out_ilock: 623out_ilock:
651 ocfs2_unlock_global_qf(oinfo, 1); 624 ocfs2_unlock_global_qf(oinfo, 1);
@@ -684,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
684 mlog_errno(status); 657 mlog_errno(status);
685 goto out; 658 goto out;
686 } 659 }
687 status = dquot_commit(dquot); 660 mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
661 status = ocfs2_local_write_dquot(dquot);
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
688 ocfs2_commit_trans(osb, handle); 663 ocfs2_commit_trans(osb, handle);
689out: 664out:
690 mlog_exit(status); 665 mlog_exit(status);
@@ -715,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot)
715 690
716 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
717 692
693 mutex_lock(&dquot->dq_lock);
694 /* Make sure we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1)
696 goto out;
718 status = ocfs2_lock_global_qf(oinfo, 1); 697 status = ocfs2_lock_global_qf(oinfo, 1);
719 if (status < 0) 698 if (status < 0)
720 goto out; 699 goto out;
@@ -725,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot)
725 mlog_errno(status); 704 mlog_errno(status);
726 goto out_ilock; 705 goto out_ilock;
727 } 706 }
728 status = dquot_release(dquot); 707
708 status = ocfs2_global_release_dquot(dquot);
709 if (status < 0) {
710 mlog_errno(status);
711 goto out_trans;
712 }
713 status = ocfs2_local_release_dquot(handle, dquot);
714 /*
715 * If we fail here, we cannot do much as global structure is
716 * already released. So just complain...
717 */
718 if (status < 0)
719 mlog_errno(status);
720 clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
721out_trans:
729 ocfs2_commit_trans(osb, handle); 722 ocfs2_commit_trans(osb, handle);
730out_ilock: 723out_ilock:
731 ocfs2_unlock_global_qf(oinfo, 1); 724 ocfs2_unlock_global_qf(oinfo, 1);
732out: 725out:
726 mutex_unlock(&dquot->dq_lock);
733 mlog_exit(status); 727 mlog_exit(status);
734 return status; 728 return status;
735} 729}
736 730
731/*
732 * Read global dquot structure from disk or create it if it does
733 * not exist. Also update use count of the global structure and
734 * create structure in node-local quota file.
735 */
737static int ocfs2_acquire_dquot(struct dquot *dquot) 736static int ocfs2_acquire_dquot(struct dquot *dquot)
738{ 737{
739 struct ocfs2_mem_dqinfo *oinfo = 738 int status = 0, err;
740 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 739 int ex = 0;
741 int status = 0; 740 struct super_block *sb = dquot->dq_sb;
741 struct ocfs2_super *osb = OCFS2_SB(sb);
742 int type = dquot->dq_type;
743 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
744 struct inode *gqinode = info->dqi_gqinode;
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle;
742 747
743 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
744 /* We need an exclusive lock, because we're going to update use count 749 mutex_lock(&dquot->dq_lock);
745 * and instantiate possibly new dquot structure */ 750 /*
746 status = ocfs2_lock_global_qf(oinfo, 1); 751 * We need an exclusive lock, because we're going to update use count
752 * and instantiate possibly new dquot structure
753 */
754 status = ocfs2_lock_global_qf(info, 1);
747 if (status < 0) 755 if (status < 0)
748 goto out; 756 goto out;
749 status = dquot_acquire(dquot); 757 if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
750 ocfs2_unlock_global_qf(oinfo, 1); 758 status = ocfs2_qinfo_lock(info, 0);
759 if (status < 0)
760 goto out_dq;
761 status = qtree_read_dquot(&info->dqi_gi, dquot);
762 ocfs2_qinfo_unlock(info, 0);
763 if (status < 0)
764 goto out_dq;
765 }
766 set_bit(DQ_READ_B, &dquot->dq_flags);
767
768 OCFS2_DQUOT(dquot)->dq_use_count++;
769 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
770 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
771 if (!dquot->dq_off) { /* No real quota entry? */
772 ex = 1;
773 /*
774 * Add blocks to quota file before we start a transaction since
775 * locking allocators ranks above a transaction start
776 */
777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode, NULL,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size);
781 if (status < 0)
782 goto out_dq;
783 }
784
785 handle = ocfs2_start_trans(osb,
786 ocfs2_calc_global_qinit_credits(sb, type));
787 if (IS_ERR(handle)) {
788 status = PTR_ERR(handle);
789 goto out_dq;
790 }
791 status = ocfs2_qinfo_lock(info, ex);
792 if (status < 0)
793 goto out_trans;
794 status = qtree_write_dquot(&info->dqi_gi, dquot);
795 if (ex && info_dirty(sb_dqinfo(sb, type))) {
796 err = __ocfs2_global_write_info(sb, type);
797 if (!status)
798 status = err;
799 }
800 ocfs2_qinfo_unlock(info, ex);
801out_trans:
802 ocfs2_commit_trans(osb, handle);
803out_dq:
804 ocfs2_unlock_global_qf(info, 1);
805 if (status < 0)
806 goto out;
807
808 status = ocfs2_create_local_dquot(dquot);
809 if (status < 0)
810 goto out;
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
751out: 812out:
813 mutex_unlock(&dquot->dq_lock);
752 mlog_exit(status); 814 mlog_exit(status);
753 return status; 815 return status;
754} 816}
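
One ordering rule survives the move from ocfs2_global_read_dquot() into ocfs2_acquire_dquot(): the quota file is extended before ocfs2_start_trans(), never under an open handle, because locking allocators ranks above a transaction start. Schematically, with stand-in prototypes rather than the real API:

struct handle;

int extend_file(void);			/* takes allocator locks internally */
struct handle *start_trans(int credits);
int write_under(struct handle *h);
void commit_trans(struct handle *h);

int acquire_sketch(int need_alloc)
{
	struct handle *h;
	int err = 0;

	if (need_alloc) {
		/* allocate first; doing this under an open handle
		 * would invert the allocator-vs-journal lock order */
		err = extend_file();
		if (err < 0)
			return err;
	}
	h = start_trans(4 /* assumed credit count */);
	if (!h)
		return -1;
	err = write_under(h);
	commit_trans(h);
	return err;
}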
@@ -770,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
770 struct ocfs2_super *osb = OCFS2_SB(sb); 832 struct ocfs2_super *osb = OCFS2_SB(sb);
771 833
772 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 834 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
773 dquot_mark_dquot_dirty(dquot);
774 835
775 /* In case user set some limits, sync dquot immediately to global 836 /* In case user set some limits, sync dquot immediately to global
776 * quota file so that information propagates quicker */ 837 * quota file so that information propagates quicker */
@@ -793,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
793 mlog_errno(status); 854 mlog_errno(status);
794 goto out_ilock; 855 goto out_ilock;
795 } 856 }
857 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
796 status = ocfs2_sync_dquot(dquot); 858 status = ocfs2_sync_dquot(dquot);
797 if (status < 0) { 859 if (status < 0) {
798 mlog_errno(status); 860 mlog_errno(status);
799 goto out_trans; 861 goto out_dlock;
800 } 862 }
801 /* Now write updated local dquot structure */ 863 /* Now write updated local dquot structure */
802 status = dquot_commit(dquot); 864 status = ocfs2_local_write_dquot(dquot);
803out_trans: 865out_dlock:
866 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
804 ocfs2_commit_trans(osb, handle); 867 ocfs2_commit_trans(osb, handle);
805out_ilock: 868out_ilock:
806 ocfs2_unlock_global_qf(oinfo, 1); 869 ocfs2_unlock_global_qf(oinfo, 1);
@@ -852,7 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
852} 915}
853 916
854const struct dquot_operations ocfs2_quota_operations = { 917const struct dquot_operations ocfs2_quota_operations = {
855 .write_dquot = ocfs2_write_dquot, 918 /* We never make dquot dirty so .write_dquot is never called */
856 .acquire_dquot = ocfs2_acquire_dquot, 919 .acquire_dquot = ocfs2_acquire_dquot,
857 .release_dquot = ocfs2_release_dquot, 920 .release_dquot = ocfs2_release_dquot,
858 .mark_dirty = ocfs2_mark_dquot_dirty, 921 .mark_dirty = ocfs2_mark_dquot_dirty,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 9ad49305f450..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -22,6 +22,7 @@
22#include "dlmglue.h" 22#include "dlmglue.h"
23#include "quota.h" 23#include "quota.h"
24#include "uptodate.h" 24#include "uptodate.h"
25#include "super.h"
25 26
26/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
27static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -119,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
119 lock_buffer(bh); 120 lock_buffer(bh);
120 modify(bh, private); 121 modify(bh, private);
121 unlock_buffer(bh); 122 unlock_buffer(bh);
122 status = ocfs2_journal_dirty(handle, bh); 123 ocfs2_journal_dirty(handle, bh);
123 if (status < 0) { 124
124 mlog_errno(status);
125 ocfs2_commit_trans(OCFS2_SB(sb), handle);
126 return status;
127 }
128 status = ocfs2_commit_trans(OCFS2_SB(sb), handle); 125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
129 if (status < 0) { 126 if (status < 0) {
130 mlog_errno(status); 127 mlog_errno(status);
@@ -133,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
133 return 0; 130 return 0;
134} 131}
135 132
133/*
134 * Read quota block from a given logical offset.
135 *
136 * This function acquires ip_alloc_sem and thus it must not be called with a
137 * transaction started.
138 */
139static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
140 struct buffer_head **bh)
141{
142 int rc = 0;
143 struct buffer_head *tmp = *bh;
144
145 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
146 ocfs2_error(inode->i_sb,
147 "Quota file %llu is probably corrupted! Requested "
148 "to read block %Lu but file has size only %Lu\n",
149 (unsigned long long)OCFS2_I(inode)->ip_blkno,
150 (unsigned long long)v_block,
151 (unsigned long long)i_size_read(inode));
152 return -EIO;
153 }
154 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
155 ocfs2_validate_quota_block);
156 if (rc)
157 mlog_errno(rc);
158
159 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
160 if (!rc && !*bh)
161 *bh = tmp;
162
163 return rc;
164}
165
136/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
137static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
138{ 168{
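
The constraint the new comment states, that ocfs2_read_quota_block() takes ip_alloc_sem and therefore must not run inside a transaction, can be made explicit with a guard. A sketch with stand-in helpers; journal_current_handle() is used only as an assumed probe for "is a handle open on this task":

#include <assert.h>

struct handle;
struct handle *journal_current_handle(void);	/* stand-in probe */
void down_read_alloc_sem(void);			/* stand-in lock ops */
void up_read_alloc_sem(void);

void read_quota_block_sketch(void)
{
	/* ip_alloc_sem ranks above a transaction start, so taking it
	 * while a handle is open would invert the lock order */
	assert(journal_current_handle() == NULL);
	down_read_alloc_sem();
	/* ... translate the virtual block and read it ... */
	up_read_alloc_sem();
}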
@@ -523,9 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
523 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
524 le32_add_cpu(&dchunk->dqc_free, 1); 554 le32_add_cpu(&dchunk->dqc_free, 1);
525 unlock_buffer(qbh); 555 unlock_buffer(qbh);
526 status = ocfs2_journal_dirty(handle, qbh); 556 ocfs2_journal_dirty(handle, qbh);
527 if (status < 0)
528 mlog_errno(status);
529out_commit: 557out_commit:
530 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 558 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
531 ocfs2_commit_trans(OCFS2_SB(sb), handle); 559 ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -631,9 +659,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
631 lock_buffer(bh); 659 lock_buffer(bh);
632 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); 660 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
633 unlock_buffer(bh); 661 unlock_buffer(bh);
634 status = ocfs2_journal_dirty(handle, bh); 662 ocfs2_journal_dirty(handle, bh);
635 if (status < 0)
636 mlog_errno(status);
637out_trans: 663out_trans:
638 ocfs2_commit_trans(osb, handle); 664 ocfs2_commit_trans(osb, handle);
639out_bh: 665out_bh:
@@ -679,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
679 INIT_LIST_HEAD(&oinfo->dqi_chunk); 705 INIT_LIST_HEAD(&oinfo->dqi_chunk);
680 oinfo->dqi_rec = NULL; 706 oinfo->dqi_rec = NULL;
681 oinfo->dqi_lqi_bh = NULL; 707 oinfo->dqi_lqi_bh = NULL;
682 oinfo->dqi_ibh = NULL; 708 oinfo->dqi_libh = NULL;
683 709
684 status = ocfs2_global_read_info(sb, type); 710 status = ocfs2_global_read_info(sb, type);
685 if (status < 0) 711 if (status < 0)
@@ -705,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
705 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 731 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
706 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 732 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
707 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 733 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
708 oinfo->dqi_ibh = bh; 734 oinfo->dqi_libh = bh;
709 735
710 /* We crashed when using local quota file? */ 736 /* We crashed when using local quota file? */
711 if (!(info->dqi_flags & OLQF_CLEAN)) { 737 if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -767,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
767{ 793{
768 struct mem_dqinfo *info = sb_dqinfo(sb, type); 794 struct mem_dqinfo *info = sb_dqinfo(sb, type);
769 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) 795 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
770 ->dqi_ibh; 796 ->dqi_libh;
771 int status; 797 int status;
772 798
773 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, 799 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -790,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
790 int mark_clean = 1, len; 816 int mark_clean = 1, len;
791 int status; 817 int status;
792 818
793 /* At this point we know there are no more dquots and thus
794 * even if there's some sync in the pdflush queue, it won't
795 * find any dquots and return without doing anything */
796 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
797 iput(oinfo->dqi_gqinode); 819 iput(oinfo->dqi_gqinode);
798 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); 820 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
799 ocfs2_lock_res_free(&oinfo->dqi_gqlock); 821 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -828,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
828 /* Mark local file as clean */ 850 /* Mark local file as clean */
829 info->dqi_flags |= OLQF_CLEAN; 851 info->dqi_flags |= OLQF_CLEAN;
830 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 852 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
831 oinfo->dqi_ibh, 853 oinfo->dqi_libh,
832 olq_update_info, 854 olq_update_info,
833 info); 855 info);
834 if (status < 0) { 856 if (status < 0) {
@@ -838,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
838 860
839out: 861out:
840 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); 862 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
841 brelse(oinfo->dqi_ibh); 863 brelse(oinfo->dqi_libh);
842 brelse(oinfo->dqi_lqi_bh); 864 brelse(oinfo->dqi_lqi_bh);
843 kfree(oinfo); 865 kfree(oinfo);
844 return 0; 866 return 0;
@@ -866,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
866} 888}
867 889
868/* Write dquot to local quota file */ 890/* Write dquot to local quota file */
869static int ocfs2_local_write_dquot(struct dquot *dquot) 891int ocfs2_local_write_dquot(struct dquot *dquot)
870{ 892{
871 struct super_block *sb = dquot->dq_sb; 893 struct super_block *sb = dquot->dq_sb;
872 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 894 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
873 struct buffer_head *bh = NULL; 895 struct buffer_head *bh;
896 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
874 int status; 897 int status;
875 898
876 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], 899 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
877 ol_dqblk_file_block(sb, od->dq_local_off), 900 &bh);
878 &bh);
879 if (status) { 901 if (status) {
880 mlog_errno(status); 902 mlog_errno(status);
881 goto out; 903 goto out;
882 } 904 }
883 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, 905 status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
884 olq_set_dquot, od);
885 if (status < 0) { 906 if (status < 0) {
886 mlog_errno(status); 907 mlog_errno(status);
887 goto out; 908 goto out;
@@ -950,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
950 u64 p_blkno; 971 u64 p_blkno;
951 972
952 /* We are protected by dqio_sem so no locking needed */ 973 /* We are protected by dqio_sem so no locking needed */
953 status = ocfs2_extend_no_holes(lqinode, 974 status = ocfs2_extend_no_holes(lqinode, NULL,
954 lqinode->i_size + 2 * sb->s_blocksize, 975 lqinode->i_size + 2 * sb->s_blocksize,
955 lqinode->i_size); 976 lqinode->i_size);
956 if (status < 0) { 977 if (status < 0) {
@@ -981,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
981 } 1002 }
982 1003
983 /* Initialize chunk header */ 1004 /* Initialize chunk header */
984 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
985 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1005 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
986 &p_blkno, NULL, NULL); 1006 &p_blkno, NULL, NULL);
987 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
988 if (status < 0) { 1007 if (status < 0) {
989 mlog_errno(status); 1008 mlog_errno(status);
990 goto out_trans; 1009 goto out_trans;
@@ -1009,17 +1028,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1009 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1028 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
1010 OCFS2_QBLK_RESERVED_SPACE); 1029 OCFS2_QBLK_RESERVED_SPACE);
1011 unlock_buffer(bh); 1030 unlock_buffer(bh);
1012 status = ocfs2_journal_dirty(handle, bh); 1031 ocfs2_journal_dirty(handle, bh);
1013 if (status < 0) {
1014 mlog_errno(status);
1015 goto out_trans;
1016 }
1017 1032
1018 /* Initialize new block with structures */ 1033 /* Initialize new block with structures */
1019 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1020 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, 1034 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1021 &p_blkno, NULL, NULL); 1035 &p_blkno, NULL, NULL);
1022 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1023 if (status < 0) { 1036 if (status < 0) {
1024 mlog_errno(status); 1037 mlog_errno(status);
1025 goto out_trans; 1038 goto out_trans;
@@ -1040,11 +1053,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1040 lock_buffer(dbh); 1053 lock_buffer(dbh);
1041 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); 1054 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1042 unlock_buffer(dbh); 1055 unlock_buffer(dbh);
1043 status = ocfs2_journal_dirty(handle, dbh); 1056 ocfs2_journal_dirty(handle, dbh);
1044 if (status < 0) {
1045 mlog_errno(status);
1046 goto out_trans;
1047 }
1048 1057
1049 /* Update local quotafile info */ 1058 /* Update local quotafile info */
1050 oinfo->dqi_blocks += 2; 1059 oinfo->dqi_blocks += 2;
@@ -1105,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1105 return ocfs2_local_quota_add_chunk(sb, type, offset); 1114 return ocfs2_local_quota_add_chunk(sb, type, offset);
1106 1115
1107 /* We are protected by dqio_sem so no locking needed */ 1116 /* We are protected by dqio_sem so no locking needed */
1108 status = ocfs2_extend_no_holes(lqinode, 1117 status = ocfs2_extend_no_holes(lqinode, NULL,
1109 lqinode->i_size + sb->s_blocksize, 1118 lqinode->i_size + sb->s_blocksize,
1110 lqinode->i_size); 1119 lqinode->i_size);
1111 if (status < 0) { 1120 if (status < 0) {
@@ -1120,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1120 } 1129 }
1121 1130
1122 /* Get buffer from the just added block */ 1131 /* Get buffer from the just added block */
1123 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1124 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1132 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1125 &p_blkno, NULL, NULL); 1133 &p_blkno, NULL, NULL);
1126 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1127 if (status < 0) { 1134 if (status < 0) {
1128 mlog_errno(status); 1135 mlog_errno(status);
1129 goto out; 1136 goto out;
@@ -1155,11 +1162,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1155 lock_buffer(bh); 1162 lock_buffer(bh);
1156 memset(bh->b_data, 0, sb->s_blocksize); 1163 memset(bh->b_data, 0, sb->s_blocksize);
1157 unlock_buffer(bh); 1164 unlock_buffer(bh);
1158 status = ocfs2_journal_dirty(handle, bh); 1165 ocfs2_journal_dirty(handle, bh);
1159 if (status < 0) { 1166
1160 mlog_errno(status);
1161 goto out_trans;
1162 }
1163 /* Update chunk header */ 1167 /* Update chunk header */
1164 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), 1168 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1165 chunk->qc_headerbh, 1169 chunk->qc_headerbh,
@@ -1173,11 +1177,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1173 lock_buffer(chunk->qc_headerbh); 1177 lock_buffer(chunk->qc_headerbh);
1174 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); 1178 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1175 unlock_buffer(chunk->qc_headerbh); 1179 unlock_buffer(chunk->qc_headerbh);
1176 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); 1180 ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1177 if (status < 0) { 1181
1178 mlog_errno(status);
1179 goto out_trans;
1180 }
1181 /* Update file header */ 1182 /* Update file header */
1182 oinfo->dqi_blocks++; 1183 oinfo->dqi_blocks++;
1183 status = ocfs2_local_write_info(sb, type); 1184 status = ocfs2_local_write_info(sb, type);
@@ -1210,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1210} 1211}
1211 1212
1212/* Create dquot in the local file for given id */ 1213/* Create dquot in the local file for given id */
1213static int ocfs2_create_local_dquot(struct dquot *dquot) 1214int ocfs2_create_local_dquot(struct dquot *dquot)
1214{ 1215{
1215 struct super_block *sb = dquot->dq_sb; 1216 struct super_block *sb = dquot->dq_sb;
1216 int type = dquot->dq_type; 1217 int type = dquot->dq_type;
@@ -1219,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1219 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1220 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1220 int offset; 1221 int offset;
1221 int status; 1222 int status;
1223 u64 pcount;
1222 1224
1225 down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1223 chunk = ocfs2_find_free_entry(sb, type, &offset); 1226 chunk = ocfs2_find_free_entry(sb, type, &offset);
1224 if (!chunk) { 1227 if (!chunk) {
1225 chunk = ocfs2_extend_local_quota_file(sb, type, &offset); 1228 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1226 if (IS_ERR(chunk)) 1229 if (IS_ERR(chunk)) {
1227 return PTR_ERR(chunk); 1230 status = PTR_ERR(chunk);
1231 goto out;
1232 }
1228 } else if (IS_ERR(chunk)) { 1233 } else if (IS_ERR(chunk)) {
1229 return PTR_ERR(chunk); 1234 status = PTR_ERR(chunk);
1235 goto out;
1230 } 1236 }
1231 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); 1237 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1232 od->dq_chunk = chunk; 1238 od->dq_chunk = chunk;
1239 status = ocfs2_extent_map_get_blocks(lqinode,
1240 ol_dqblk_block(sb, chunk->qc_num, offset),
1241 &od->dq_local_phys_blk,
1242 &pcount,
1243 NULL);
1233 1244
1234 /* Initialize dquot structure on disk */ 1245 /* Initialize dquot structure on disk */
1235 status = ocfs2_local_write_dquot(dquot); 1246 status = ocfs2_local_write_dquot(dquot);
@@ -1246,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1246 goto out; 1257 goto out;
1247 } 1258 }
1248out: 1259out:
1260 up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1249 return status; 1261 return status;
1250} 1262}
1251 1263
1252/* Create entry in local file for dquot, load data from the global file */ 1264/*
1253static int ocfs2_local_read_dquot(struct dquot *dquot) 1265 * Release dquot structure from local quota file. ocfs2_release_dquot() has
1254{ 1266 * already started a transaction and written all changes to global quota file
1255 int status; 1267 */
1256 1268int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1257 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1258
1259 status = ocfs2_global_read_dquot(dquot);
1260 if (status < 0) {
1261 mlog_errno(status);
1262 goto out_err;
1263 }
1264
1265 /* Now create entry in the local quota file */
1266 status = ocfs2_create_local_dquot(dquot);
1267 if (status < 0) {
1268 mlog_errno(status);
1269 goto out_err;
1270 }
1271 mlog_exit(0);
1272 return 0;
1273out_err:
1274 mlog_exit(status);
1275 return status;
1276}
1277
1278/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1279 * already started a transaction and obtained exclusive lock for global
1280 * quota file. */
1281static int ocfs2_local_release_dquot(struct dquot *dquot)
1282{ 1269{
1283 int status; 1270 int status;
1284 int type = dquot->dq_type; 1271 int type = dquot->dq_type;
@@ -1286,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1286 struct super_block *sb = dquot->dq_sb; 1273 struct super_block *sb = dquot->dq_sb;
1287 struct ocfs2_local_disk_chunk *dchunk; 1274 struct ocfs2_local_disk_chunk *dchunk;
1288 int offset; 1275 int offset;
1289 handle_t *handle = journal_current_handle();
1290
1291 BUG_ON(!handle);
1292 /* First write all local changes to global file */
1293 status = ocfs2_global_release_dquot(dquot);
1294 if (status < 0) {
1295 mlog_errno(status);
1296 goto out;
1297 }
1298 1276
1299 status = ocfs2_journal_access_dq(handle, 1277 status = ocfs2_journal_access_dq(handle,
1300 INODE_CACHE(sb_dqopt(sb)->files[type]), 1278 INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1312,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1312 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1290 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1313 le32_add_cpu(&dchunk->dqc_free, 1); 1291 le32_add_cpu(&dchunk->dqc_free, 1);
1314 unlock_buffer(od->dq_chunk->qc_headerbh); 1292 unlock_buffer(od->dq_chunk->qc_headerbh);
1315 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1293 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1316 if (status < 0) { 1294
1317 mlog_errno(status);
1318 goto out;
1319 }
1320 status = 0;
1321out: 1295out:
1322 /* Clear the read bit so that next time someone uses this 1296 /* Clear the read bit so that next time someone uses this
1323 * dquot he reads fresh info from disk and allocates local 1297 * dquot he reads fresh info from disk and allocates local
@@ -1331,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
1331 .read_file_info = ocfs2_local_read_info, 1305 .read_file_info = ocfs2_local_read_info,
1332 .write_file_info = ocfs2_global_write_info, 1306 .write_file_info = ocfs2_global_write_info,
1333 .free_file_info = ocfs2_local_free_info, 1307 .free_file_info = ocfs2_local_free_info,
1334 .read_dqblk = ocfs2_local_read_dquot,
1335 .commit_dqblk = ocfs2_local_write_dquot,
1336 .release_dqblk = ocfs2_local_release_dquot,
1337}; 1308};
1338 1309
1339struct quota_format_type ocfs2_quota_format = { 1310struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5cbcd0f008fc..efdd75607406 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
571 u16 suballoc_bit_start; 571 u16 suballoc_bit_start;
572 u32 num_got; 572 u32 num_got;
573 u64 first_blkno; 573 u64 suballoc_loc, first_blkno;
574 574
575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
576 576
@@ -596,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
596 goto out_commit; 596 goto out_commit;
597 } 597 }
598 598
599 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 599 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
600 &suballoc_bit_start, &num_got, 600 &suballoc_bit_start, &num_got,
601 &first_blkno); 601 &first_blkno);
602 if (ret) { 602 if (ret) {
@@ -626,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
626 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629 rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
629 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
630 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
631 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -790,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
790 if (le32_to_cpu(rb->rf_count) == 1) { 791 if (le32_to_cpu(rb->rf_count) == 1) {
791 blk = le64_to_cpu(rb->rf_blkno); 792 blk = le64_to_cpu(rb->rf_blkno);
792 bit = le16_to_cpu(rb->rf_suballoc_bit); 793 bit = le16_to_cpu(rb->rf_suballoc_bit);
793 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 794 if (rb->rf_suballoc_loc)
795 bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796 else
797 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
794 798
795 alloc_inode = ocfs2_get_system_file_inode(osb, 799 alloc_inode = ocfs2_get_system_file_inode(osb,
796 EXTENT_ALLOC_SYSTEM_INODE, 800 EXTENT_ALLOC_SYSTEM_INODE,
@@ -1268,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1268 } else if (merge) 1272 } else if (merge)
1269 ocfs2_refcount_rec_merge(rb, index); 1273 ocfs2_refcount_rec_merge(rb, index);
1270 1274
1271 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1275 ocfs2_journal_dirty(handle, ref_leaf_bh);
1272 if (ret)
1273 mlog_errno(ret);
1274out: 1276out:
1275 return ret; 1277 return ret;
1276} 1278}
@@ -1284,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1284 int ret; 1286 int ret;
1285 u16 suballoc_bit_start; 1287 u16 suballoc_bit_start;
1286 u32 num_got; 1288 u32 num_got;
1287 u64 blkno; 1289 u64 suballoc_loc, blkno;
1288 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1289 struct buffer_head *new_bh = NULL; 1291 struct buffer_head *new_bh = NULL;
1290 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *new_rb;
@@ -1298,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1298 goto out; 1300 goto out;
1299 } 1301 }
1300 1302
1301 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1302 &suballoc_bit_start, &num_got, 1304 &suballoc_bit_start, &num_got,
1303 &blkno); 1305 &blkno);
1304 if (ret) { 1306 if (ret) {
@@ -1330,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1330 1332
1331 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1332 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1334 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1333 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1336 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1334 new_rb->rf_blkno = cpu_to_le64(blkno); 1337 new_rb->rf_blkno = cpu_to_le64(blkno);
1335 new_rb->rf_cpos = cpu_to_le32(0); 1338 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1524,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1524 int ret; 1527 int ret;
1525 u16 suballoc_bit_start; 1528 u16 suballoc_bit_start;
1526 u32 num_got, new_cpos; 1529 u32 num_got, new_cpos;
1527 u64 blkno; 1530 u64 suballoc_loc, blkno;
1528 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1531 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1529 struct ocfs2_refcount_block *root_rb = 1532 struct ocfs2_refcount_block *root_rb =
1530 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1533 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1548,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1548 goto out; 1551 goto out;
1549 } 1552 }
1550 1553
1551 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1554 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1552 &suballoc_bit_start, &num_got, 1555 &suballoc_bit_start, &num_got,
1553 &blkno); 1556 &blkno);
1554 if (ret) { 1557 if (ret) {
@@ -1576,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1576 memset(new_rb, 0, sb->s_blocksize); 1579 memset(new_rb, 0, sb->s_blocksize);
1577 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1580 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1578 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1581 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1579 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1583 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1580 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1584 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1581 new_rb->rf_blkno = cpu_to_le64(blkno); 1585 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1694,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
1694 * 2 more credits, one for the leaf refcount block, one for 1698 * 2 more credits, one for the leaf refcount block, one for
1695 * the extent block contains the extent rec. 1699 * the extent block contains the extent rec.
1696 */ 1700 */
1697 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1701 ret = ocfs2_extend_trans(handle, 2);
1698 if (ret < 0) { 1702 if (ret < 0) {
1699 mlog_errno(ret); 1703 mlog_errno(ret);
1700 goto out; 1704 goto out;
@@ -1802,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1802 if (merge) 1806 if (merge)
1803 ocfs2_refcount_rec_merge(rb, index); 1807 ocfs2_refcount_rec_merge(rb, index);
1804 1808
1805 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1809 ocfs2_journal_dirty(handle, ref_leaf_bh);
1806 if (ret) {
1807 mlog_errno(ret);
1808 goto out;
1809 }
1810 1810
1811 if (index == 0) { 1811 if (index == 0) {
1812 ret = ocfs2_adjust_refcount_rec(handle, ci, 1812 ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1977,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1977 ocfs2_refcount_rec_merge(rb, index); 1977 ocfs2_refcount_rec_merge(rb, index);
1978 } 1978 }
1979 1979
1980 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1980 ocfs2_journal_dirty(handle, ref_leaf_bh);
1981 if (ret)
1982 mlog_errno(ret);
1983 1981
1984out: 1982out:
1985 brelse(new_bh); 1983 brelse(new_bh);
@@ -2112,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2112 */ 2110 */
2113 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2111 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2114 le16_to_cpu(rb->rf_suballoc_slot), 2112 le16_to_cpu(rb->rf_suballoc_slot),
2113 le64_to_cpu(rb->rf_suballoc_loc),
2115 le64_to_cpu(rb->rf_blkno), 2114 le64_to_cpu(rb->rf_blkno),
2116 le16_to_cpu(rb->rf_suballoc_bit)); 2115 le16_to_cpu(rb->rf_suballoc_bit));
2117 if (ret) { 2116 if (ret) {
@@ -2437,16 +2436,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2437 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2436 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2438 le32_to_cpu(rec.r_clusters)) - cpos; 2437 le32_to_cpu(rec.r_clusters)) - cpos;
2439 /* 2438 /*
2440 * If the refcount rec already exist, cool. We just need
2441 * to check whether there is a split. Otherwise we just need
2442 * to increase the refcount.
2443 * If we will insert one, increases recs_add.
2444 *
2445 * We record all the records which will be inserted to the 2439 * We record all the records which will be inserted to the
2446 * same refcount block, so that we can tell exactly whether 2440 * same refcount block, so that we can tell exactly whether
2447 * we need a new refcount block or not. 2441 * we need a new refcount block or not.
2442 *
 2443 * If we will insert a new one, this is easy and only happens
 2444 * while adding the refcounted flag to the extent, so we don't
 2445 * have a chance of splitting. We just need one record.
 2446 *
 2447 * If the refcount rec already exists, things are a little more
 2448 * complicated. We may have to:
 2449 * 1) split at the beginning if the start pos isn't aligned.
 2450 * We need 1 more record in this case.
 2451 * 2) split at the end if the end pos isn't aligned.
 2452 * We need 1 more record in this case.
 2453 * 3) split in the middle because of file system fragmentation.
 2454 * We need 2 more records in this case (we can't detect this
 2455 * beforehand, so always assume the worst case).
2448 */ 2456 */
2449 if (rec.r_refcount) { 2457 if (rec.r_refcount) {
2458 recs_add += 2;
2450 /* Check whether we need a split at the beginning. */ 2459 /* Check whether we need a split at the beginning. */
2451 if (cpos == start_cpos && 2460 if (cpos == start_cpos &&
2452 cpos != le64_to_cpu(rec.r_cpos)) 2461 cpos != le64_to_cpu(rec.r_cpos))
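/*
 * A standalone sketch of the worst-case accounting described in the
 * comment above (illustrative names, not ocfs2 code): an existing
 * record always budgets two records for an undetectable middle split,
 * plus one each for an unaligned head or tail.
 */
#include <stdio.h>

static unsigned int recs_needed(int rec_exists, int head_aligned,
				int tail_aligned)
{
	unsigned int recs;

	if (!rec_exists)
		return 1;	/* fresh record, no split possible */

	recs = 2;		/* middle split, assumed pessimistically */
	if (!head_aligned)
		recs++;		/* split at the start of the range */
	if (!tail_aligned)
		recs++;		/* split at the end of the range */
	return recs;
}

int main(void)
{
	printf("worst case: %u records\n", recs_needed(1, 0, 0)); /* 4 */
	return 0;
}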
@@ -2516,20 +2525,19 @@ out:
2516 * 2525 *
 2517 * Normally the refcount blocks that store these refcounts should 2526 * Normally the refcount blocks that store these refcounts should
 2518 * also be contiguous, so that we can get the number easily. 2527 * also be contiguous, so that we can get the number easily.
 2519 * As for meta_ac, we will at most add split 2 refcount record and 2528 * We will add at most 2 split refcount records and 2 more
 2520 * 2 more refcount block, so just check it in a rough way. 2529 * refcount blocks, so just check it in a rough way.
2521 * 2530 *
2522 * Caller must hold refcount tree lock. 2531 * Caller must hold refcount tree lock.
2523 */ 2532 */
2524int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2533int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2525 struct buffer_head *di_bh, 2534 u64 refcount_loc,
2526 u64 phys_blkno, 2535 u64 phys_blkno,
2527 u32 clusters, 2536 u32 clusters,
2528 int *credits, 2537 int *credits,
2529 struct ocfs2_alloc_context **meta_ac) 2538 int *ref_blocks)
2530{ 2539{
2531 int ret, ref_blocks = 0; 2540 int ret;
2532 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2533 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2541 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2534 struct buffer_head *ref_root_bh = NULL; 2542 struct buffer_head *ref_root_bh = NULL;
2535 struct ocfs2_refcount_tree *tree; 2543 struct ocfs2_refcount_tree *tree;
@@ -2546,14 +2554,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2546 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2554 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2547 2555
2548 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2556 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2549 le64_to_cpu(di->i_refcount_loc), &tree); 2557 refcount_loc, &tree);
2550 if (ret) { 2558 if (ret) {
2551 mlog_errno(ret); 2559 mlog_errno(ret);
2552 goto out; 2560 goto out;
2553 } 2561 }
2554 2562
2555 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2563 ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2556 le64_to_cpu(di->i_refcount_loc),
2557 &ref_root_bh); 2564 &ref_root_bh);
2558 if (ret) { 2565 if (ret) {
2559 mlog_errno(ret); 2566 mlog_errno(ret);
@@ -2564,21 +2571,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2564 &tree->rf_ci, 2571 &tree->rf_ci,
2565 ref_root_bh, 2572 ref_root_bh,
2566 start_cpos, clusters, 2573 start_cpos, clusters,
2567 &ref_blocks, credits); 2574 ref_blocks, credits);
2568 if (ret) { 2575 if (ret) {
2569 mlog_errno(ret); 2576 mlog_errno(ret);
2570 goto out; 2577 goto out;
2571 } 2578 }
2572 2579
2573 mlog(0, "reserve new metadata %d, credits = %d\n", 2580 mlog(0, "reserve new metadata %d blocks, credits = %d\n",
2574 ref_blocks, *credits); 2581 *ref_blocks, *credits);
2575
2576 if (ref_blocks) {
2577 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2578 ref_blocks, meta_ac);
2579 if (ret)
2580 mlog_errno(ret);
2581 }
2582 2582
2583out: 2583out:
2584 brelse(ref_root_bh); 2584 brelse(ref_root_bh);
@@ -2941,6 +2941,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2941 2941
2942 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2942 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2943 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2943 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2944 /*
 2945 * We only duplicate pages until we reach the page that contains i_size - 1.
2946 * So trim 'end' to i_size.
2947 */
2948 if (end > i_size_read(context->inode))
2949 end = i_size_read(context->inode);
2944 2950
2945 while (offset < end) { 2951 while (offset < end) {
2946 page_index = offset >> PAGE_CACHE_SHIFT; 2952 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2954,7 +2960,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2954 if (map_end & (PAGE_CACHE_SIZE - 1)) 2960 if (map_end & (PAGE_CACHE_SIZE - 1))
2955 to = map_end & (PAGE_CACHE_SIZE - 1); 2961 to = map_end & (PAGE_CACHE_SIZE - 1);
2956 2962
2957 page = grab_cache_page(mapping, page_index); 2963 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2958 2964
2959 /* 2965 /*
 2960 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page 2966 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page
@@ -3040,11 +3046,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3040 } 3046 }
3041 3047
3042 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3048 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3043 ret = ocfs2_journal_dirty(handle, new_bh); 3049 ocfs2_journal_dirty(handle, new_bh);
3044 if (ret) {
3045 mlog_errno(ret);
3046 break;
3047 }
3048 3050
3049 brelse(new_bh); 3051 brelse(new_bh);
3050 brelse(old_bh); 3052 brelse(old_bh);
@@ -3177,7 +3179,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3177 if (map_end > end) 3179 if (map_end > end)
3178 map_end = end; 3180 map_end = end;
3179 3181
3180 page = grab_cache_page(context->inode->i_mapping, page_index); 3182 page = find_or_create_page(context->inode->i_mapping,
3183 page_index, GFP_NOFS);
3181 BUG_ON(!page); 3184 BUG_ON(!page);
3182 3185
3183 wait_on_page_writeback(page); 3186 wait_on_page_writeback(page);
@@ -3282,7 +3285,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3282 } else { 3285 } else {
3283 delete = 1; 3286 delete = 1;
3284 3287
3285 ret = __ocfs2_claim_clusters(osb, handle, 3288 ret = __ocfs2_claim_clusters(handle,
3286 context->data_ac, 3289 context->data_ac,
3287 1, set_len, 3290 1, set_len,
3288 &new_bit, &new_len); 3291 &new_bit, &new_len);
@@ -4180,6 +4183,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4180 struct inode *inode = old_dentry->d_inode; 4183 struct inode *inode = old_dentry->d_inode;
4181 struct buffer_head *new_bh = NULL; 4184 struct buffer_head *new_bh = NULL;
4182 4185
4186 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
4187 ret = -EINVAL;
4188 mlog_errno(ret);
4189 goto out;
4190 }
4191
4183 ret = filemap_fdatawrite(inode->i_mapping); 4192 ret = filemap_fdatawrite(inode->i_mapping);
4184 if (ret) { 4193 if (ret) {
4185 mlog_errno(ret); 4194 mlog_errno(ret);
@@ -4192,8 +4201,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4192 goto out; 4201 goto out;
4193 } 4202 }
4194 4203
4195 mutex_lock(&new_inode->i_mutex); 4204 mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4196 ret = ocfs2_inode_lock(new_inode, &new_bh, 1); 4205 ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4206 OI_LS_REFLINK_TARGET);
4197 if (ret) { 4207 if (ret) {
4198 mlog_errno(ret); 4208 mlog_errno(ret);
4199 goto out_unlock; 4209 goto out_unlock;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
47 struct ocfs2_cached_dealloc_ctxt *dealloc, 47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete); 48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh, 50 u64 refcount_loc,
51 u64 phys_blkno, 51 u64 phys_blkno,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 struct ocfs2_alloc_context **meta_ac); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 56 u32 cpos, u32 write_len, u32 max_cpos);
57 57
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..3e78db361bc7
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,844 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.c
5 *
6 * Allocation reservations implementation
7 *
8 * Some code borrowed from fs/ext3/balloc.c and is:
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * The rest is copyright (C) 2010 Novell. All rights reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public
19 * License version 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/highmem.h>
30#include <linux/bitops.h>
31#include <linux/list.h>
32
33#define MLOG_MASK_PREFIX ML_RESERVATIONS
34#include <cluster/masklog.h>
35
36#include "ocfs2.h"
37
38#ifdef CONFIG_OCFS2_DEBUG_FS
39#define OCFS2_CHECK_RESERVATIONS
40#endif
41
42DEFINE_SPINLOCK(resv_lock);
43
44#define OCFS2_MIN_RESV_WINDOW_BITS 8
45#define OCFS2_MAX_RESV_WINDOW_BITS 1024
46
47int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
48{
49 return (osb->osb_resv_level && osb->osb_dir_resv_level);
50}
51
52static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
53 struct ocfs2_alloc_reservation *resv)
54{
55 struct ocfs2_super *osb = resmap->m_osb;
56 unsigned int bits;
57
58 if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
59 /* 8, 16, 32, 64, 128, 256, 512, 1024 */
60 bits = 4 << osb->osb_resv_level;
61 } else {
62 bits = 4 << osb->osb_dir_resv_level;
63 }
64 return bits;
65}
66
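/*
 * So with the default osb_resv_level of 2 (OCFS2_DEFAULT_RESV_LEVEL),
 * a plain file gets a 4 << 2 = 16 bit window, and levels 1 through 8
 * map onto the 8..1024 sizes listed in the comment above. A one-line
 * check of that table (illustrative, standalone):
 */
#include <stdio.h>

int main(void)
{
	int level;

	for (level = 1; level <= 8; level++)
		printf("level %d -> %u bits\n", level, 4u << level);
	return 0;
}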
67static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
68{
69 if (resv->r_len)
70 return resv->r_start + resv->r_len - 1;
71 return resv->r_start;
72}
73
74static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
75{
76 return !!(resv->r_len == 0);
77}
78
79static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
80{
81 if (resmap->m_osb->osb_resv_level == 0)
82 return 1;
83 return 0;
84}
85
86static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
87{
88 struct ocfs2_super *osb = resmap->m_osb;
89 struct rb_node *node;
90 struct ocfs2_alloc_reservation *resv;
91 int i = 0;
92
93 mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
94 osb->dev_str, resmap->m_bitmap_len);
95
96 node = rb_first(&resmap->m_reservations);
97 while (node) {
98 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
99
100 mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
101 "\tlast_len: %u\n", resv->r_start,
102 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
103 resv->r_last_len);
104
105 node = rb_next(node);
106 i++;
107 }
108
109 mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
110
111 i = 0;
112 list_for_each_entry(resv, &resmap->m_lru, r_lru) {
113 mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
114 "last_start: %u\tlast_len: %u\n", i, resv->r_start,
115 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
116 resv->r_last_len);
117
118 i++;
119 }
120}
121
122#ifdef OCFS2_CHECK_RESERVATIONS
123static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
124 int i,
125 struct ocfs2_alloc_reservation *resv)
126{
127 char *disk_bitmap = resmap->m_disk_bitmap;
128 unsigned int start = resv->r_start;
129 unsigned int end = ocfs2_resv_end(resv);
130
131 while (start <= end) {
132 if (ocfs2_test_bit(start, disk_bitmap)) {
133 mlog(ML_ERROR,
134 "reservation %d covers an allocated area "
135 "starting at bit %u!\n", i, start);
136 return 1;
137 }
138
139 start++;
140 }
141 return 0;
142}
143
144static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
145{
146 unsigned int off = 0;
147 int i = 0;
148 struct rb_node *node;
149 struct ocfs2_alloc_reservation *resv;
150
151 node = rb_first(&resmap->m_reservations);
152 while (node) {
153 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
154
155 if (i > 0 && resv->r_start <= off) {
156 mlog(ML_ERROR, "reservation %d has bad start off!\n",
157 i);
158 goto bad;
159 }
160
161 if (resv->r_len == 0) {
162 mlog(ML_ERROR, "reservation %d has no length!\n",
163 i);
164 goto bad;
165 }
166
167 if (resv->r_start > ocfs2_resv_end(resv)) {
168 mlog(ML_ERROR, "reservation %d has invalid range!\n",
169 i);
170 goto bad;
171 }
172
173 if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
174 mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
175 i);
176 goto bad;
177 }
178
179 if (ocfs2_validate_resmap_bits(resmap, i, resv))
180 goto bad;
181
182 off = ocfs2_resv_end(resv);
183 node = rb_next(node);
184
185 i++;
186 }
187 return;
188
189bad:
190 ocfs2_dump_resv(resmap);
191 BUG();
192}
193#else
194static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
195{
196
197}
198#endif
199
200void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
201{
202 memset(resv, 0, sizeof(*resv));
203 INIT_LIST_HEAD(&resv->r_lru);
204}
205
206void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
207 unsigned int flags)
208{
209 BUG_ON(flags & ~OCFS2_RESV_TYPES);
210
211 resv->r_flags |= flags;
212}
213
214int ocfs2_resmap_init(struct ocfs2_super *osb,
215 struct ocfs2_reservation_map *resmap)
216{
217 memset(resmap, 0, sizeof(*resmap));
218
219 resmap->m_osb = osb;
220 resmap->m_reservations = RB_ROOT;
221 /* m_bitmap_len is initialized to zero by the above memset. */
222 INIT_LIST_HEAD(&resmap->m_lru);
223
224 return 0;
225}
226
227static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
228 struct ocfs2_alloc_reservation *resv)
229{
230 assert_spin_locked(&resv_lock);
231
232 if (!list_empty(&resv->r_lru))
233 list_del_init(&resv->r_lru);
234
235 list_add_tail(&resv->r_lru, &resmap->m_lru);
236}
237
238static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
239{
240 resv->r_len = 0;
241 resv->r_start = 0;
242}
243
244static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
245 struct ocfs2_alloc_reservation *resv)
246{
247 if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
248 list_del_init(&resv->r_lru);
249 rb_erase(&resv->r_node, &resmap->m_reservations);
250 resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
251 }
252}
253
254static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
255 struct ocfs2_alloc_reservation *resv)
256{
257 assert_spin_locked(&resv_lock);
258
259 __ocfs2_resv_trunc(resv);
260 /*
261 * last_len and last_start no longer make sense if
262 * we're changing the range of our allocations.
263 */
264 resv->r_last_len = resv->r_last_start = 0;
265
266 ocfs2_resv_remove(resmap, resv);
267}
268
 269/* Does nothing if 'resv' is NULL */
270void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
271 struct ocfs2_alloc_reservation *resv)
272{
273 if (resv) {
274 spin_lock(&resv_lock);
275 __ocfs2_resv_discard(resmap, resv);
276 spin_unlock(&resv_lock);
277 }
278}
279
280static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
281{
282 struct rb_node *node;
283 struct ocfs2_alloc_reservation *resv;
284
285 assert_spin_locked(&resv_lock);
286
287 while ((node = rb_last(&resmap->m_reservations)) != NULL) {
288 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
289
290 __ocfs2_resv_discard(resmap, resv);
291 }
292}
293
294void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
295 unsigned int clen, char *disk_bitmap)
296{
297 if (ocfs2_resmap_disabled(resmap))
298 return;
299
300 spin_lock(&resv_lock);
301
302 ocfs2_resmap_clear_all_resv(resmap);
303 resmap->m_bitmap_len = clen;
304 resmap->m_disk_bitmap = disk_bitmap;
305
306 spin_unlock(&resv_lock);
307}
308
309void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
310{
311 /* Does nothing for now. Keep this around for API symmetry */
312}
313
314static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
315 struct ocfs2_alloc_reservation *new)
316{
317 struct rb_root *root = &resmap->m_reservations;
318 struct rb_node *parent = NULL;
319 struct rb_node **p = &root->rb_node;
320 struct ocfs2_alloc_reservation *tmp;
321
322 assert_spin_locked(&resv_lock);
323
324 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
325 new->r_len);
326
327 while (*p) {
328 parent = *p;
329
330 tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
331
332 if (new->r_start < tmp->r_start) {
333 p = &(*p)->rb_left;
334
335 /*
336 * This is a good place to check for
337 * overlapping reservations.
338 */
339 BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
340 } else if (new->r_start > ocfs2_resv_end(tmp)) {
341 p = &(*p)->rb_right;
342 } else {
343 /* This should never happen! */
344 mlog(ML_ERROR, "Duplicate reservation window!\n");
345 BUG();
346 }
347 }
348
349 rb_link_node(&new->r_node, parent, p);
350 rb_insert_color(&new->r_node, root);
351 new->r_flags |= OCFS2_RESV_FLAG_INUSE;
352
353 ocfs2_resv_mark_lru(resmap, new);
354
355 ocfs2_check_resmap(resmap);
356}
357
358/**
359 * ocfs2_find_resv_lhs() - find the window which contains goal
360 * @resmap: reservation map to search
361 * @goal: which bit to search for
362 *
363 * If a window containing that goal is not found, we return the window
364 * which comes before goal. Returns NULL on empty rbtree or no window
365 * before goal.
366 */
367static struct ocfs2_alloc_reservation *
368ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
369{
370 struct ocfs2_alloc_reservation *resv = NULL;
371 struct ocfs2_alloc_reservation *prev_resv = NULL;
372 struct rb_node *node = resmap->m_reservations.rb_node;
373
374 assert_spin_locked(&resv_lock);
375
376 if (!node)
377 return NULL;
378
379 node = rb_first(&resmap->m_reservations);
380 while (node) {
381 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
382
383 if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
384 break;
385
 386 /* Did we overshoot the reservation just before goal? */
387 if (resv->r_start > goal) {
388 resv = prev_resv;
389 break;
390 }
391
392 prev_resv = resv;
393 node = rb_next(node);
394 }
395
396 return resv;
397}
398
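/*
 * The LHS rule above is easier to see on a flat, sorted list of
 * [start, end] windows. A standalone sketch (illustrative, not the
 * rbtree code): return the window containing 'goal', else the closest
 * window before it, else -1.
 */
#include <stdio.h>

static int find_lhs(const unsigned int win[][2], int n, unsigned int goal)
{
	int i, prev = -1;

	for (i = 0; i < n; i++) {
		if (win[i][0] <= goal && goal <= win[i][1])
			return i;	/* goal inside window i */
		if (win[i][0] > goal)
			return prev;	/* overshot: window before goal */
		prev = i;
	}
	return prev;
}

int main(void)
{
	const unsigned int win[][2] = { {10, 19}, {40, 49}, {80, 99} };

	/* prints -1 (before all), 1 (inside), 1 (window before 60) */
	printf("%d %d %d\n", find_lhs(win, 3, 5), find_lhs(win, 3, 45),
	       find_lhs(win, 3, 60));
	return 0;
}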
399/*
400 * We are given a range within the bitmap, which corresponds to a gap
401 * inside the reservations tree (search_start, search_len). The range
402 * can be anything from the whole bitmap, to a gap between
403 * reservations.
404 *
405 * The start value of *rstart is insignificant.
406 *
407 * This function searches the bitmap range starting at search_start
408 * with length search_len for a set of contiguous free bits. We try
409 * to find up to 'wanted' bits, but can sometimes return less.
410 *
411 * Returns the length of allocation, 0 if no free bits are found.
412 *
413 * *cstart and *clen will also be populated with the result.
414 */
415static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
416 unsigned int wanted,
417 unsigned int search_start,
418 unsigned int search_len,
419 unsigned int *rstart,
420 unsigned int *rlen)
421{
422 void *bitmap = resmap->m_disk_bitmap;
423 unsigned int best_start, best_len = 0;
424 int offset, start, found;
425
426 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
427 wanted, search_start, search_len, resmap->m_bitmap_len);
428
429 found = best_start = best_len = 0;
430
431 start = search_start;
432 while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
433 start)) != -1) {
434 /* Search reached end of the region */
435 if (offset >= (search_start + search_len))
436 break;
437
438 if (offset == start) {
439 /* we found a zero */
440 found++;
441 /* move start to the next bit to test */
442 start++;
443 } else {
444 /* got a zero after some ones */
445 found = 1;
446 start = offset + 1;
447 }
448 if (found > best_len) {
449 best_len = found;
450 best_start = start - found;
451 }
452
453 if (found >= wanted)
454 break;
455 }
456
457 if (best_len == 0)
458 return 0;
459
460 if (best_len >= wanted)
461 best_len = wanted;
462
463 *rlen = best_len;
464 *rstart = best_start;
465
466 mlog(0, "Found start: %u len: %u\n", best_start, best_len);
467
468 return *rlen;
469}
470
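/*
 * The loop above is a longest-run search with an early exit once
 * 'wanted' bits are found. A compressed userspace sketch over a
 * byte-addressed bitmap (illustrative; assumes little-endian bit
 * order within each byte, as ocfs2's bitmap helpers use on x86):
 */
#include <stdio.h>

static unsigned int find_free_run(const unsigned char *bitmap,
				  unsigned int len, unsigned int wanted,
				  unsigned int *rstart)
{
	unsigned int i, run = 0, best_len = 0, best_start = 0;

	for (i = 0; i < len; i++) {
		if (bitmap[i / 8] & (1u << (i % 8))) {
			run = 0;	/* an allocated bit ends the run */
			continue;
		}
		if (++run > best_len) {
			best_len = run;
			best_start = i - run + 1;
		}
		if (best_len >= wanted)
			break;		/* early exit, as above */
	}
	*rstart = best_start;
	return best_len;
}

int main(void)
{
	unsigned char map[2] = { 0x0f, 0x00 }; /* bits 0-3 set, 4-15 free */
	unsigned int start, got = find_free_run(map, 16, 6, &start);

	printf("got %u bits at %u\n", got, start); /* got 6 bits at 4 */
	return 0;
}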
471static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
472 struct ocfs2_alloc_reservation *resv,
473 unsigned int goal, unsigned int wanted)
474{
475 struct rb_root *root = &resmap->m_reservations;
476 unsigned int gap_start, gap_end, gap_len;
477 struct ocfs2_alloc_reservation *prev_resv, *next_resv;
478 struct rb_node *prev, *next;
479 unsigned int cstart, clen;
480 unsigned int best_start = 0, best_len = 0;
481
482 /*
483 * Nasty cases to consider:
484 *
485 * - rbtree is empty
486 * - our window should be first in all reservations
487 * - our window should be last in all reservations
488 * - need to make sure we don't go past end of bitmap
489 */
490
491 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
492 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
493
494 assert_spin_locked(&resv_lock);
495
496 if (RB_EMPTY_ROOT(root)) {
497 /*
498 * Easiest case - empty tree. We can just take
499 * whatever window of free bits we want.
500 */
501
502 mlog(0, "Empty root\n");
503
504 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
505 resmap->m_bitmap_len - goal,
506 &cstart, &clen);
507
508 /*
509 * This should never happen - the local alloc window
510 * will always have free bits when we're called.
511 */
512 BUG_ON(goal == 0 && clen == 0);
513
514 if (clen == 0)
515 return;
516
517 resv->r_start = cstart;
518 resv->r_len = clen;
519
520 ocfs2_resv_insert(resmap, resv);
521 return;
522 }
523
524 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
525
526 if (prev_resv == NULL) {
527 mlog(0, "Goal on LHS of leftmost window\n");
528
529 /*
530 * A NULL here means that the search code couldn't
531 * find a window that starts before goal.
532 *
533 * However, we can take the first window after goal,
534 * which is also by definition, the leftmost window in
535 * the entire tree. If we can find free bits in the
536 * gap between goal and the LHS window, then the
537 * reservation can safely be placed there.
538 *
539 * Otherwise we fall back to a linear search, checking
540 * the gaps in between windows for a place to
541 * allocate.
542 */
543
544 next = rb_first(root);
545 next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
546 r_node);
547
548 /*
 549 * The search should never return such a window (see the
 550 * comment above).
 551 */
552 if (next_resv->r_start <= goal) {
553 mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
554 goal, next_resv->r_start, next_resv->r_len);
555 ocfs2_dump_resv(resmap);
556 BUG();
557 }
558
559 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
560 next_resv->r_start - goal,
561 &cstart, &clen);
562 if (clen) {
563 best_len = clen;
564 best_start = cstart;
565 if (best_len == wanted)
566 goto out_insert;
567 }
568
569 prev_resv = next_resv;
570 next_resv = NULL;
571 }
572
573 prev = &prev_resv->r_node;
574
 575 /* Now we do a linear search for a window, starting at 'prev_resv' */
576 while (1) {
577 next = rb_next(prev);
578 if (next) {
579 mlog(0, "One more resv found in linear search\n");
580 next_resv = rb_entry(next,
581 struct ocfs2_alloc_reservation,
582 r_node);
583
584 gap_start = ocfs2_resv_end(prev_resv) + 1;
585 gap_end = next_resv->r_start - 1;
586 gap_len = gap_end - gap_start + 1;
587 } else {
588 mlog(0, "No next node\n");
589 /*
590 * We're at the rightmost edge of the
591 * tree. See if a reservation between this
592 * window and the end of the bitmap will work.
593 */
594 gap_start = ocfs2_resv_end(prev_resv) + 1;
595 gap_len = resmap->m_bitmap_len - gap_start;
596 gap_end = resmap->m_bitmap_len - 1;
597 }
598
599 /*
600 * No need to check this gap if we have already found
601 * a larger region of free bits.
602 */
603 if (gap_len <= best_len)
604 goto next_resv;
605
606 clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
607 gap_len, &cstart, &clen);
608 if (clen == wanted) {
609 best_len = clen;
610 best_start = cstart;
611 goto out_insert;
612 } else if (clen > best_len) {
613 best_len = clen;
614 best_start = cstart;
615 }
616
617next_resv:
618 if (!next)
619 break;
620
621 prev = next;
622 prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
623 r_node);
624 }
625
626out_insert:
627 if (best_len) {
628 resv->r_start = best_start;
629 resv->r_len = best_len;
630 ocfs2_resv_insert(resmap, resv);
631 }
632}
633
634static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
635 struct ocfs2_alloc_reservation *resv,
636 unsigned int wanted)
637{
638 struct ocfs2_alloc_reservation *lru_resv;
639 int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
640 unsigned int min_bits;
641
642 if (!tmpwindow)
643 min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
644 else
 645 min_bits = wanted; /* We know the temp window will use all
646 * of these bits */
647
648 /*
649 * Take the first reservation off the LRU as our 'target'. We
650 * don't try to be smart about it. There might be a case for
651 * searching based on size but I don't have enough data to be
652 * sure. --Mark (3/16/2010)
653 */
654 lru_resv = list_first_entry(&resmap->m_lru,
655 struct ocfs2_alloc_reservation, r_lru);
656
657 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
658 lru_resv->r_len, ocfs2_resv_end(lru_resv));
659
660 /*
661 * Cannibalize (some or all) of the target reservation and
662 * feed it to the current window.
663 */
664 if (lru_resv->r_len <= min_bits) {
665 /*
666 * Discard completely if size is less than or equal to a
 667 * reasonable threshold - 50% of window bits for non-temporary
668 * windows.
669 */
670 resv->r_start = lru_resv->r_start;
671 resv->r_len = lru_resv->r_len;
672
673 __ocfs2_resv_discard(resmap, lru_resv);
674 } else {
675 unsigned int shrink;
676 if (tmpwindow)
677 shrink = min_bits;
678 else
679 shrink = lru_resv->r_len / 2;
680
681 lru_resv->r_len -= shrink;
682
683 resv->r_start = ocfs2_resv_end(lru_resv) + 1;
684 resv->r_len = shrink;
685 }
686
687 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
688 "r_len: %u r_last_start: %u r_last_len: %u\n",
689 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
690 resv->r_last_start, resv->r_last_len);
691
692 ocfs2_resv_insert(resmap, resv);
693}
694
695static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
696 struct ocfs2_alloc_reservation *resv,
697 unsigned int wanted)
698{
699 unsigned int goal = 0;
700
701 BUG_ON(!ocfs2_resv_empty(resv));
702
703 /*
704 * Begin by trying to get a window as close to the previous
705 * one as possible. Using the most recent allocation as a
706 * start goal makes sense.
707 */
708 if (resv->r_last_len) {
709 goal = resv->r_last_start + resv->r_last_len;
710 if (goal >= resmap->m_bitmap_len)
711 goal = 0;
712 }
713
714 __ocfs2_resv_find_window(resmap, resv, goal, wanted);
715
716 /* Search from last alloc didn't work, try once more from beginning. */
717 if (ocfs2_resv_empty(resv) && goal != 0)
718 __ocfs2_resv_find_window(resmap, resv, 0, wanted);
719
720 if (ocfs2_resv_empty(resv)) {
721 /*
722 * Still empty? Pull oldest one off the LRU, remove it from
 723 * the tree, and put this one in its place.
724 */
725 ocfs2_cannibalize_resv(resmap, resv, wanted);
726 }
727
728 BUG_ON(ocfs2_resv_empty(resv));
729}
730
731int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
732 struct ocfs2_alloc_reservation *resv,
733 int *cstart, int *clen)
734{
735 if (resv == NULL || ocfs2_resmap_disabled(resmap))
736 return -ENOSPC;
737
738 spin_lock(&resv_lock);
739
740 if (ocfs2_resv_empty(resv)) {
741 /*
742 * We don't want to over-allocate for temporary
743 * windows. Otherwise, we run the risk of fragmenting the
744 * allocation space.
745 */
746 unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
747
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 mlog(0, "empty reservation, find new window\n");
752 /*
753 * Try to get a window here. If it works, we must fall
 754 * through and test the bitmap. This avoids some
 755 * ping-ponging of windows due to non-reserved space
 756 * being allocated before we initialize a window for
757 * that inode.
758 */
759 ocfs2_resv_find_window(resmap, resv, wanted);
760 }
761
762 BUG_ON(ocfs2_resv_empty(resv));
763
764 *cstart = resv->r_start;
765 *clen = resv->r_len;
766
767 spin_unlock(&resv_lock);
768 return 0;
769}
770
771static void
772 ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
773 struct ocfs2_alloc_reservation *resv,
774 unsigned int start, unsigned int end)
775{
776 unsigned int rhs = 0;
777 unsigned int old_end = ocfs2_resv_end(resv);
778
779 BUG_ON(start != resv->r_start || old_end < end);
780
781 /*
782 * Completely used? We can remove it then.
783 */
784 if (old_end == end) {
785 __ocfs2_resv_discard(resmap, resv);
786 return;
787 }
788
789 rhs = old_end - end;
790
791 /*
792 * This should have been trapped above.
793 */
794 BUG_ON(rhs == 0);
795
796 resv->r_start = end + 1;
797 resv->r_len = old_end - resv->r_start + 1;
798}
799
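/*
 * Because ocfs2_resmap_resv_bits() always hands out the front of the
 * window (*cstart = resv->r_start), the adjustment above only ever
 * trims from the left. A standalone sketch of that rule, with an
 * illustrative struct in place of ocfs2_alloc_reservation:
 */
#include <assert.h>
#include <stdio.h>

struct window { unsigned int start, len; };

static void claim_front(struct window *w, unsigned int end)
{
	unsigned int old_end = w->start + w->len - 1;

	assert(end <= old_end);
	if (end == old_end) {
		w->start = w->len = 0;	/* fully consumed: discard */
		return;
	}
	w->start = end + 1;		/* shrink from the left */
	w->len = old_end - w->start + 1;
}

int main(void)
{
	struct window w = { .start = 100, .len = 16 };

	claim_front(&w, 103);		/* claim bits 100..103 */
	printf("window now [%u, len %u]\n", w.start, w.len); /* 104, 12 */
	return 0;
}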
800void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
801 struct ocfs2_alloc_reservation *resv,
802 u32 cstart, u32 clen)
803{
804 unsigned int cend = cstart + clen - 1;
805
806 if (resmap == NULL || ocfs2_resmap_disabled(resmap))
807 return;
808
809 if (resv == NULL)
810 return;
811
812 BUG_ON(cstart != resv->r_start);
813
814 spin_lock(&resv_lock);
815
816 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
817 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
818 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
819 resv->r_len, resv->r_last_start, resv->r_last_len);
820
821 BUG_ON(cstart < resv->r_start);
822 BUG_ON(cstart > ocfs2_resv_end(resv));
823 BUG_ON(cend > ocfs2_resv_end(resv));
824
825 ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
826 resv->r_last_start = cstart;
827 resv->r_last_len = clen;
828
829 /*
830 * May have been discarded above from
831 * ocfs2_adjust_resv_from_alloc().
832 */
833 if (!ocfs2_resv_empty(resv))
834 ocfs2_resv_mark_lru(resmap, resv);
835
836 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
837 "r_len: %u r_last_start: %u r_last_len: %u\n",
838 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
839 resv->r_last_start, resv->r_last_len);
840
841 ocfs2_check_resmap(resmap);
842
843 spin_unlock(&resv_lock);
844}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.h
5 *
6 * Allocation reservations function prototypes and structures.
7 *
8 * Copyright (C) 2010 Novell. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_RESERVATIONS_H
21#define OCFS2_RESERVATIONS_H
22
23#include <linux/rbtree.h>
24
25#define OCFS2_DEFAULT_RESV_LEVEL 2
26#define OCFS2_MAX_RESV_LEVEL 9
27#define OCFS2_MIN_RESV_LEVEL 0
28
29struct ocfs2_alloc_reservation {
30 struct rb_node r_node;
31
 32 unsigned int r_start; /* Beginning of current window */
33 unsigned int r_len; /* Length of the window */
34
35 unsigned int r_last_len; /* Length of most recent alloc */
36 unsigned int r_last_start; /* Start of most recent alloc */
37 struct list_head r_lru; /* LRU list head */
38
39 unsigned int r_flags;
40};
41
 42#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of an rbtree */
43#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
 44 * destroyed immediately after use */
45#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
46 * directory btree */
47
48struct ocfs2_reservation_map {
49 struct rb_root m_reservations;
50 char *m_disk_bitmap;
51
52 struct ocfs2_super *m_osb;
53
54 /* The following are not initialized to meaningful values until a disk
55 * bitmap is provided. */
56 u32 m_bitmap_len; /* Number of valid
57 * bits available */
58
59 struct list_head m_lru; /* LRU of reservations
60 * structures. */
61
62};
63
64void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
65
66#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
67void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
68 unsigned int flags);
69
70int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
71
72/**
73 * ocfs2_resv_discard() - truncate a reservation
 74 * @resmap: reservation map the window belongs to
75 * @resv: the reservation to truncate.
76 *
77 * After this function is called, the reservation will be empty, and
78 * unlinked from the rbtree.
79 */
80void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
81 struct ocfs2_alloc_reservation *resv);
82
83
84/**
85 * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
 86 * @osb: struct ocfs2_super that owns this reservation map
 87 * @resmap: struct ocfs2_reservation_map to initialize
 88 *
 89 * Initializes the in-memory fields of @resmap. The disk bitmap
 90 * itself is attached later, via ocfs2_resmap_restart().
 91 * Only possible return value other than '0' is -ENOMEM for failure to
 92 * allocate the mirror bitmap.
93 */
94int ocfs2_resmap_init(struct ocfs2_super *osb,
95 struct ocfs2_reservation_map *resmap);
96
97/**
98 * ocfs2_resmap_restart() - "restart" a reservation bitmap
99 * @resmap: reservations bitmap
100 * @clen: Number of valid bits in the bitmap
101 * @disk_bitmap: the disk bitmap this resmap should refer to.
102 *
103 * Re-initialize the parameters of a reservation bitmap. This is
104 * useful for local alloc window slides.
105 *
 106 * This function will discard all existing reservations (via
 107 * ocfs2_resmap_clear_all_resv()). A future version will recalculate
 108 * existing reservations based on the new bitmap.
109 */
110void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
111 unsigned int clen, char *disk_bitmap);
112
113/**
114 * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
115 * @resmap: the struct ocfs2_reservation_map to uninitialize
116 */
117void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
118
119/**
120 * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
121 * @resmap: reservations bitmap
122 * @resv: reservation to base search from
123 * @cstart: start of proposed allocation
124 * @clen: length (in clusters) of proposed allocation
125 *
126 * Using the reservation data from resv, this function will compare
127 * resmap and resmap->m_disk_bitmap to determine what part (if any) of
128 * the reservation window is still clear to use. If resv is empty,
129 * this function will try to allocate a window for it.
130 *
131 * On success, zero is returned and the valid allocation area is set in cstart
132 * and clen.
133 *
134 * Returns -ENOSPC if reservations are disabled.
135 */
136int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
137 struct ocfs2_alloc_reservation *resv,
138 int *cstart, int *clen);
139
140/**
141 * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
142 * @resmap: reservations bitmap
 143 * @resv: optional reservation to recalculate based on the new bitmap
 144 * @cstart: start of allocation in clusters
 145 * @clen: length of allocation in clusters.
146 *
147 * Tell the reservation code that bits were used to fulfill allocation in
148 * resmap. The bits don't have to have been part of any existing
149 * reservation. But we must always call this function when bits are claimed.
150 * Internally, the reservations code will use this information to mark the
 151 * reservations bitmap. If resv is passed, its next allocation window will be
152 * calculated. It also expects that 'cstart' is the same as we passed back
153 * from ocfs2_resmap_resv_bits().
154 */
155void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
156 struct ocfs2_alloc_reservation *resv,
157 u32 cstart, u32 clen);
158
159#endif /* OCFS2_RESERVATIONS_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 } 135 }
136 136
137 ret = ocfs2_journal_dirty(handle, group_bh); 137 ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142 138
143 /* update the inode accordingly. */ 139 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, 140 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 315 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320 316
321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
322 ocfs2_group_bitmap_size(osb->sb) * 8) { 318 ocfs2_group_bitmap_size(osb->sb, 0,
319 osb->s_feature_incompat) * 8) {
323 mlog(ML_ERROR, "The disk is too old and small. " 320 mlog(ML_ERROR, "The disk is too old and small. "
324 "Force to do offline resize."); 321 "Force to do offline resize.");
325 ret = -EINVAL; 322 ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
500 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 497 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
501 498
502 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 499 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
503 ocfs2_group_bitmap_size(osb->sb) * 8) { 500 ocfs2_group_bitmap_size(osb->sb, 0,
501 osb->s_feature_incompat) * 8) {
504 mlog(ML_ERROR, "The disk is too old and small." 502 mlog(ML_ERROR, "The disk is too old and small."
505 " Force to do offline resize."); 503 " Force to do offline resize.");
506 ret = -EINVAL; 504 ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
545 543
546 group = (struct ocfs2_group_desc *)group_bh->b_data; 544 group = (struct ocfs2_group_desc *)group_bh->b_data;
547 group->bg_next_group = cr->c_blkno; 545 group->bg_next_group = cr->c_blkno;
548 546 ocfs2_journal_dirty(handle, group_bh);
549 ret = ocfs2_journal_dirty(handle, group_bh);
550 if (ret < 0) {
551 mlog_errno(ret);
552 goto out_commit;
553 }
554 547
555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 548 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 549 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 19ba00f28547..849c2f0e0a0e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,32 @@
53 53
54#define OCFS2_MAX_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is
59 contiguous. */
60 u64 sr_bg_stable_blkno; /*
61 * Doesn't change, always
62 * set to target block
63 * group descriptor
64 * block.
65 */
66 u64 sr_blkno; /* The first allocated block */
67 unsigned int sr_bit_offset; /* The bit in the bg */
68 unsigned int sr_bits; /* How many bits we claimed */
69};
70
71static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72{
73 if (res->sr_blkno == 0)
74 return 0;
75
76 if (res->sr_bg_blkno)
77 return res->sr_bg_blkno;
78
79 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80}
81
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 82static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 83static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 84static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +86,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
60 struct inode *alloc_inode, 86 struct inode *alloc_inode,
61 struct buffer_head *bg_bh, 87 struct buffer_head *bg_bh,
62 u64 group_blkno, 88 u64 group_blkno,
89 unsigned int group_clusters,
63 u16 my_chain, 90 u16 my_chain,
64 struct ocfs2_chain_list *cl); 91 struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 92static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +100,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
73 struct buffer_head *group_bh, 100 struct buffer_head *group_bh,
74 u32 bits_wanted, u32 min_bits, 101 u32 bits_wanted, u32 min_bits,
75 u64 max_block, 102 u64 max_block,
76 u16 *bit_off, u16 *bits_found); 103 struct ocfs2_suballoc_result *res);
77static int ocfs2_block_group_search(struct inode *inode, 104static int ocfs2_block_group_search(struct inode *inode,
78 struct buffer_head *group_bh, 105 struct buffer_head *group_bh,
79 u32 bits_wanted, u32 min_bits, 106 u32 bits_wanted, u32 min_bits,
80 u64 max_block, 107 u64 max_block,
81 u16 *bit_off, u16 *bits_found); 108 struct ocfs2_suballoc_result *res);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 109static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
83 struct ocfs2_alloc_context *ac,
84 handle_t *handle, 110 handle_t *handle,
85 u32 bits_wanted, 111 u32 bits_wanted,
86 u32 min_bits, 112 u32 min_bits,
87 u16 *bit_off, 113 struct ocfs2_suballoc_result *res);
88 unsigned int *num_bits,
89 u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 int nr); 115 int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle, 116static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -130,6 +154,11 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
130 } 154 }
131 brelse(ac->ac_bh); 155 brelse(ac->ac_bh);
132 ac->ac_bh = NULL; 156 ac->ac_bh = NULL;
157 ac->ac_resv = NULL;
158 if (ac->ac_find_loc_priv) {
159 kfree(ac->ac_find_loc_priv);
160 ac->ac_find_loc_priv = NULL;
161 }
133} 162}
134 163
135void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 164void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -325,14 +354,38 @@ out:
325 return rc; 354 return rc;
326} 355}
327 356
357static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
358 struct ocfs2_group_desc *bg,
359 struct ocfs2_chain_list *cl,
360 u64 p_blkno, unsigned int clusters)
361{
362 struct ocfs2_extent_list *el = &bg->bg_list;
363 struct ocfs2_extent_rec *rec;
364
365 BUG_ON(!ocfs2_supports_discontig_bg(osb));
366 if (!el->l_next_free_rec)
367 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
368 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
369 rec->e_blkno = cpu_to_le64(p_blkno);
370 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
371 le16_to_cpu(cl->cl_bpc));
372 rec->e_leaf_clusters = cpu_to_le16(clusters);
373 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
374 le16_add_cpu(&bg->bg_free_bits_count,
375 clusters * le16_to_cpu(cl->cl_bpc));
376 le16_add_cpu(&el->l_next_free_rec, 1);
377}
378
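/*
 * The bookkeeping above reduces to: record the region's physical
 * start, give it a logical offset equal to the bits already in the
 * group, then grow the bit totals. A standalone sketch with stand-in
 * types (not the on-disk structures):
 */
#include <stdio.h>

struct extent { unsigned long long blkno; unsigned int cpos, clusters; };

struct group {
	unsigned int bits, free_bits, bpc;	/* bpc: bits per cluster */
	unsigned int nr_recs;
	struct extent recs[4];
};

static void add_region(struct group *g, unsigned long long blkno,
		       unsigned int clusters)
{
	struct extent *rec = &g->recs[g->nr_recs++];

	rec->blkno = blkno;
	rec->cpos = g->bits / g->bpc;	/* logical offset inside group */
	rec->clusters = clusters;
	g->bits += clusters * g->bpc;
	g->free_bits += clusters * g->bpc;
}

int main(void)
{
	struct group g = { .bpc = 4 };

	add_region(&g, 2048, 8);
	add_region(&g, 8192, 4);
	printf("group now %u bits in %u regions\n", g.bits, g.nr_recs);
	return 0;
}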
328static int ocfs2_block_group_fill(handle_t *handle, 379static int ocfs2_block_group_fill(handle_t *handle,
329 struct inode *alloc_inode, 380 struct inode *alloc_inode,
330 struct buffer_head *bg_bh, 381 struct buffer_head *bg_bh,
331 u64 group_blkno, 382 u64 group_blkno,
383 unsigned int group_clusters,
332 u16 my_chain, 384 u16 my_chain,
333 struct ocfs2_chain_list *cl) 385 struct ocfs2_chain_list *cl)
334{ 386{
335 int status = 0; 387 int status = 0;
388 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
336 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 389 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
337 struct super_block * sb = alloc_inode->i_sb; 390 struct super_block * sb = alloc_inode->i_sb;
338 391
@@ -359,19 +412,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
359 memset(bg, 0, sb->s_blocksize); 412 memset(bg, 0, sb->s_blocksize);
360 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 413 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
361 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 414 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
362 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 415 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
363 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 416 osb->s_feature_incompat));
364 bg->bg_chain = cpu_to_le16(my_chain); 417 bg->bg_chain = cpu_to_le16(my_chain);
365 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 418 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
366 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 419 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
367 bg->bg_blkno = cpu_to_le64(group_blkno); 420 bg->bg_blkno = cpu_to_le64(group_blkno);
421 if (group_clusters == le16_to_cpu(cl->cl_cpg))
422 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
423 else
424 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
425 group_clusters);
426
368 /* set the 1st bit in the bitmap to account for the descriptor block */ 427 /* set the 1st bit in the bitmap to account for the descriptor block */
369 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 428 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
370 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 429 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
371 430
372 status = ocfs2_journal_dirty(handle, bg_bh); 431 ocfs2_journal_dirty(handle, bg_bh);
373 if (status < 0)
374 mlog_errno(status);
375 432
376 /* There is no need to zero out or otherwise initialize the 433 /* There is no need to zero out or otherwise initialize the
377 * other blocks in a group - All valid FS metadata in a block 434 * other blocks in a group - All valid FS metadata in a block
@@ -397,6 +454,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
397 return best; 454 return best;
398} 455}
399 456
457static struct buffer_head *
458ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
459 struct inode *alloc_inode,
460 struct ocfs2_alloc_context *ac,
461 struct ocfs2_chain_list *cl)
462{
463 int status;
464 u32 bit_off, num_bits;
465 u64 bg_blkno;
466 struct buffer_head *bg_bh;
467 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
468
469 status = ocfs2_claim_clusters(handle, ac,
470 le16_to_cpu(cl->cl_cpg), &bit_off,
471 &num_bits);
472 if (status < 0) {
473 if (status != -ENOSPC)
474 mlog_errno(status);
475 goto bail;
476 }
477
478 /* setup the group */
479 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
480 mlog(0, "new descriptor, record %u, at block %llu\n",
481 alloc_rec, (unsigned long long)bg_blkno);
482
483 bg_bh = sb_getblk(osb->sb, bg_blkno);
484 if (!bg_bh) {
485 status = -EIO;
486 mlog_errno(status);
487 goto bail;
488 }
489 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
490
491 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
492 bg_blkno, num_bits, alloc_rec, cl);
493 if (status < 0) {
494 brelse(bg_bh);
495 mlog_errno(status);
496 }
497
498bail:
499 return status ? ERR_PTR(status) : bg_bh;
500}
501
502static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
503 handle_t *handle,
504 struct ocfs2_alloc_context *ac,
505 unsigned int min_bits,
506 u32 *bit_off, u32 *num_bits)
507{
508 int status = 0;
509
510 while (min_bits) {
511 status = ocfs2_claim_clusters(handle, ac, min_bits,
512 bit_off, num_bits);
513 if (status != -ENOSPC)
514 break;
515
516 min_bits >>= 1;
517 }
518
519 return status;
520}
521
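/*
 * The helper above is a simple back-off: on -ENOSPC, halve the
 * request and retry until even a single cluster is refused. The same
 * loop against a fake allocator (fake_claim() and the sizes are
 * invented for the example):
 */
#include <errno.h>
#include <stdio.h>

static unsigned int avail = 100;	/* pretend free space */

static int fake_claim(unsigned int want, unsigned int *got)
{
	if (want > avail)
		return -ENOSPC;
	*got = want;
	avail -= want;
	return 0;
}

static int claim_bits(unsigned int min_bits, unsigned int *got)
{
	int status = -ENOSPC;

	while (min_bits) {
		status = fake_claim(min_bits, got);
		if (status != -ENOSPC)
			break;
		min_bits >>= 1;		/* halve and retry */
	}
	return status;
}

int main(void)
{
	unsigned int got = 0;

	if (!claim_bits(512, &got))	/* 512 -> 256 -> 128 -> 64 fits */
		printf("claimed %u clusters\n", got);
	return 0;
}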
522static int ocfs2_block_group_grow_discontig(handle_t *handle,
523 struct inode *alloc_inode,
524 struct buffer_head *bg_bh,
525 struct ocfs2_alloc_context *ac,
526 struct ocfs2_chain_list *cl,
527 unsigned int min_bits)
528{
529 int status;
530 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
531 struct ocfs2_group_desc *bg =
532 (struct ocfs2_group_desc *)bg_bh->b_data;
533 unsigned int needed = le16_to_cpu(cl->cl_cpg) -
534 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
535 u32 p_cpos, clusters;
536 u64 p_blkno;
537 struct ocfs2_extent_list *el = &bg->bg_list;
538
539 status = ocfs2_journal_access_gd(handle,
540 INODE_CACHE(alloc_inode),
541 bg_bh,
542 OCFS2_JOURNAL_ACCESS_CREATE);
543 if (status < 0) {
544 mlog_errno(status);
545 goto bail;
546 }
547
548 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
549 le16_to_cpu(el->l_count))) {
550 if (min_bits > needed)
551 min_bits = needed;
552 status = ocfs2_block_group_claim_bits(osb, handle, ac,
553 min_bits, &p_cpos,
554 &clusters);
555 if (status < 0) {
556 if (status != -ENOSPC)
557 mlog_errno(status);
558 goto bail;
559 }
560 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
561 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
562 clusters);
563
564 min_bits = clusters;
565 needed = le16_to_cpu(cl->cl_cpg) -
566 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
567 }
568
569 if (needed > 0) {
570 /*
 571 * We have used up all the extent recs but still can't
 572 * fill up the cpg. So bail out.
573 */
574 status = -ENOSPC;
575 goto bail;
576 }
577
578 ocfs2_journal_dirty(handle, bg_bh);
579
580bail:
581 return status;
582}
583
584static void ocfs2_bg_alloc_cleanup(handle_t *handle,
585 struct ocfs2_alloc_context *cluster_ac,
586 struct inode *alloc_inode,
587 struct buffer_head *bg_bh)
588{
589 int i, ret;
590 struct ocfs2_group_desc *bg;
591 struct ocfs2_extent_list *el;
592 struct ocfs2_extent_rec *rec;
593
594 if (!bg_bh)
595 return;
596
597 bg = (struct ocfs2_group_desc *)bg_bh->b_data;
598 el = &bg->bg_list;
599 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
600 rec = &el->l_recs[i];
601 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
602 cluster_ac->ac_bh,
603 le64_to_cpu(rec->e_blkno),
604 le32_to_cpu(rec->e_leaf_clusters));
605 if (ret)
606 mlog_errno(ret);
607 /* Try all the clusters to free */
608 }
609
610 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
611 brelse(bg_bh);
612}
613
614static struct buffer_head *
615ocfs2_block_group_alloc_discontig(handle_t *handle,
616 struct inode *alloc_inode,
617 struct ocfs2_alloc_context *ac,
618 struct ocfs2_chain_list *cl)
619{
620 int status;
621 u32 bit_off, num_bits;
622 u64 bg_blkno;
623 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
624 struct buffer_head *bg_bh = NULL;
625 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
626 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
627
628 if (!ocfs2_supports_discontig_bg(osb)) {
629 status = -ENOSPC;
630 goto bail;
631 }
632
633 status = ocfs2_extend_trans(handle,
634 ocfs2_calc_bg_discontig_credits(osb->sb));
635 if (status) {
636 mlog_errno(status);
637 goto bail;
638 }
639
640 /*
641 * We're going to be grabbing from multiple cluster groups.
642 * We don't have enough credits to relink them all, and the
643 * cluster groups will be staying in cache for the duration of
644 * this operation.
645 */
646 ac->ac_allow_chain_relink = 0;
647
648 /* Claim the first region */
649 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
650 &bit_off, &num_bits);
651 if (status < 0) {
652 if (status != -ENOSPC)
653 mlog_errno(status);
654 goto bail;
655 }
656 min_bits = num_bits;
657
658 /* setup the group */
659 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
660 mlog(0, "new descriptor, record %u, at block %llu\n",
661 alloc_rec, (unsigned long long)bg_blkno);
662
663 bg_bh = sb_getblk(osb->sb, bg_blkno);
664 if (!bg_bh) {
665 status = -EIO;
666 mlog_errno(status);
667 goto bail;
668 }
669 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
670
671 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
672 bg_blkno, num_bits, alloc_rec, cl);
673 if (status < 0) {
674 mlog_errno(status);
675 goto bail;
676 }
677
678 status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
679 bg_bh, ac, cl, min_bits);
680 if (status)
681 mlog_errno(status);
682
683bail:
684 if (status)
685 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
686 return status ? ERR_PTR(status) : bg_bh;
687}
688
400/* 689/*
401 * We expect the block group allocator to already be locked. 690 * We expect the block group allocator to already be locked.
402 */ 691 */
@@ -412,9 +701,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
412 struct ocfs2_chain_list *cl; 701 struct ocfs2_chain_list *cl;
413 struct ocfs2_alloc_context *ac = NULL; 702 struct ocfs2_alloc_context *ac = NULL;
414 handle_t *handle = NULL; 703 handle_t *handle = NULL;
415 u32 bit_off, num_bits;
416 u16 alloc_rec; 704 u16 alloc_rec;
417 u64 bg_blkno;
418 struct buffer_head *bg_bh = NULL; 705 struct buffer_head *bg_bh = NULL;
419 struct ocfs2_group_desc *bg; 706 struct ocfs2_group_desc *bg;
420 707
@@ -447,44 +734,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
447 (unsigned long long)*last_alloc_group); 734 (unsigned long long)*last_alloc_group);
448 ac->ac_last_group = *last_alloc_group; 735 ac->ac_last_group = *last_alloc_group;
449 } 736 }
450 status = ocfs2_claim_clusters(osb, 737
451 handle, 738 bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
452 ac, 739 ac, cl);
453 le16_to_cpu(cl->cl_cpg), 740 if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
454 &bit_off, 741 bg_bh = ocfs2_block_group_alloc_discontig(handle,
455 &num_bits); 742 alloc_inode,
456 if (status < 0) { 743 ac, cl);
744 if (IS_ERR(bg_bh)) {
745 status = PTR_ERR(bg_bh);
746 bg_bh = NULL;
457 if (status != -ENOSPC) 747 if (status != -ENOSPC)
458 mlog_errno(status); 748 mlog_errno(status);
459 goto bail; 749 goto bail;
460 } 750 }
461
462 alloc_rec = ocfs2_find_smallest_chain(cl);
463
464 /* setup the group */
465 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
466 mlog(0, "new descriptor, record %u, at block %llu\n",
467 alloc_rec, (unsigned long long)bg_blkno);
468
469 bg_bh = sb_getblk(osb->sb, bg_blkno);
470 if (!bg_bh) {
471 status = -EIO;
472 mlog_errno(status);
473 goto bail;
474 }
475 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
476
477 status = ocfs2_block_group_fill(handle,
478 alloc_inode,
479 bg_bh,
480 bg_blkno,
481 alloc_rec,
482 cl);
483 if (status < 0) {
484 mlog_errno(status);
485 goto bail;
486 }
487
488 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 751 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
489 752
490 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 753 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -494,10 +757,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
494 goto bail; 757 goto bail;
495 } 758 }
496 759
760 alloc_rec = le16_to_cpu(bg->bg_chain);
497 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 761 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
498 le16_to_cpu(bg->bg_free_bits_count)); 762 le16_to_cpu(bg->bg_free_bits_count));
499 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 763 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
500 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 764 le16_to_cpu(bg->bg_bits));
765 cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
501 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 766 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
502 le16_add_cpu(&cl->cl_next_free_rec, 1); 767 le16_add_cpu(&cl->cl_next_free_rec, 1);
503 768
@@ -506,11 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
506 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 771 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
507 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 772 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
508 773
509 status = ocfs2_journal_dirty(handle, bh); 774 ocfs2_journal_dirty(handle, bh);
510 if (status < 0) {
511 mlog_errno(status);
512 goto bail;
513 }
514 775
515 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 776 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
516 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 777 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -760,7 +1021,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
760 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1021 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
761 EXTENT_ALLOC_SYSTEM_INODE, 1022 EXTENT_ALLOC_SYSTEM_INODE,
762 (u32)osb->slot_num, NULL, 1023 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP); 1024 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
764 1025
765 1026
766 if (status >= 0) { 1027 if (status >= 0) {
@@ -946,11 +1207,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
946 status = ocfs2_reserve_local_alloc_bits(osb, 1207 status = ocfs2_reserve_local_alloc_bits(osb,
947 bits_wanted, 1208 bits_wanted,
948 *ac); 1209 *ac);
949 if (status == -EFBIG) { 1210 if ((status < 0) && (status != -ENOSPC)) {
950 /* The local alloc window is outside ac_max_block.
951 * use the main bitmap. */
952 status = -ENOSPC;
953 } else if ((status < 0) && (status != -ENOSPC)) {
954 mlog_errno(status); 1211 mlog_errno(status);
955 goto bail; 1212 goto bail;
956 } 1213 }
@@ -1033,8 +1290,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1033 struct buffer_head *bg_bh, 1290 struct buffer_head *bg_bh,
1034 unsigned int bits_wanted, 1291 unsigned int bits_wanted,
1035 unsigned int total_bits, 1292 unsigned int total_bits,
1036 u16 *bit_off, 1293 struct ocfs2_suballoc_result *res)
1037 u16 *bits_found)
1038{ 1294{
1039 void *bitmap; 1295 void *bitmap;
1040 u16 best_offset, best_size; 1296 u16 best_offset, best_size;
@@ -1078,14 +1334,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1078 } 1334 }
1079 } 1335 }
1080 1336
1081 /* XXX: I think the first clause is equivalent to the second 1337 if (best_size) {
1082 * - jlbec */ 1338 res->sr_bit_offset = best_offset;
1083 if (found == bits_wanted) { 1339 res->sr_bits = best_size;
1084 *bit_off = start - found;
1085 *bits_found = found;
1086 } else if (best_size) {
1087 *bit_off = best_offset;
1088 *bits_found = best_size;
1089 } else { 1340 } else {
1090 status = -ENOSPC; 1341 status = -ENOSPC;
1091 /* No error log here -- see the comment above 1342 /* No error log here -- see the comment above
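From this hunk on, the search helpers report through a single struct ocfs2_suballoc_result instead of the paired bit_off/bits_found out-parameters. The structure's definition is outside this diff; reconstructed from how its fields are used in the rest of the patch, it carries roughly the following (types and comments are assumptions, not the verbatim kernel definition):

/* Assumed shape of the search result, inferred from usage in this patch. */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* group hint; handed back as
					   *suballoc_loc and stored on disk
					   as i_suballoc_loc, cleared to 0
					   for plain contiguous groups */
	u64		sr_bg_stable_blkno; /* block of the descriptor that
					       was actually read; unlike
					       sr_bg_blkno it is never
					       rewritten by the discontig
					       fixup */
	u64		sr_blkno;	/* first allocated disk block */
	unsigned int	sr_bit_offset;	/* first claimed bit in the group */
	unsigned int	sr_bits;	/* number of bits claimed */
};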
@@ -1129,16 +1380,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1129 } 1380 }
1130 1381
1131 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1132
1133 while(num_bits--) 1383 while(num_bits--)
1134 ocfs2_set_bit(bit_off++, bitmap); 1384 ocfs2_set_bit(bit_off++, bitmap);
1135 1385
1136 status = ocfs2_journal_dirty(handle, 1386 ocfs2_journal_dirty(handle, group_bh);
1137 group_bh);
1138 if (status < 0) {
1139 mlog_errno(status);
1140 goto bail;
1141 }
1142 1387
1143bail: 1388bail:
1144 mlog_exit(status); 1389 mlog_exit(status);
@@ -1202,12 +1447,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1202 } 1447 }
1203 1448
1204 prev_bg->bg_next_group = bg->bg_next_group; 1449 prev_bg->bg_next_group = bg->bg_next_group;
1205 1450 ocfs2_journal_dirty(handle, prev_bg_bh);
1206 status = ocfs2_journal_dirty(handle, prev_bg_bh);
1207 if (status < 0) {
1208 mlog_errno(status);
1209 goto out_rollback;
1210 }
1211 1451
1212 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1452 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1213 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1453 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1217,12 +1457,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1217 } 1457 }
1218 1458
1219 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1459 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1220 1460 ocfs2_journal_dirty(handle, bg_bh);
1221 status = ocfs2_journal_dirty(handle, bg_bh);
1222 if (status < 0) {
1223 mlog_errno(status);
1224 goto out_rollback;
1225 }
1226 1461
1227 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1462 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1228 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1463 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1232,14 +1467,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1232 } 1467 }
1233 1468
1234 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1469 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1470 ocfs2_journal_dirty(handle, fe_bh);
1235 1471
1236 status = ocfs2_journal_dirty(handle, fe_bh);
1237 if (status < 0) {
1238 mlog_errno(status);
1239 goto out_rollback;
1240 }
1241
1242 status = 0;
1243out_rollback: 1472out_rollback:
1244 if (status < 0) { 1473 if (status < 0) {
1245 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1474 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1263,14 +1492,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1263 struct buffer_head *group_bh, 1492 struct buffer_head *group_bh,
1264 u32 bits_wanted, u32 min_bits, 1493 u32 bits_wanted, u32 min_bits,
1265 u64 max_block, 1494 u64 max_block,
1266 u16 *bit_off, u16 *bits_found) 1495 struct ocfs2_suballoc_result *res)
1267{ 1496{
1268 int search = -ENOSPC; 1497 int search = -ENOSPC;
1269 int ret; 1498 int ret;
1270 u64 blkoff; 1499 u64 blkoff;
1271 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1500 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1501 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1273 u16 tmp_off, tmp_found;
1274 unsigned int max_bits, gd_cluster_off; 1502 unsigned int max_bits, gd_cluster_off;
1275 1503
1276 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1504 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1297,15 +1525,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1297 1525
1298 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1526 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1299 group_bh, bits_wanted, 1527 group_bh, bits_wanted,
1300 max_bits, 1528 max_bits, res);
1301 &tmp_off, &tmp_found);
1302 if (ret) 1529 if (ret)
1303 return ret; 1530 return ret;
1304 1531
1305 if (max_block) { 1532 if (max_block) {
1306 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1533 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1307 gd_cluster_off + 1534 gd_cluster_off +
1308 tmp_off + tmp_found); 1535 res->sr_bit_offset +
1536 res->sr_bits);
1309 mlog(0, "Checking %llu against %llu\n", 1537 mlog(0, "Checking %llu against %llu\n",
1310 (unsigned long long)blkoff, 1538 (unsigned long long)blkoff,
1311 (unsigned long long)max_block); 1539 (unsigned long long)max_block);
@@ -1317,16 +1545,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1317 * return success, but we still want to return 1545 * return success, but we still want to return
1318 * -ENOSPC unless it found the minimum number 1546 * -ENOSPC unless it found the minimum number
1319 * of bits. */ 1547 * of bits. */
1320 if (min_bits <= tmp_found) { 1548 if (min_bits <= res->sr_bits)
1321 *bit_off = tmp_off;
1322 *bits_found = tmp_found;
1323 search = 0; /* success */ 1549 search = 0; /* success */
1324 } else if (tmp_found) { 1550 else if (res->sr_bits) {
1325 /* 1551 /*
1326 * Don't show bits which we'll be returning 1552 * Don't show bits which we'll be returning
1327 * for allocation to the local alloc bitmap. 1553 * for allocation to the local alloc bitmap.
1328 */ 1554 */
1329 ocfs2_local_alloc_seen_free_bits(osb, tmp_found); 1555 ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1330 } 1556 }
1331 } 1557 }
1332 1558
@@ -1337,7 +1563,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1337 struct buffer_head *group_bh, 1563 struct buffer_head *group_bh,
1338 u32 bits_wanted, u32 min_bits, 1564 u32 bits_wanted, u32 min_bits,
1339 u64 max_block, 1565 u64 max_block,
1340 u16 *bit_off, u16 *bits_found) 1566 struct ocfs2_suballoc_result *res)
1341{ 1567{
1342 int ret = -ENOSPC; 1568 int ret = -ENOSPC;
1343 u64 blkoff; 1569 u64 blkoff;
@@ -1350,10 +1576,10 @@ static int ocfs2_block_group_search(struct inode *inode,
1350 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1576 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1351 group_bh, bits_wanted, 1577 group_bh, bits_wanted,
1352 le16_to_cpu(bg->bg_bits), 1578 le16_to_cpu(bg->bg_bits),
1353 bit_off, bits_found); 1579 res);
1354 if (!ret && max_block) { 1580 if (!ret && max_block) {
1355 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + 1581 blkoff = le64_to_cpu(bg->bg_blkno) +
1356 *bits_found; 1582 res->sr_bit_offset + res->sr_bits;
1357 mlog(0, "Checking %llu against %llu\n", 1583 mlog(0, "Checking %llu against %llu\n",
1358 (unsigned long long)blkoff, 1584 (unsigned long long)blkoff,
1359 (unsigned long long)max_block); 1585 (unsigned long long)max_block);
@@ -1386,33 +1612,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1386 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1612 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1387 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1613 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1388 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1614 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1389 1615 ocfs2_journal_dirty(handle, di_bh);
1390 ret = ocfs2_journal_dirty(handle, di_bh);
1391 if (ret < 0)
1392 mlog_errno(ret);
1393 1616
1394out: 1617out:
1395 return ret; 1618 return ret;
1396} 1619}
1397 1620
1621static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1622 struct ocfs2_extent_rec *rec,
1623 struct ocfs2_chain_list *cl)
1624{
1625 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1626 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1627 unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1628
1629 if (res->sr_bit_offset < bitoff)
1630 return 0;
1631 if (res->sr_bit_offset >= (bitoff + bitcount))
1632 return 0;
1633 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1634 (res->sr_bit_offset - bitoff);
1635 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1636 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1637 return 1;
1638}
1639
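The bit-to-block translation above is easiest to check with numbers. A self-contained userspace restatement (sample values are made up; bpc stands in for the chain list's cl_bpc, the bits-per-cluster count):

#include <assert.h>
#include <stdint.h>

struct sample_rec {		/* stand-in for the ocfs2_extent_rec fields */
	uint32_t e_cpos, e_leaf_clusters;
	uint64_t e_blkno;
};

/* Plain-integer restatement of ocfs2_bg_discontig_fix_by_rec() above */
static int fix_by_rec(uint64_t *blkno, unsigned int bit_off,
		      unsigned int *bits, const struct sample_rec *rec,
		      unsigned int bpc)
{
	unsigned int bitoff = rec->e_cpos * bpc;
	unsigned int bitcount = rec->e_leaf_clusters * bpc;

	if (bit_off < bitoff || bit_off >= bitoff + bitcount)
		return 0;		/* bit lives in another record */
	*blkno = rec->e_blkno + (bit_off - bitoff);
	if (bit_off + *bits > bitoff + bitcount)
		*bits = bitoff + bitcount - bit_off;	/* clamp to record */
	return 1;
}

int main(void)
{
	/* record covers group bits [32, 40) and starts at disk block 5000 */
	struct sample_rec rec = { .e_cpos = 8, .e_leaf_clusters = 2,
				  .e_blkno = 5000 };
	uint64_t blkno = 0;
	unsigned int bits = 8;	/* deliberately runs past bit 40 */

	assert(fix_by_rec(&blkno, 35, &bits, &rec, 4));
	assert(blkno == 5003);	/* 5000 + (35 - 32) */
	assert(bits == 5);	/* clamped at the record boundary */
	return 0;
}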
1640static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1641 struct ocfs2_group_desc *bg,
1642 struct ocfs2_suballoc_result *res)
1643{
1644 int i;
1645 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1646 struct ocfs2_extent_rec *rec;
1647 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1648 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1649
1650 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1651 res->sr_blkno = 0;
1652 return;
1653 }
1654
1655 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1656 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1657 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1658 !bg->bg_list.l_next_free_rec)
1659 return;
1660
1661 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1662 rec = &bg->bg_list.l_recs[i];
1663 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1664 res->sr_bg_blkno = bg_blkno; /* Restore */
1665 break;
1666 }
1667 }
1668}
1669
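The fixup above ends in one of three states, and the distinction matters because sr_bg_blkno is what ocfs2_claim_metadata() and ocfs2_claim_new_inode() hand back as *suballoc_loc (and what eventually lands on disk as i_suballoc_loc; see the ocfs2_free_dinode() hunk below): a cluster bitmap clears sr_blkno entirely, a contiguous group yields sr_blkno = group block + bit offset with the hint zeroed, and a discontiguous group restores the hint once the owning extent record has supplied the real block. A compact restatement of the contiguous case (illustrative values only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t sr_bg_blkno = 4096;	/* sample descriptor block */
	unsigned int sr_bit_offset = 17;
	uint64_t sr_blkno;

	/* contiguous group: bits map 1:1 onto blocks from the group start */
	sr_blkno = sr_bg_blkno + sr_bit_offset;
	sr_bg_blkno = 0;		/* cleared; no hint needed on disk */

	assert(sr_blkno == 4113);
	assert(sr_bg_blkno == 0);
	return 0;
}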
1398static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1670static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1399 handle_t *handle, 1671 handle_t *handle,
1400 u32 bits_wanted, 1672 u32 bits_wanted,
1401 u32 min_bits, 1673 u32 min_bits,
1402 u16 *bit_off, 1674 struct ocfs2_suballoc_result *res,
1403 unsigned int *num_bits,
1404 u64 gd_blkno,
1405 u16 *bits_left) 1675 u16 *bits_left)
1406{ 1676{
1407 int ret; 1677 int ret;
1408 u16 found;
1409 struct buffer_head *group_bh = NULL; 1678 struct buffer_head *group_bh = NULL;
1410 struct ocfs2_group_desc *gd; 1679 struct ocfs2_group_desc *gd;
1411 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1680 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1412 struct inode *alloc_inode = ac->ac_inode; 1681 struct inode *alloc_inode = ac->ac_inode;
1413 1682
1414 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, 1683 ret = ocfs2_read_group_descriptor(alloc_inode, di,
1415 &group_bh); 1684 res->sr_bg_blkno, &group_bh);
1416 if (ret < 0) { 1685 if (ret < 0) {
1417 mlog_errno(ret); 1686 mlog_errno(ret);
1418 return ret; 1687 return ret;
@@ -1420,17 +1689,27 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1420 1689
1421 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1690 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1422 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1691 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1423 ac->ac_max_block, bit_off, &found); 1692 ac->ac_max_block, res);
1424 if (ret < 0) { 1693 if (ret < 0) {
1425 if (ret != -ENOSPC) 1694 if (ret != -ENOSPC)
1426 mlog_errno(ret); 1695 mlog_errno(ret);
1427 goto out; 1696 goto out;
1428 } 1697 }
1429 1698
1430 *num_bits = found; 1699 if (!ret)
1700 ocfs2_bg_discontig_fix_result(ac, gd, res);
1701
1702 /*
1703 * sr_bg_blkno might have been changed by
1704 * ocfs2_bg_discontig_fix_result
1705 */
1706 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1707
1708 if (ac->ac_find_loc_only)
1709 goto out_loc_only;
1431 1710
1432 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1711 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1433 *num_bits, 1712 res->sr_bits,
1434 le16_to_cpu(gd->bg_chain)); 1713 le16_to_cpu(gd->bg_chain));
1435 if (ret < 0) { 1714 if (ret < 0) {
1436 mlog_errno(ret); 1715 mlog_errno(ret);
@@ -1438,10 +1717,11 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1438 } 1717 }
1439 1718
1440 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1719 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1441 *bit_off, *num_bits); 1720 res->sr_bit_offset, res->sr_bits);
1442 if (ret < 0) 1721 if (ret < 0)
1443 mlog_errno(ret); 1722 mlog_errno(ret);
1444 1723
1724out_loc_only:
1445 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1725 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1446 1726
1447out: 1727out:
@@ -1454,14 +1734,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1454 handle_t *handle, 1734 handle_t *handle,
1455 u32 bits_wanted, 1735 u32 bits_wanted,
1456 u32 min_bits, 1736 u32 min_bits,
1457 u16 *bit_off, 1737 struct ocfs2_suballoc_result *res,
1458 unsigned int *num_bits,
1459 u64 *bg_blkno,
1460 u16 *bits_left) 1738 u16 *bits_left)
1461{ 1739{
1462 int status; 1740 int status;
1463 u16 chain, tmp_bits; 1741 u16 chain;
1464 u32 tmp_used;
1465 u64 next_group; 1742 u64 next_group;
1466 struct inode *alloc_inode = ac->ac_inode; 1743 struct inode *alloc_inode = ac->ac_inode;
1467 struct buffer_head *group_bh = NULL; 1744 struct buffer_head *group_bh = NULL;
@@ -1489,8 +1766,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1489 * the 1st group with any empty bits. */ 1766 * the 1st group with any empty bits. */
1490 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1767 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1491 bits_wanted, min_bits, 1768 bits_wanted, min_bits,
1492 ac->ac_max_block, bit_off, 1769 ac->ac_max_block,
1493 &tmp_bits)) == -ENOSPC) { 1770 res)) == -ENOSPC) {
1494 if (!bg->bg_next_group) 1771 if (!bg->bg_next_group)
1495 break; 1772 break;
1496 1773
@@ -1515,11 +1792,19 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1515 } 1792 }
1516 1793
1517 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1794 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1518 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1795 res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1796
1797 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1519 1798
1520 *num_bits = tmp_bits; 1799 BUG_ON(res->sr_bits == 0);
1800 if (!status)
1801 ocfs2_bg_discontig_fix_result(ac, bg, res);
1521 1802
1522 BUG_ON(*num_bits == 0); 1803 /*
1804 * sr_bg_blkno might have been changed by
1805 * ocfs2_bg_discontig_fix_result
1806 */
1807 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1523 1808
1524 /* 1809 /*
1525 * Keep track of previous block descriptor read. When 1810 * Keep track of previous block descriptor read. When
@@ -1536,7 +1821,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1536 */ 1821 */
1537 if (ac->ac_allow_chain_relink && 1822 if (ac->ac_allow_chain_relink &&
1538 (prev_group_bh) && 1823 (prev_group_bh) &&
1539 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1824 (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1540 status = ocfs2_relink_block_group(handle, alloc_inode, 1825 status = ocfs2_relink_block_group(handle, alloc_inode,
1541 ac->ac_bh, group_bh, 1826 ac->ac_bh, group_bh,
1542 prev_group_bh, chain); 1827 prev_group_bh, chain);
@@ -1546,24 +1831,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1546 } 1831 }
1547 } 1832 }
1548 1833
1549 /* Ok, claim our bits now: set the info on dinode, chainlist 1834 if (ac->ac_find_loc_only)
1550 * and then the group */ 1835 goto out_loc_only;
1551 status = ocfs2_journal_access_di(handle,
1552 INODE_CACHE(alloc_inode),
1553 ac->ac_bh,
1554 OCFS2_JOURNAL_ACCESS_WRITE);
1555 if (status < 0) {
1556 mlog_errno(status);
1557 goto bail;
1558 }
1559
1560 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1561 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1562 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1563 1836
1564 status = ocfs2_journal_dirty(handle, 1837 status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1565 ac->ac_bh); 1838 ac->ac_bh, res->sr_bits,
1566 if (status < 0) { 1839 chain);
1840 if (status) {
1567 mlog_errno(status); 1841 mlog_errno(status);
1568 goto bail; 1842 goto bail;
1569 } 1843 }
@@ -1572,17 +1846,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1572 alloc_inode, 1846 alloc_inode,
1573 bg, 1847 bg,
1574 group_bh, 1848 group_bh,
1575 *bit_off, 1849 res->sr_bit_offset,
1576 *num_bits); 1850 res->sr_bits);
1577 if (status < 0) { 1851 if (status < 0) {
1578 mlog_errno(status); 1852 mlog_errno(status);
1579 goto bail; 1853 goto bail;
1580 } 1854 }
1581 1855
1582 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1856 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1583 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1857 (unsigned long long)le64_to_cpu(fe->i_blkno));
1584 1858
1585 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1859out_loc_only:
1586 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1860 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1587bail: 1861bail:
1588 brelse(group_bh); 1862 brelse(group_bh);
@@ -1593,19 +1867,16 @@ bail:
1593} 1867}
1594 1868
1595/* will give out up to bits_wanted contiguous bits. */ 1869/* will give out up to bits_wanted contiguous bits. */
1596static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 1870static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1597 struct ocfs2_alloc_context *ac,
1598 handle_t *handle, 1871 handle_t *handle,
1599 u32 bits_wanted, 1872 u32 bits_wanted,
1600 u32 min_bits, 1873 u32 min_bits,
1601 u16 *bit_off, 1874 struct ocfs2_suballoc_result *res)
1602 unsigned int *num_bits,
1603 u64 *bg_blkno)
1604{ 1875{
1605 int status; 1876 int status;
1606 u16 victim, i; 1877 u16 victim, i;
1607 u16 bits_left = 0; 1878 u16 bits_left = 0;
1608 u64 hint_blkno = ac->ac_last_group; 1879 u64 hint = ac->ac_last_group;
1609 struct ocfs2_chain_list *cl; 1880 struct ocfs2_chain_list *cl;
1610 struct ocfs2_dinode *fe; 1881 struct ocfs2_dinode *fe;
1611 1882
@@ -1623,7 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1623 1894
1624 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1895 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1625 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1896 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1626 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1897 ocfs2_error(ac->ac_inode->i_sb,
1898 "Chain allocator dinode %llu has %u used "
1627 "bits but only %u total.", 1899 "bits but only %u total.",
1628 (unsigned long long)le64_to_cpu(fe->i_blkno), 1900 (unsigned long long)le64_to_cpu(fe->i_blkno),
1629 le32_to_cpu(fe->id1.bitmap1.i_used), 1901 le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1632,22 +1904,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1632 goto bail; 1904 goto bail;
1633 } 1905 }
1634 1906
1635 if (hint_blkno) { 1907 res->sr_bg_blkno = hint;
1908 if (res->sr_bg_blkno) {
1636 /* Attempt to short-circuit the usual search mechanism 1909 /* Attempt to short-circuit the usual search mechanism
1637 * by jumping straight to the most recently used 1910 * by jumping straight to the most recently used
1638 * allocation group. This helps us maintain some 1911 * allocation group. This helps us maintain some
1639 * contiguity across allocations. */ 1912 * contiguity across allocations. */
1640 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1913 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1641 min_bits, bit_off, num_bits, 1914 min_bits, res, &bits_left);
1642 hint_blkno, &bits_left); 1915 if (!status)
1643 if (!status) {
1644 /* Be careful to update *bg_blkno here as the
1645 * caller is expecting it to be filled in, and
1646 * ocfs2_search_one_group() won't do that for
1647 * us. */
1648 *bg_blkno = hint_blkno;
1649 goto set_hint; 1916 goto set_hint;
1650 }
1651 if (status < 0 && status != -ENOSPC) { 1917 if (status < 0 && status != -ENOSPC) {
1652 mlog_errno(status); 1918 mlog_errno(status);
1653 goto bail; 1919 goto bail;
@@ -1660,10 +1926,12 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1660 ac->ac_chain = victim; 1926 ac->ac_chain = victim;
1661 ac->ac_allow_chain_relink = 1; 1927 ac->ac_allow_chain_relink = 1;
1662 1928
1663 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, 1929 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1664 num_bits, bg_blkno, &bits_left); 1930 res, &bits_left);
1665 if (!status) 1931 if (!status) {
1932 hint = ocfs2_group_from_res(res);
1666 goto set_hint; 1933 goto set_hint;
1934 }
1667 if (status < 0 && status != -ENOSPC) { 1935 if (status < 0 && status != -ENOSPC) {
1668 mlog_errno(status); 1936 mlog_errno(status);
1669 goto bail; 1937 goto bail;
@@ -1685,10 +1953,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1685 1953
1686 ac->ac_chain = i; 1954 ac->ac_chain = i;
1687 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1955 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1688 bit_off, num_bits, bg_blkno, 1956 res, &bits_left);
1689 &bits_left); 1957 if (!status) {
1690 if (!status) 1958 hint = ocfs2_group_from_res(res);
1691 break; 1959 break;
1960 }
1692 if (status < 0 && status != -ENOSPC) { 1961 if (status < 0 && status != -ENOSPC) {
1693 mlog_errno(status); 1962 mlog_errno(status);
1694 goto bail; 1963 goto bail;
@@ -1703,7 +1972,7 @@ set_hint:
1703 if (bits_left < min_bits) 1972 if (bits_left < min_bits)
1704 ac->ac_last_group = 0; 1973 ac->ac_last_group = 0;
1705 else 1974 else
1706 ac->ac_last_group = *bg_blkno; 1975 ac->ac_last_group = hint;
1707 } 1976 }
1708 1977
1709bail: 1978bail:
@@ -1711,37 +1980,37 @@ bail:
1711 return status; 1980 return status;
1712} 1981}
1713 1982
1714int ocfs2_claim_metadata(struct ocfs2_super *osb, 1983int ocfs2_claim_metadata(handle_t *handle,
1715 handle_t *handle,
1716 struct ocfs2_alloc_context *ac, 1984 struct ocfs2_alloc_context *ac,
1717 u32 bits_wanted, 1985 u32 bits_wanted,
1986 u64 *suballoc_loc,
1718 u16 *suballoc_bit_start, 1987 u16 *suballoc_bit_start,
1719 unsigned int *num_bits, 1988 unsigned int *num_bits,
1720 u64 *blkno_start) 1989 u64 *blkno_start)
1721{ 1990{
1722 int status; 1991 int status;
1723 u64 bg_blkno; 1992 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1724 1993
1725 BUG_ON(!ac); 1994 BUG_ON(!ac);
1726 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1995 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1727 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1996 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1728 1997
1729 status = ocfs2_claim_suballoc_bits(osb, 1998 status = ocfs2_claim_suballoc_bits(ac,
1730 ac,
1731 handle, 1999 handle,
1732 bits_wanted, 2000 bits_wanted,
1733 1, 2001 1,
1734 suballoc_bit_start, 2002 &res);
1735 num_bits,
1736 &bg_blkno);
1737 if (status < 0) { 2003 if (status < 0) {
1738 mlog_errno(status); 2004 mlog_errno(status);
1739 goto bail; 2005 goto bail;
1740 } 2006 }
1741 atomic_inc(&osb->alloc_stats.bg_allocs); 2007 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1742 2008
1743 *blkno_start = bg_blkno + (u64) *suballoc_bit_start; 2009 *suballoc_loc = res.sr_bg_blkno;
1744 ac->ac_bits_given += (*num_bits); 2010 *suballoc_bit_start = res.sr_bit_offset;
2011 *blkno_start = res.sr_blkno;
2012 ac->ac_bits_given += res.sr_bits;
2013 *num_bits = res.sr_bits;
1745 status = 0; 2014 status = 0;
1746bail: 2015bail:
1747 mlog_exit(status); 2016 mlog_exit(status);
@@ -1749,10 +2018,10 @@ bail:
1749} 2018}
1750 2019
1751static void ocfs2_init_inode_ac_group(struct inode *dir, 2020static void ocfs2_init_inode_ac_group(struct inode *dir,
1752 struct buffer_head *parent_fe_bh, 2021 struct buffer_head *parent_di_bh,
1753 struct ocfs2_alloc_context *ac) 2022 struct ocfs2_alloc_context *ac)
1754{ 2023{
1755 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; 2024 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1756 /* 2025 /*
1757 * Try to allocate inodes from some specific group. 2026 * Try to allocate inodes from some specific group.
1758 * 2027 *
@@ -1766,10 +2035,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
1766 if (OCFS2_I(dir)->ip_last_used_group && 2035 if (OCFS2_I(dir)->ip_last_used_group &&
1767 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2036 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1768 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2037 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1769 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) 2038 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
1770 ac->ac_last_group = ocfs2_which_suballoc_group( 2039 if (di->i_suballoc_loc)
1771 le64_to_cpu(fe->i_blkno), 2040 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
1772 le16_to_cpu(fe->i_suballoc_bit)); 2041 else
2042 ac->ac_last_group = ocfs2_which_suballoc_group(
2043 le64_to_cpu(di->i_blkno),
2044 le16_to_cpu(di->i_suballoc_bit));
2045 }
1773} 2046}
1774 2047
1775static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2048static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1779,17 +2052,146 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1779 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2052 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1780} 2053}
1781 2054
1782int ocfs2_claim_new_inode(struct ocfs2_super *osb, 2055int ocfs2_find_new_inode_loc(struct inode *dir,
1783 handle_t *handle, 2056 struct buffer_head *parent_fe_bh,
2057 struct ocfs2_alloc_context *ac,
2058 u64 *fe_blkno)
2059{
2060 int ret;
2061 handle_t *handle = NULL;
2062 struct ocfs2_suballoc_result *res;
2063
2064 BUG_ON(!ac);
2065 BUG_ON(ac->ac_bits_given != 0);
2066 BUG_ON(ac->ac_bits_wanted != 1);
2067 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2068
2069 res = kzalloc(sizeof(*res), GFP_NOFS);
2070 if (res == NULL) {
2071 ret = -ENOMEM;
2072 mlog_errno(ret);
2073 goto out;
2074 }
2075
2076 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2077
2078 /*
2079 * The handle started here is for chain relink. Alternatively,
2080 * we could just disable relink for these calls.
2081 */
2082 handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2083 if (IS_ERR(handle)) {
2084 ret = PTR_ERR(handle);
2085 handle = NULL;
2086 mlog_errno(ret);
2087 goto out;
2088 }
2089
2090 /*
2091 * This will instruct ocfs2_claim_suballoc_bits and
2092 * ocfs2_search_one_group to search but save actual allocation
2093 * for later.
2094 */
2095 ac->ac_find_loc_only = 1;
2096
2097 ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2098 if (ret < 0) {
2099 mlog_errno(ret);
2100 goto out;
2101 }
2102
2103 ac->ac_find_loc_priv = res;
2104 *fe_blkno = res->sr_blkno;
2105
2106out:
2107 if (handle)
2108 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2109
2110 if (ret)
2111 kfree(res);
2112
2113 return ret;
2114}
2115
2116int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2117 struct inode *dir,
2118 struct ocfs2_alloc_context *ac,
2119 u64 *suballoc_loc,
2120 u16 *suballoc_bit,
2121 u64 di_blkno)
2122{
2123 int ret;
2124 u16 chain;
2125 struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2126 struct buffer_head *bg_bh = NULL;
2127 struct ocfs2_group_desc *bg;
2128 struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2129
2130 /*
2131 * Since di_blkno is being passed back in, we check for any
2132 * inconsistencies which may have happened between
2133 * calls. These are code bugs as di_blkno is not expected to
2134 * change once returned from ocfs2_find_new_inode_loc().
2135 */
2136 BUG_ON(res->sr_blkno != di_blkno);
2137
2138 ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2139 res->sr_bg_stable_blkno, &bg_bh);
2140 if (ret) {
2141 mlog_errno(ret);
2142 goto out;
2143 }
2144
2145 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2146 chain = le16_to_cpu(bg->bg_chain);
2147
2148 ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2149 ac->ac_bh, res->sr_bits,
2150 chain);
2151 if (ret) {
2152 mlog_errno(ret);
2153 goto out;
2154 }
2155
2156 ret = ocfs2_block_group_set_bits(handle,
2157 ac->ac_inode,
2158 bg,
2159 bg_bh,
2160 res->sr_bit_offset,
2161 res->sr_bits);
2162 if (ret < 0) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166
2167 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
2168 (unsigned long long)di_blkno);
2169
2170 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2171
2172 BUG_ON(res->sr_bits != 1);
2173
2174 *suballoc_loc = res->sr_bg_blkno;
2175 *suballoc_bit = res->sr_bit_offset;
2176 ac->ac_bits_given++;
2177 ocfs2_save_inode_ac_group(dir, ac);
2178
2179out:
2180 brelse(bg_bh);
2181
2182 return ret;
2183}
2184
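Together with ocfs2_find_new_inode_loc() above, this completes a two-phase allocation: the search (phase one) runs in its own short transaction and stashes the result in ac_find_loc_priv, while the claim (phase two) replays exactly that location inside the caller's transaction. A sketch of the intended calling sequence (hypothetical caller; per the header comment added below, the real consumer is ocfs2_create_inode_in_orphan()):

static int two_phase_inode_alloc(handle_t *handle, struct inode *dir,
				 struct buffer_head *parent_fe_bh,
				 struct ocfs2_alloc_context *inode_ac,
				 u64 *suballoc_loc, u16 *suballoc_bit,
				 u64 *di_blkno)
{
	int ret;

	/* Phase 1: find and remember a location; nothing is claimed yet */
	ret = ocfs2_find_new_inode_loc(dir, parent_fe_bh, inode_ac,
				       di_blkno);
	if (ret)
		return ret;

	/* ... the caller may take locks and start its transaction here ... */

	/* Phase 2: claim exactly the block found above */
	return ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
					    suballoc_loc, suballoc_bit,
					    *di_blkno);
}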
2185int ocfs2_claim_new_inode(handle_t *handle,
1784 struct inode *dir, 2186 struct inode *dir,
1785 struct buffer_head *parent_fe_bh, 2187 struct buffer_head *parent_fe_bh,
1786 struct ocfs2_alloc_context *ac, 2188 struct ocfs2_alloc_context *ac,
2189 u64 *suballoc_loc,
1787 u16 *suballoc_bit, 2190 u16 *suballoc_bit,
1788 u64 *fe_blkno) 2191 u64 *fe_blkno)
1789{ 2192{
1790 int status; 2193 int status;
1791 unsigned int num_bits; 2194 struct ocfs2_suballoc_result res;
1792 u64 bg_blkno;
1793 2195
1794 mlog_entry_void(); 2196 mlog_entry_void();
1795 2197
@@ -1800,23 +2202,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1800 2202
1801 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2203 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1802 2204
1803 status = ocfs2_claim_suballoc_bits(osb, 2205 status = ocfs2_claim_suballoc_bits(ac,
1804 ac,
1805 handle, 2206 handle,
1806 1, 2207 1,
1807 1, 2208 1,
1808 suballoc_bit, 2209 &res);
1809 &num_bits,
1810 &bg_blkno);
1811 if (status < 0) { 2210 if (status < 0) {
1812 mlog_errno(status); 2211 mlog_errno(status);
1813 goto bail; 2212 goto bail;
1814 } 2213 }
1815 atomic_inc(&osb->alloc_stats.bg_allocs); 2214 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1816 2215
1817 BUG_ON(num_bits != 1); 2216 BUG_ON(res.sr_bits != 1);
1818 2217
1819 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 2218 *suballoc_loc = res.sr_bg_blkno;
2219 *suballoc_bit = res.sr_bit_offset;
2220 *fe_blkno = res.sr_blkno;
1820 ac->ac_bits_given++; 2221 ac->ac_bits_given++;
1821 ocfs2_save_inode_ac_group(dir, ac); 2222 ocfs2_save_inode_ac_group(dir, ac);
1822 status = 0; 2223 status = 0;
@@ -1886,8 +2287,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1886 * contig. allocation, set to '1' to indicate we can deal with extents 2287 * contig. allocation, set to '1' to indicate we can deal with extents
1887 * of any size. 2288 * of any size.
1888 */ 2289 */
1889int __ocfs2_claim_clusters(struct ocfs2_super *osb, 2290int __ocfs2_claim_clusters(handle_t *handle,
1890 handle_t *handle,
1891 struct ocfs2_alloc_context *ac, 2291 struct ocfs2_alloc_context *ac,
1892 u32 min_clusters, 2292 u32 min_clusters,
1893 u32 max_clusters, 2293 u32 max_clusters,
@@ -1896,8 +2296,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1896{ 2296{
1897 int status; 2297 int status;
1898 unsigned int bits_wanted = max_clusters; 2298 unsigned int bits_wanted = max_clusters;
1899 u64 bg_blkno = 0; 2299 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1900 u16 bg_bit_off; 2300 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
1901 2301
1902 mlog_entry_void(); 2302 mlog_entry_void();
1903 2303
@@ -1907,6 +2307,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1907 && ac->ac_which != OCFS2_AC_USE_MAIN); 2307 && ac->ac_which != OCFS2_AC_USE_MAIN);
1908 2308
1909 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2309 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2310 WARN_ON(min_clusters > 1);
2311
1910 status = ocfs2_claim_local_alloc_bits(osb, 2312 status = ocfs2_claim_local_alloc_bits(osb,
1911 handle, 2313 handle,
1912 ac, 2314 ac,
@@ -1929,20 +2331,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1929 if (bits_wanted > (osb->bitmap_cpg - 1)) 2331 if (bits_wanted > (osb->bitmap_cpg - 1))
1930 bits_wanted = osb->bitmap_cpg - 1; 2332 bits_wanted = osb->bitmap_cpg - 1;
1931 2333
1932 status = ocfs2_claim_suballoc_bits(osb, 2334 status = ocfs2_claim_suballoc_bits(ac,
1933 ac,
1934 handle, 2335 handle,
1935 bits_wanted, 2336 bits_wanted,
1936 min_clusters, 2337 min_clusters,
1937 &bg_bit_off, 2338 &res);
1938 num_clusters,
1939 &bg_blkno);
1940 if (!status) { 2339 if (!status) {
2340 BUG_ON(res.sr_blkno); /* cluster alloc can't set */
1941 *cluster_start = 2341 *cluster_start =
1942 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2342 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1943 bg_blkno, 2343 res.sr_bg_blkno,
1944 bg_bit_off); 2344 res.sr_bit_offset);
1945 atomic_inc(&osb->alloc_stats.bitmap_data); 2345 atomic_inc(&osb->alloc_stats.bitmap_data);
2346 *num_clusters = res.sr_bits;
1946 } 2347 }
1947 } 2348 }
1948 if (status < 0) { 2349 if (status < 0) {
@@ -1958,8 +2359,7 @@ bail:
1958 return status; 2359 return status;
1959} 2360}
1960 2361
1961int ocfs2_claim_clusters(struct ocfs2_super *osb, 2362int ocfs2_claim_clusters(handle_t *handle,
1962 handle_t *handle,
1963 struct ocfs2_alloc_context *ac, 2363 struct ocfs2_alloc_context *ac,
1964 u32 min_clusters, 2364 u32 min_clusters,
1965 u32 *cluster_start, 2365 u32 *cluster_start,
@@ -1967,7 +2367,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1967{ 2367{
1968 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2368 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1969 2369
1970 return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, 2370 return __ocfs2_claim_clusters(handle, ac, min_clusters,
1971 bits_wanted, cluster_start, num_clusters); 2371 bits_wanted, cluster_start, num_clusters);
1972} 2372}
1973 2373
@@ -2023,9 +2423,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2023 if (undo_fn) 2423 if (undo_fn)
2024 jbd_unlock_bh_state(group_bh); 2424 jbd_unlock_bh_state(group_bh);
2025 2425
2026 status = ocfs2_journal_dirty(handle, group_bh); 2426 ocfs2_journal_dirty(handle, group_bh);
2027 if (status < 0)
2028 mlog_errno(status);
2029bail: 2427bail:
2030 return status; 2428 return status;
2031} 2429}
@@ -2092,12 +2490,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2092 count); 2490 count);
2093 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2491 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2094 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2492 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2095 2493 ocfs2_journal_dirty(handle, alloc_bh);
2096 status = ocfs2_journal_dirty(handle, alloc_bh);
2097 if (status < 0) {
2098 mlog_errno(status);
2099 goto bail;
2100 }
2101 2494
2102bail: 2495bail:
2103 brelse(group_bh); 2496 brelse(group_bh);
@@ -2126,6 +2519,8 @@ int ocfs2_free_dinode(handle_t *handle,
2126 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2519 u16 bit = le16_to_cpu(di->i_suballoc_bit);
2127 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2520 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2128 2521
2522 if (di->i_suballoc_loc)
2523 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2129 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2524 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2130 inode_alloc_bh, bit, bg_blkno, 1); 2525 inode_alloc_bh, bit, bg_blkno, 1);
2131} 2526}
@@ -2338,7 +2733,8 @@ out:
2338 * suballoc_bit. 2733 * suballoc_bit.
2339 */ 2734 */
2340static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 2735static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2341 u16 *suballoc_slot, u16 *suballoc_bit) 2736 u16 *suballoc_slot, u64 *group_blkno,
2737 u16 *suballoc_bit)
2342{ 2738{
2343 int status; 2739 int status;
2344 struct buffer_head *inode_bh = NULL; 2740 struct buffer_head *inode_bh = NULL;
@@ -2375,6 +2771,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2375 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 2771 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2376 if (suballoc_bit) 2772 if (suballoc_bit)
2377 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 2773 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2774 if (group_blkno)
2775 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2378 2776
2379bail: 2777bail:
2380 brelse(inode_bh); 2778 brelse(inode_bh);
@@ -2392,10 +2790,11 @@ bail:
2392 */ 2790 */
2393static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 2791static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2394 struct inode *suballoc, 2792 struct inode *suballoc,
2395 struct buffer_head *alloc_bh, u64 blkno, 2793 struct buffer_head *alloc_bh,
2794 u64 group_blkno, u64 blkno,
2396 u16 bit, int *res) 2795 u16 bit, int *res)
2397{ 2796{
2398 struct ocfs2_dinode *alloc_fe; 2797 struct ocfs2_dinode *alloc_di;
2399 struct ocfs2_group_desc *group; 2798 struct ocfs2_group_desc *group;
2400 struct buffer_head *group_bh = NULL; 2799 struct buffer_head *group_bh = NULL;
2401 u64 bg_blkno; 2800 u64 bg_blkno;
@@ -2404,17 +2803,18 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2404 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2803 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2405 (unsigned int)bit); 2804 (unsigned int)bit);
2406 2805
2407 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; 2806 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2408 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { 2807 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2409 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2808 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2410 (unsigned int)bit, 2809 (unsigned int)bit,
2411 ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); 2810 ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2412 status = -EINVAL; 2811 status = -EINVAL;
2413 goto bail; 2812 goto bail;
2414 } 2813 }
2415 2814
2416 bg_blkno = ocfs2_which_suballoc_group(blkno, bit); 2815 bg_blkno = group_blkno ? group_blkno :
2417 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, 2816 ocfs2_which_suballoc_group(blkno, bit);
2817 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2418 &group_bh); 2818 &group_bh);
2419 if (status < 0) { 2819 if (status < 0) {
2420 mlog(ML_ERROR, "read group %llu failed %d\n", 2820 mlog(ML_ERROR, "read group %llu failed %d\n",
@@ -2448,6 +2848,7 @@ bail:
2448int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 2848int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2449{ 2849{
2450 int status; 2850 int status;
2851 u64 group_blkno = 0;
2451 u16 suballoc_bit = 0, suballoc_slot = 0; 2852 u16 suballoc_bit = 0, suballoc_slot = 0;
2452 struct inode *inode_alloc_inode; 2853 struct inode *inode_alloc_inode;
2453 struct buffer_head *alloc_bh = NULL; 2854 struct buffer_head *alloc_bh = NULL;
@@ -2455,7 +2856,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2455 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2856 mlog_entry("blkno: %llu", (unsigned long long)blkno);
2456 2857
2457 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2858 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2458 &suballoc_bit); 2859 &group_blkno, &suballoc_bit);
2459 if (status < 0) { 2860 if (status < 0) {
2460 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 2861 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2461 goto bail; 2862 goto bail;
@@ -2483,7 +2884,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2483 } 2884 }
2484 2885
2485 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 2886 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2486 blkno, suballoc_bit, res); 2887 group_blkno, blkno, suballoc_bit, res);
2487 if (status < 0) 2888 if (status < 0)
2488 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 2889 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2489 2890
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e0f46df357e6..b8afabfeede4 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
26#ifndef _CHAINALLOC_H_ 26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_ 27#define _CHAINALLOC_H_
28 28
29struct ocfs2_suballoc_result;
29typedef int (group_search_t)(struct inode *, 30typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 31 struct buffer_head *,
31 u32, /* bits_wanted */ 32 u32, /* bits_wanted */
32 u32, /* min_bits */ 33 u32, /* min_bits */
33 u64, /* max_block */ 34 u64, /* max_block */
34 u16 *, /* *bit_off */ 35 struct ocfs2_suballoc_result *);
35 u16 *); /* *bits_found */ 36 /* found bits */
36 37
37struct ocfs2_alloc_context { 38struct ocfs2_alloc_context {
38 struct inode *ac_inode; /* which bitmap are we allocating from? */ 39 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,11 @@ struct ocfs2_alloc_context {
54 u64 ac_last_group; 55 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 the same as ~0 - unlimited */ 57 the same as ~0 - unlimited */
58
59 int ac_find_loc_only; /* hack for reflink operation ordering */
60 struct ocfs2_suballoc_result *ac_find_loc_priv; /* result saved by ocfs2_find_new_inode_loc() */ 61
61
62 struct ocfs2_alloc_reservation *ac_resv;
57}; 63};
58 64
59void ocfs2_init_steal_slots(struct ocfs2_super *osb); 65void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +86,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
80 u32 bits_wanted, 86 u32 bits_wanted,
81 struct ocfs2_alloc_context **ac); 87 struct ocfs2_alloc_context **ac);
82 88
83int ocfs2_claim_metadata(struct ocfs2_super *osb, 89int ocfs2_claim_metadata(handle_t *handle,
84 handle_t *handle,
85 struct ocfs2_alloc_context *ac, 90 struct ocfs2_alloc_context *ac,
86 u32 bits_wanted, 91 u32 bits_wanted,
92 u64 *suballoc_loc,
87 u16 *suballoc_bit_start, 93 u16 *suballoc_bit_start,
88 u32 *num_bits, 94 u32 *num_bits,
89 u64 *blkno_start); 95 u64 *blkno_start);
90int ocfs2_claim_new_inode(struct ocfs2_super *osb, 96int ocfs2_claim_new_inode(handle_t *handle,
91 handle_t *handle,
92 struct inode *dir, 97 struct inode *dir,
93 struct buffer_head *parent_fe_bh, 98 struct buffer_head *parent_fe_bh,
94 struct ocfs2_alloc_context *ac, 99 struct ocfs2_alloc_context *ac,
100 u64 *suballoc_loc,
95 u16 *suballoc_bit, 101 u16 *suballoc_bit,
96 u64 *fe_blkno); 102 u64 *fe_blkno);
97int ocfs2_claim_clusters(struct ocfs2_super *osb, 103int ocfs2_claim_clusters(handle_t *handle,
98 handle_t *handle,
99 struct ocfs2_alloc_context *ac, 104 struct ocfs2_alloc_context *ac,
100 u32 min_clusters, 105 u32 min_clusters,
101 u32 *cluster_start, 106 u32 *cluster_start,
@@ -104,8 +109,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
104 * Use this variant of ocfs2_claim_clusters to specify a maximum 109 * Use this variant of ocfs2_claim_clusters to specify a maximum
105 * number of clusters smaller than the allocation reserved. 110 * number of clusters smaller than the allocation reserved.
106 */ 111 */
107int __ocfs2_claim_clusters(struct ocfs2_super *osb, 112int __ocfs2_claim_clusters(handle_t *handle,
108 handle_t *handle,
109 struct ocfs2_alloc_context *ac, 113 struct ocfs2_alloc_context *ac,
110 u32 min_clusters, 114 u32 min_clusters,
111 u32 max_clusters, 115 u32 max_clusters,
@@ -196,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
196 struct ocfs2_alloc_context **meta_ac); 200 struct ocfs2_alloc_context **meta_ac);
197 201
198int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); 202int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
203
204
205
206/*
207 * The following two interfaces are for ocfs2_create_inode_in_orphan().
208 */
209int ocfs2_find_new_inode_loc(struct inode *dir,
210 struct buffer_head *parent_fe_bh,
211 struct ocfs2_alloc_context *ac,
212 u64 *fe_blkno);
213
214int ocfs2_claim_new_inode_at_loc(handle_t *handle,
215 struct inode *dir,
216 struct ocfs2_alloc_context *ac,
217 u64 *suballoc_loc,
218 u16 *suballoc_bit,
219 u64 di_blkno);
220
199#endif /* _CHAINALLOC_H_ */ 221#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a494..fa1be1b304d1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
94 unsigned long mount_opt; 94 unsigned long mount_opt;
95 unsigned int atime_quantum; 95 unsigned int atime_quantum;
96 signed short slot; 96 signed short slot;
97 unsigned int localalloc_opt; 97 int localalloc_opt;
98 unsigned int resv_level;
99 int dir_resv_level;
98 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 100 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
99}; 101};
100 102
@@ -143,8 +145,7 @@ static const struct super_operations ocfs2_sops = {
143 .alloc_inode = ocfs2_alloc_inode, 145 .alloc_inode = ocfs2_alloc_inode,
144 .destroy_inode = ocfs2_destroy_inode, 146 .destroy_inode = ocfs2_destroy_inode,
145 .drop_inode = ocfs2_drop_inode, 147 .drop_inode = ocfs2_drop_inode,
146 .clear_inode = ocfs2_clear_inode, 148 .evict_inode = ocfs2_evict_inode,
147 .delete_inode = ocfs2_delete_inode,
148 .sync_fs = ocfs2_sync_fs, 149 .sync_fs = ocfs2_sync_fs,
149 .put_super = ocfs2_put_super, 150 .put_super = ocfs2_put_super,
150 .remount_fs = ocfs2_remount, 151 .remount_fs = ocfs2_remount,
@@ -176,6 +177,8 @@ enum {
176 Opt_noacl, 177 Opt_noacl,
177 Opt_usrquota, 178 Opt_usrquota,
178 Opt_grpquota, 179 Opt_grpquota,
180 Opt_resv_level,
181 Opt_dir_resv_level,
179 Opt_err, 182 Opt_err,
180}; 183};
181 184
@@ -202,6 +205,8 @@ static const match_table_t tokens = {
202 {Opt_noacl, "noacl"}, 205 {Opt_noacl, "noacl"},
203 {Opt_usrquota, "usrquota"}, 206 {Opt_usrquota, "usrquota"},
204 {Opt_grpquota, "grpquota"}, 207 {Opt_grpquota, "grpquota"},
208 {Opt_resv_level, "resv_level=%u"},
209 {Opt_dir_resv_level, "dir_resv_level=%u"},
205 {Opt_err, NULL} 210 {Opt_err, NULL}
206}; 211};
207 212
@@ -873,13 +878,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
873 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 878 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
874 continue; 879 continue;
875 if (unsuspend) 880 if (unsuspend)
876 status = vfs_quota_enable( 881 status = dquot_resume(sb, type);
877 sb_dqopt(sb)->files[type], 882 else {
878 type, QFMT_OCFS2, 883 struct ocfs2_mem_dqinfo *oinfo;
879 DQUOT_SUSPENDED); 884
880 else 885 /* Cancel periodic syncing before suspending */
881 status = vfs_quota_disable(sb, type, 886 oinfo = sb_dqinfo(sb, type)->dqi_priv;
882 DQUOT_SUSPENDED); 887 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
888 status = dquot_suspend(sb, type);
889 }
883 if (status < 0) 890 if (status < 0)
884 break; 891 break;
885 } 892 }
@@ -910,8 +917,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
910 status = -ENOENT; 917 status = -ENOENT;
911 goto out_quota_off; 918 goto out_quota_off;
912 } 919 }
913 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, 920 status = dquot_enable(inode[type], type, QFMT_OCFS2,
914 DQUOT_USAGE_ENABLED); 921 DQUOT_USAGE_ENABLED);
915 if (status < 0) 922 if (status < 0)
916 goto out_quota_off; 923 goto out_quota_off;
917 } 924 }
@@ -932,18 +939,22 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
932 int type; 939 int type;
933 struct inode *inode; 940 struct inode *inode;
934 struct super_block *sb = osb->sb; 941 struct super_block *sb = osb->sb;
942 struct ocfs2_mem_dqinfo *oinfo;
935 943
936 /* We mostly ignore errors in this function because there's not much 944 /* We mostly ignore errors in this function because there's not much
937 * we can do when we see them */ 945 * we can do when we see them */
938 for (type = 0; type < MAXQUOTAS; type++) { 946 for (type = 0; type < MAXQUOTAS; type++) {
939 if (!sb_has_quota_loaded(sb, type)) 947 if (!sb_has_quota_loaded(sb, type))
940 continue; 948 continue;
949 /* Cancel periodic syncing before we grab dqonoff_mutex */
950 oinfo = sb_dqinfo(sb, type)->dqi_priv;
951 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
941 inode = igrab(sb->s_dquot.files[type]); 952 inode = igrab(sb->s_dquot.files[type]);
942 /* Turn off quotas. This will remove all dquot structures from 953 /* Turn off quotas. This will remove all dquot structures from
943 * memory and so they will be automatically synced to global 954 * memory and so they will be automatically synced to global
944 * quota files */ 955 * quota files */
945 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | 956 dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
946 DQUOT_LIMITS_ENABLED); 957 DQUOT_LIMITS_ENABLED);
947 if (!inode) 958 if (!inode)
948 continue; 959 continue;
949 iput(inode); 960 iput(inode);
@@ -952,7 +963,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
952 963
953/* Handle quota on quotactl */ 964/* Handle quota on quotactl */
954static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 965static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
955 char *path, int remount) 966 char *path)
956{ 967{
957 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 968 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
958 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 969 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -960,30 +971,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
960 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 971 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
961 return -EINVAL; 972 return -EINVAL;
962 973
963 if (remount) 974 return dquot_enable(sb_dqopt(sb)->files[type], type,
964 return 0; /* Just ignore it has been handled in 975 format_id, DQUOT_LIMITS_ENABLED);
965 * ocfs2_remount() */
966 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
967 format_id, DQUOT_LIMITS_ENABLED);
968} 976}
969 977
970/* Handle quota off quotactl */ 978/* Handle quota off quotactl */
971static int ocfs2_quota_off(struct super_block *sb, int type, int remount) 979static int ocfs2_quota_off(struct super_block *sb, int type)
972{ 980{
973 if (remount) 981 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
974 return 0; /* Ignore now and handle later in
975 * ocfs2_remount() */
976 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
977} 982}
978 983
979static const struct quotactl_ops ocfs2_quotactl_ops = { 984static const struct quotactl_ops ocfs2_quotactl_ops = {
980 .quota_on = ocfs2_quota_on, 985 .quota_on = ocfs2_quota_on,
981 .quota_off = ocfs2_quota_off, 986 .quota_off = ocfs2_quota_off,
982 .quota_sync = vfs_quota_sync, 987 .quota_sync = dquot_quota_sync,
983 .get_info = vfs_get_dqinfo, 988 .get_info = dquot_get_dqinfo,
984 .set_info = vfs_set_dqinfo, 989 .set_info = dquot_set_dqinfo,
985 .get_dqblk = vfs_get_dqblk, 990 .get_dqblk = dquot_get_dqblk,
986 .set_dqblk = vfs_set_dqblk, 991 .set_dqblk = dquot_set_dqblk,
987}; 992};
988 993
989static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 994static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
@@ -1028,8 +1033,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1028 osb->s_atime_quantum = parsed_options.atime_quantum; 1033 osb->s_atime_quantum = parsed_options.atime_quantum;
1029 osb->preferred_slot = parsed_options.slot; 1034 osb->preferred_slot = parsed_options.slot;
1030 osb->osb_commit_interval = parsed_options.commit_interval; 1035 osb->osb_commit_interval = parsed_options.commit_interval;
1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1036
1032 osb->local_alloc_bits = osb->local_alloc_default_bits; 1037 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1038 osb->osb_resv_level = parsed_options.resv_level;
1039 osb->osb_dir_resv_level = parsed_options.resv_level;
1040 if (parsed_options.dir_resv_level == -1)
1041 osb->osb_dir_resv_level = parsed_options.resv_level;
1042 else
1043 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1033 1044
1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1045 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1035 if (status) 1046 if (status)
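The dir_resv_level sentinel above is the only subtle part of the new options: -1 means the mount option was not given, in which case directories inherit the file reservation level. A self-contained restatement of that defaulting rule (values are illustrative):

#include <assert.h>

static int effective_dir_resv_level(int resv_level, int dir_resv_level)
{
	/* -1 is the "not set" sentinel from ocfs2_parse_options() below */
	return dir_resv_level == -1 ? resv_level : dir_resv_level;
}

int main(void)
{
	assert(effective_dir_resv_level(2, -1) == 2);	/* inherits */
	assert(effective_dir_resv_level(2, 4) == 4);	/* explicit wins */
	return 0;
}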
@@ -1285,11 +1296,13 @@ static int ocfs2_parse_options(struct super_block *sb,
1285 options ? options : "(none)"); 1296 options ? options : "(none)");
1286 1297
1287 mopt->commit_interval = 0; 1298 mopt->commit_interval = 0;
1288 mopt->mount_opt = 0; 1299 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1289 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1300 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1290 mopt->slot = OCFS2_INVALID_SLOT; 1301 mopt->slot = OCFS2_INVALID_SLOT;
1291 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 1302 mopt->localalloc_opt = -1;
1292 mopt->cluster_stack[0] = '\0'; 1303 mopt->cluster_stack[0] = '\0';
1304 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1305 mopt->dir_resv_level = -1;
1293 1306
1294 if (!options) { 1307 if (!options) {
1295 status = 1; 1308 status = 1;
@@ -1380,7 +1393,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 				status = 0;
 				goto bail;
 			}
-			if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+			if (option >= 0)
 				mopt->localalloc_opt = option;
 			break;
 		case Opt_localflocks:
@@ -1433,6 +1446,28 @@ static int ocfs2_parse_options(struct super_block *sb,
 			mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
 			mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
 			break;
+		case Opt_resv_level:
+			if (is_remount)
+				break;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= OCFS2_MIN_RESV_LEVEL &&
+			    option < OCFS2_MAX_RESV_LEVEL)
+				mopt->resv_level = option;
+			break;
+		case Opt_dir_resv_level:
+			if (is_remount)
+				break;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= OCFS2_MIN_RESV_LEVEL &&
+			    option < OCFS2_MAX_RESV_LEVEL)
+				mopt->dir_resv_level = option;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -1487,7 +1522,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 			   (unsigned) (osb->osb_commit_interval / HZ));
 
 	local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
-	if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+	if (local_alloc_megs != ocfs2_la_default_mb(osb))
 		seq_printf(s, ",localalloc=%d", local_alloc_megs);
 
 	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1549,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	else
 		seq_printf(s, ",noacl");
 
+	if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+		seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+	if (osb->osb_dir_resv_level != osb->osb_resv_level)
+		seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+
 	return 0;
 }
 
@@ -1688,6 +1729,8 @@ static void ocfs2_inode_init_once(void *data)
 	oi->ip_blkno = 0ULL;
 	oi->ip_clusters = 0;
 
+	ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
 	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2085,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	init_waitqueue_head(&osb->osb_mount_event);
 
+	status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
 	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
 	if (!osb->vol_label) {
 		mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2273,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+	osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
 	iput(inode);
 
-	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
+	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+						  osb->s_feature_incompat) * 8;
 
 	status = ocfs2_init_slot_info(osb);
 	if (status < 0) {
@@ -2420,7 +2471,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	kfree(osb->slot_recovery_generations);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
-	 * allocate osb->journal at the start of ocfs2_initalize_osb(),
+	 * allocate osb->journal at the start of ocfs2_initialize_osb(),
 	 * we free it here.
 	 */
 	kfree(osb->journal);
@@ -2509,5 +2560,25 @@ void __ocfs2_abort(struct super_block* sb,
 	ocfs2_handle_error(sb);
 }
 
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+	int rc;
+	sigset_t blocked;
+
+	sigfillset(&blocked);
+	rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
+	BUG_ON(rc);
+}
+
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+	int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
+	BUG_ON(rc);
+}
+
 module_init(ocfs2_init);
 module_exit(ocfs2_exit);
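The new helpers block every signal with a full mask, stash the old mask, and later restore it verbatim with SIG_SETMASK. The same block/restore pattern in plain POSIX userspace C (illustrative only, not kernel code):

/* Userspace sketch of the block-all/restore pattern used by
 * ocfs2_block_signals()/ocfs2_unblock_signals() above. */
#include <signal.h>
#include <unistd.h>

int main(void)
{
	sigset_t blocked, oldset;

	sigfillset(&blocked);
	if (sigprocmask(SIG_BLOCK, &blocked, &oldset) != 0)
		return 1;

	/* ... critical section that must not be interrupted ... */
	write(STDOUT_FILENO, "signals blocked\n", 16);

	/* Restore exactly the previous mask, as the kernel helper
	 * does with SIG_SETMASK. */
	if (sigprocmask(SIG_SETMASK, &oldset, NULL) != 0)
		return 1;
	return 0;
}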
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
 
 #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
 
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
 #endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc4..9975457c981f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
 	}
 
 	/* Fast symlinks can't be large */
-	len = strlen(target);
+	len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
 	link = kzalloc(len + 1, GFP_NOFS);
 	if (!link) {
 		status = -ENOMEM;
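The symlink hunk replaces an unbounded strlen() on an on-disk buffer with strnlen() capped at ocfs2_fast_symlink_chars(), so a corrupt, unterminated fast symlink can no longer run the scan past the inline area. A self-contained demonstration of the same bound in userspace C (the 8-byte array stands in for the inline symlink area):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Deliberately no terminating NUL. */
	char target[8] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h' };

	/* strlen(target) would read past the array; strnlen() stops
	 * at the buffer size, as the capped call above does. */
	size_t len = strnlen(target, sizeof(target));
	printf("len = %zu\n", len); /* prints 8 */
	return 0;
}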
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3e7773089b96..06fa5e77c40e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
 	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_alloc_context *data_ac;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
+	int set_abort;
 };
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
@@ -96,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
 };
 
-struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler *ocfs2_xattr_handlers[] = {
 	&ocfs2_xattr_user_handler,
 	&ocfs2_xattr_acl_access_handler,
 	&ocfs2_xattr_acl_default_handler,
@@ -105,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
 	NULL
 };
 
-static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
 	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
 					= &ocfs2_xattr_acl_access_handler,
@@ -539,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
 
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
 		handler = ocfs2_xattr_handler_map[name_index];
@@ -708,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 struct ocfs2_xattr_value_buf *vb,
 					 struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int status = 0;
+	int status = 0, credits;
 	handle_t *handle = ctxt->handle;
 	enum ocfs2_alloc_restarted why;
 	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -718,42 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 
 	ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
 
-	status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-			       OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
-	status = ocfs2_add_clusters_in_btree(handle,
-					     &et,
-					     &logical_start,
-					     clusters_to_add,
-					     0,
-					     ctxt->data_ac,
-					     ctxt->meta_ac,
-					     &why);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	status = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
-
-	/*
-	 * We should have already allocated enough space before the transaction,
-	 * so no need to restart.
-	 */
-	BUG_ON(why != RESTART_NONE || clusters_to_add);
-
-leave:
+	while (clusters_to_add) {
+		status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+				       OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			break;
+		}
+
+		prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+		status = ocfs2_add_clusters_in_btree(handle,
+						     &et,
+						     &logical_start,
+						     clusters_to_add,
+						     0,
+						     ctxt->data_ac,
+						     ctxt->meta_ac,
+						     &why);
+		if ((status < 0) && (status != -EAGAIN)) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			break;
+		}
+
+		ocfs2_journal_dirty(handle, vb->vb_bh);
+
+		clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+				   prev_clusters;
+
+		if (why != RESTART_NONE && clusters_to_add) {
+			/*
+			 * We can only fail in case the alloc file doesn't give
+			 * up enough clusters.
+			 */
+			BUG_ON(why == RESTART_META);
+
+			mlog(0, "restarting xattr value extension for %u"
+			     " clusters,.\n", clusters_to_add);
+			credits = ocfs2_calc_extend_credits(inode->i_sb,
+							    &vb->vb_xv->xr_list,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				break;
+			}
+		}
+	}
 
 	return status;
 }
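The rewritten loop above claims clusters a piece at a time: a short allocation is no longer a BUG; instead the code extends the transaction via ocfs2_calc_extend_credits()/ocfs2_extend_trans() and goes around again. A toy model of that restart shape, with all numbers and the claim() stub invented purely for illustration:

#include <stdio.h>

static int claim(int want)	/* pretend allocator: at most 4 at a time */
{
	return want > 4 ? 4 : want;
}

int main(void)
{
	int clusters_to_add = 10, credits = 0;

	while (clusters_to_add) {
		int got = claim(clusters_to_add);

		clusters_to_add -= got;
		if (clusters_to_add) {
			/* analogous to recomputing credits and extending
			 * the transaction before the next pass */
			credits += got;
			printf("restart: %d left, credits now %d\n",
			       clusters_to_add, credits);
		}
	}
	return 0;
}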
@@ -786,12 +799,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	}
 
 	le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
-
-	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	ocfs2_journal_dirty(handle, vb->vb_bh);
 
 	if (ext_flags & OCFS2_EXT_REFCOUNTED)
 		ret = ocfs2_decrease_refcount(inode, handle,
@@ -1278,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
 	xis.inode_bh = xbs.inode_bh = di_bh;
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 
-	down_read(&oi->ip_xattr_sem);
 	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
 				    buffer_size, &xis);
 	if (ret == -ENODATA && di->i_xattr_loc)
 		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
 					    buffer_size, &xbs);
-	up_read(&oi->ip_xattr_sem);
 
 	return ret;
 }
@@ -1308,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
+	down_read(&OCFS2_I(inode)->ip_xattr_sem);
 	ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
 				     name, buffer, buffer_size);
+	up_read(&OCFS2_I(inode)->ip_xattr_sem);
 
 	ocfs2_inode_unlock(inode, 0);
 
@@ -1374,11 +1382,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 			memset(bh->b_data + cp_len, 0,
 			       blocksize - cp_len);
 
-		ret = ocfs2_journal_dirty(handle, bh);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
+		ocfs2_journal_dirty(handle, bh);
 		brelse(bh);
 		bh = NULL;
 
@@ -2148,15 +2152,19 @@ alloc_value:
 		orig_clusters = ocfs2_xa_value_clusters(loc);
 		rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
 		if (rc < 0) {
-			/*
-			 * If we tried to grow an existing external value,
-			 * ocfs2_xa_cleanuP-value_truncate() is going to
-			 * let it stand.  We have to restore its original
-			 * value size.
-			 */
-			loc->xl_entry->xe_value_size = orig_value_size;
+			ctxt->set_abort = 1;
 			ocfs2_xa_cleanup_value_truncate(loc, "growing",
 							orig_clusters);
+			/*
+			 * If we were growing an existing value,
+			 * ocfs2_xa_cleanup_value_truncate() won't remove
+			 * the entry. We need to restore the original value
+			 * size.
+			 */
+			if (loc->xl_entry) {
+				BUG_ON(!orig_value_size);
+				loc->xl_entry->xe_value_size = orig_value_size;
+			}
 			mlog_errno(rc);
 		}
 	}
@@ -2479,7 +2487,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	blk = le64_to_cpu(xb->xb_blkno);
 	bit = le16_to_cpu(xb->xb_suballoc_bit);
-	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	if (xb->xb_suballoc_loc)
+		bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 
 	xb_alloc_inode = ocfs2_get_system_file_inode(osb,
 				EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2605,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, di_bh);
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
@@ -2724,9 +2733,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
-	ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(ctxt->handle, di_bh);
 
 out:
 	return ret;
@@ -2846,9 +2853,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
 	int ret;
 	u16 suballoc_bit_start;
 	u32 num_got;
-	u64 first_blkno;
+	u64 suballoc_loc, first_blkno;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_xattr_block *xblk;
 
@@ -2859,9 +2865,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
 		goto end;
 	}
 
-	ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
-				   &suballoc_bit_start, &num_got,
-				   &first_blkno);
+	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+				   &suballoc_loc, &suballoc_bit_start,
+				   &num_got, &first_blkno);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto end;
@@ -2883,8 +2889,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
 	memset(xblk, 0, inode->i_sb->s_blocksize);
 	strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
 	xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+	xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
 	xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
-	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+	xblk->xb_fs_generation =
+		cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
 	xblk->xb_blkno = cpu_to_le64(first_blkno);
 	if (indexed) {
 		struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2964,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 	ret = ocfs2_xa_set(&loc, xi, ctxt);
 	if (!ret)
 		xs->here = loc.xl_entry;
-	else if (ret != -ENOSPC)
+	else if ((ret != -ENOSPC) || ctxt->set_abort)
 		goto end;
 	else {
 		ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3320,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 			goto out;
 		}
 
-		ret = ocfs2_extend_trans(ctxt->handle, credits +
-					ctxt->handle->h_buffer_credits);
+		ret = ocfs2_extend_trans(ctxt->handle, credits);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 		ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
-	} else if (ret == -ENOSPC) {
+	} else if ((ret == -ENOSPC) && !ctxt->set_abort) {
 		if (di->i_xattr_loc && !xbs->xattr_bh) {
 			ret = ocfs2_xattr_block_find(inode,
 						     xi->xi_name_index,
@@ -3343,8 +3350,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				goto out;
 			}
 
-			ret = ocfs2_extend_trans(ctxt->handle, credits +
-					ctxt->handle->h_buffer_credits);
+			ret = ocfs2_extend_trans(ctxt->handle, credits);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3378,8 +3384,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				goto out;
 			}
 
-			ret = ocfs2_extend_trans(ctxt->handle, credits +
-					ctxt->handle->h_buffer_credits);
+			ret = ocfs2_extend_trans(ctxt->handle, credits);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -4249,7 +4254,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	u32 bit_off, len;
 	u64 blkno;
 	handle_t *handle = ctxt->handle;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *xb_bh = xs->xattr_bh;
 	struct ocfs2_xattr_block *xb =
@@ -4277,7 +4281,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 		goto out;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
 				     1, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
@@ -4887,8 +4891,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 	 * We need to update the first bucket of the old extent and all
 	 * the buckets going to the new extent.
 	 */
-	credits = ((num_buckets + 1) * blks_per_bucket) +
-		handle->h_buffer_credits;
+	credits = ((num_buckets + 1) * blks_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4958,7 +4961,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 				      u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+	int ret, credits = 2 * blk_per_bucket;
 
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
 
@@ -5099,7 +5102,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
 				     clusters_to_add, &bit_off, &num_bits);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -5153,9 +5156,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, root_bh);
 
 leave:
 	return ret;
@@ -5200,8 +5201,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	 * existing bucket.  Then we add the last existing bucket, the
 	 * new bucket, and the first bucket (3 * blk_per_bucket).
 	 */
-	credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
-		  handle->h_buffer_credits;
+	credits = (end_blk - target_blk) + (3 * blk_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -5477,12 +5477,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	}
 
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, root_bh);
 
 	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
 	if (ret)
@@ -6809,16 +6804,15 @@ out:
 	return ret;
 }
 
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
 				u64 blkno, u64 new_blkno, u32 clusters,
+				u32 *cpos, int num_buckets,
 				struct ocfs2_alloc_context *meta_ac,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_reflink_xattr_tree_args *args)
 {
 	int i, j, ret = 0;
 	struct super_block *sb = args->reflink->old_inode->i_sb;
-	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-	u32 num_buckets = clusters * bpc;
 	int bpb = args->old_bucket->bu_blocks;
 	struct ocfs2_xattr_value_buf vb = {
 		.vb_access = ocfs2_journal_access,
@@ -6837,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			break;
 		}
 
-		/*
-		 * The real bucket num in this series of blocks is stored
-		 * in the 1st bucket.
-		 */
-		if (i == 0)
-			num_buckets = le16_to_cpu(
-				bucket_xh(args->old_bucket)->xh_num_buckets);
-
 		ret = ocfs2_xattr_bucket_journal_access(handle,
 						args->new_bucket,
 						OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6858,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			       bucket_block(args->old_bucket, j),
 			       sb->s_blocksize);
 
+		/*
+		 * Record the start cpos so that we can use it to initialize
+		 * our xattr tree we also set the xh_num_bucket for the new
+		 * bucket.
+		 */
+		if (i == 0) {
+			*cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+					    xh_entries[0].xe_name_hash);
+			bucket_xh(args->new_bucket)->xh_num_buckets =
+				cpu_to_le16(num_buckets);
+		}
+
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
 
 		ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6887,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 		}
 
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
 		ocfs2_xattr_bucket_relse(args->old_bucket);
 		ocfs2_xattr_bucket_relse(args->new_bucket);
 	}
@@ -6895,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 	ocfs2_xattr_bucket_relse(args->new_bucket);
 	return ret;
 }
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+				struct inode *inode,
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				u64 blkno, u32 cpos, u32 len)
+{
+	int ret, first_inserted = 0;
+	u32 p_cluster, num_clusters, reflink_cpos = 0;
+	u64 new_blkno;
+	unsigned int num_buckets, reflink_buckets;
+	unsigned int bpc =
+		ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+	ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+
+	while (len && num_buckets) {
+		ret = ocfs2_claim_clusters(handle, data_ac,
+					   1, &p_cluster, &num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+		ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+						 new_blkno, num_clusters,
+						 &reflink_cpos, reflink_buckets,
+						 meta_ac, data_ac, args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * For the 1st allocated cluster, we make it use the same cpos
+		 * so that the xattr tree looks the same as the original one
+		 * in the most case.
+		 */
+		if (!first_inserted) {
+			reflink_cpos = cpos;
+			first_inserted = 1;
+		}
+		ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+					  num_clusters, 0, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+
+		mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+		     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+
+		len -= num_clusters;
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		num_buckets -= reflink_buckets;
+	}
+out:
+	return ret;
+}
+
 /*
  * Create the same xattr extent record in the new inode's xattr tree.
  */
@@ -6906,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 			       void *para)
 {
 	int ret, credits = 0;
-	u32 p_cluster, num_clusters;
-	u64 new_blkno;
 	handle_t *handle;
 	struct ocfs2_reflink_xattr_tree_args *args =
 		(struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6916,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_extent_tree et;
 
+	mlog(0, "reflink xattr buckets %llu len %u\n",
+	     (unsigned long long)blkno, len);
+
 	ocfs2_init_xattr_tree_extent_tree(&et,
 					  INODE_CACHE(args->reflink->new_inode),
 					  args->new_blk_bh);
@@ -6935,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_claim_clusters(osb, handle, data_ac,
-				   len, &p_cluster, &num_clusters);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-
-	mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
-	     (unsigned long long)blkno, (unsigned long long)new_blkno, len);
-	ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
-					  meta_ac, data_ac, args);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-	     (unsigned long long)new_blkno, len, cpos);
-	ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
-				  len, 0, meta_ac);
+	ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+					  meta_ac, data_ac,
+					  blkno, cpos, len);
 	if (ret)
 		mlog_errno(ret);
 
-out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out:
@@ -7234,7 +7283,7 @@ int ocfs2_init_security_set(handle_t *handle,
 					xattr_ac, data_ac);
 }
 
-struct xattr_handler ocfs2_xattr_security_handler = {
+const struct xattr_handler ocfs2_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = ocfs2_xattr_security_list,
 	.get = ocfs2_xattr_security_get,
@@ -7278,7 +7327,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
 				   name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_trusted_handler = {
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = ocfs2_xattr_trusted_list,
 	.get = ocfs2_xattr_trusted_get,
@@ -7334,7 +7383,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
 				name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_user_handler = {
+const struct xattr_handler ocfs2_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = ocfs2_xattr_user_list,
 	.get = ocfs2_xattr_user_get,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
 	size_t value_len;
 };
 
-extern struct xattr_handler ocfs2_xattr_user_handler;
-extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern struct xattr_handler ocfs2_xattr_security_handler;
-extern struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-extern struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
 int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
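All the handler declarations above gain const, matching the VFS move to const xattr handler tables, so both the ops structures and the arrays of pointers to them can live in read-only data. The pattern in self-contained C (the struct below is a simplified stand-in for struct xattr_handler, not the kernel definition):

#include <stdio.h>
#include <stddef.h>

struct handler {
	const char *prefix;
	int (*get)(const char *name);
};

static int demo_get(const char *name) { return name != NULL; }

/* Both objects are const, so the linker can place them in .rodata. */
static const struct handler user_handler = { "user.", demo_get };
static const struct handler *handlers[] = { &user_handler, NULL };

int main(void)
{
	printf("%s -> %d\n", handlers[0]->prefix, handlers[0]->get("x"));
	return 0;
}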
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index b42d62419034..393f3f659da7 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -25,11 +25,10 @@ static struct buffer_head *omfs_get_bucket(struct inode *dir,
 				  const char *name, int namelen, int *ofs)
 {
 	int nbuckets = (dir->i_size - OMFS_DIR_START)/8;
-	int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino);
 	int bucket = omfs_hash(name, namelen, nbuckets);
 
 	*ofs = OMFS_DIR_START + bucket * 8;
-	return sb_bread(dir->i_sb, block);
+	return omfs_bread(dir->i_sb, dir->i_ino);
 }
 
 static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
@@ -42,8 +41,7 @@ static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
 	*prev_block = ~0;
 
 	while (block != ~0) {
-		bh = sb_bread(dir->i_sb,
-			clus_to_blk(OMFS_SB(dir->i_sb), block));
+		bh = omfs_bread(dir->i_sb, block);
 		if (!bh) {
 			err = -EIO;
 			goto err;
@@ -86,11 +84,10 @@ static struct buffer_head *omfs_find_entry(struct inode *dir,
 int omfs_make_empty(struct inode *inode, struct super_block *sb)
 {
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
-	int block = clus_to_blk(sbi, inode->i_ino);
 	struct buffer_head *bh;
 	struct omfs_inode *oi;
 
-	bh = sb_bread(sb, block);
+	bh = omfs_bread(sb, inode->i_ino);
 	if (!bh)
 		return -ENOMEM;
 
@@ -134,7 +131,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode)
 	brelse(bh);
 
 	/* now set the sibling and parent pointers on the new inode */
-	bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino));
+	bh = omfs_bread(dir->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -190,8 +187,7 @@ static int omfs_delete_entry(struct dentry *dentry)
 	if (prev != ~0) {
 		/* found in middle of list, get list ptr */
 		brelse(bh);
-		bh = sb_bread(dir->i_sb,
-			clus_to_blk(OMFS_SB(dir->i_sb), prev));
+		bh = omfs_bread(dir->i_sb, prev);
 		if (!bh)
 			goto out;
 
@@ -224,8 +220,7 @@ static int omfs_dir_is_empty(struct inode *inode)
 	u64 *ptr;
 	int i;
 
-	bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb),
-			inode->i_ino));
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 
 	if (!bh)
 		return 0;
@@ -353,8 +348,7 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
 
 	/* follow chain in this bucket */
 	while (fsblock != ~0) {
-		bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb),
-				fsblock));
+		bh = omfs_bread(dir->i_sb, fsblock);
 		if (!bh)
 			goto out;
 
@@ -466,7 +460,7 @@ static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	hchain = (filp->f_pos >> 20) - 1;
 	hindex = filp->f_pos & 0xfffff;
 
-	bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino));
+	bh = omfs_bread(dir->i_sb, dir->i_ino);
 	if (!bh)
 		goto out;
 
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..8a6d34fa668a 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -50,7 +50,7 @@ int omfs_shrink_inode(struct inode *inode)
 	if (inode->i_size != 0)
 		goto out;
 
-	bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+	bh = omfs_bread(inode->i_sb, next);
 	if (!bh)
 		goto out;
 
@@ -90,7 +90,7 @@ int omfs_shrink_inode(struct inode *inode)
 		if (next == ~0)
 			break;
 
-		bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+		bh = omfs_bread(inode->i_sb, next);
 		if (!bh)
 			goto out;
 		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
@@ -222,7 +222,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 	struct buffer_head *bh;
 	sector_t next, offset;
 	int ret;
-	u64 new_block;
+	u64 uninitialized_var(new_block);
 	u32 max_extents;
 	int extent_count;
 	struct omfs_extent *oe;
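The uninitialized_var(new_block) annotation above only silences a false-positive gcc "may be used uninitialized" warning; it generates no initialization code. In kernels of this vintage the macro expands, roughly, to a self-assignment:

/* sketch of the compiler-gcc.h definition of this era (assumption) */
#define uninitialized_var(x) x = x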
@@ -232,7 +232,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 	int remain;
 
 	ret = -EIO;
-	bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino));
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -265,7 +265,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 			break;
 
 		brelse(bh);
-		bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+		bh = omfs_bread(inode->i_sb, next);
 		if (!bh)
 			goto out;
 		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
@@ -312,9 +312,17 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags,
-				pagep, fsdata, omfs_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				omfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
@@ -329,11 +337,33 @@ const struct file_operations omfs_file_operations = {
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
+static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
 const struct inode_operations omfs_file_inops = {
+	.setattr = omfs_setattr,
 	.truncate = omfs_truncate
 };
 
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index c82af6acc2e7..14a22863291a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -3,7 +3,6 @@
  * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
  * Released under GPL v2.
  */
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -20,6 +19,15 @@ MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
 MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux");
 MODULE_LICENSE("GPL");
 
+struct buffer_head *omfs_bread(struct super_block *sb, sector_t block)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	if (block >= sbi->s_num_blocks)
+		return NULL;
+
+	return sb_bread(sb, clus_to_blk(sbi, block));
+}
+
 struct inode *omfs_new_inode(struct inode *dir, int mode)
 {
 	struct inode *inode;
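omfs_bread(), added above, centralizes the clus_to_blk() translation and rejects out-of-range blocks before touching the device, so every former sb_bread() caller inherits the bounds check. A userspace analogue of the same idea (the array stands in for the disk; all names are illustrative):

#include <stdio.h>
#include <stddef.h>

#define NUM_BLOCKS 16
static const char disk[NUM_BLOCKS] = "0123456789abcdef";

static const char *read_block(unsigned long block)
{
	if (block >= NUM_BLOCKS)	/* same check as omfs_bread() */
		return NULL;
	return &disk[block];
}

int main(void)
{
	printf("%c %s\n", *read_block(3),
	       read_block(99) ? "??" : "out-of-range rejected");
	return 0;
}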
@@ -38,9 +46,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
 		goto fail;
 
 	inode->i_ino = new_block;
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -96,15 +102,13 @@ static int __omfs_write_inode(struct inode *inode, int wait)
 	struct omfs_inode *oi;
 	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
 	struct buffer_head *bh, *bh2;
-	unsigned int block;
 	u64 ctime;
 	int i;
 	int ret = -EIO;
 	int sync_failed = 0;
 
 	/* get current inode since we may have written sibling ptrs etc. */
-	block = clus_to_blk(sbi, inode->i_ino);
-	bh = sb_bread(inode->i_sb, block);
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -143,8 +147,7 @@ static int __omfs_write_inode(struct inode *inode, int wait)
 
 	/* if mirroring writes, copy to next fsblock */
 	for (i = 1; i < sbi->s_mirrors; i++) {
-		bh2 = sb_bread(inode->i_sb, block + i *
-			(sbi->s_blocksize / sbi->s_sys_blocksize));
+		bh2 = omfs_bread(inode->i_sb, inode->i_ino + i);
 		if (!bh2)
 			goto out_brelse;
 
@@ -178,9 +181,13 @@ int omfs_sync_inode(struct inode *inode)
  * called when an entry is deleted, need to clear the bits in the
  * bitmaps.
  */
-static void omfs_delete_inode(struct inode *inode)
+static void omfs_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+
+	if (inode->i_nlink)
+		return;
 
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_size = 0;
@@ -188,7 +195,6 @@ static void omfs_delete_inode(struct inode *inode)
 	}
 
 	omfs_clear_range(inode->i_sb, inode->i_ino, 2);
-	clear_inode(inode);
 }
 
 struct inode *omfs_iget(struct super_block *sb, ino_t ino)
@@ -196,7 +202,6 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
 	struct omfs_inode *oi;
 	struct buffer_head *bh;
-	unsigned int block;
 	u64 ctime;
 	unsigned long nsecs;
 	struct inode *inode;
@@ -207,8 +212,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
 	if (!(inode->i_state & I_NEW))
 		return inode;
 
-	block = clus_to_blk(sbi, ino);
-	bh = sb_bread(inode->i_sb, block);
+	bh = omfs_bread(inode->i_sb, ino);
 	if (!bh)
 		goto iget_failed;
 
@@ -287,7 +291,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static const struct super_operations omfs_sops = {
 	.write_inode = omfs_write_inode,
-	.delete_inode = omfs_delete_inode,
+	.evict_inode = omfs_evict_inode,
 	.put_super = omfs_put_super,
 	.statfs = omfs_statfs,
 	.show_options = generic_show_options,
@@ -322,6 +326,9 @@ static int omfs_get_imap(struct super_block *sb)
 		goto nomem;
 
 	block = clus_to_blk(sbi, sbi->s_bitmap_ino);
+	if (block >= sbi->s_num_blocks)
+		goto nomem;
+
 	ptr = sbi->s_imap;
 	for (count = bitmap_size; count > 0; count -= sb->s_blocksize) {
 		bh = sb_bread(sb, block++);
@@ -420,7 +427,6 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct omfs_root_block *omfs_rb;
 	struct omfs_sb_info *sbi;
 	struct inode *root;
-	sector_t start;
 	int ret = -EINVAL;
 
 	save_mount_options(sb, (char *) data);
@@ -489,8 +495,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) -
 		get_bitmask_order(sbi->s_sys_blocksize);
 
-	start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block));
-	bh2 = sb_bread(sb, start);
+	bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block));
 	if (!bh2)
 		goto out_brelse_bh;
 
@@ -507,6 +512,21 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_brelse_bh2;
 	}
 
+	if (sbi->s_bitmap_ino != ~0ULL &&
+	    sbi->s_bitmap_ino > sbi->s_num_blocks) {
+		printk(KERN_ERR "omfs: free space bitmap location is corrupt "
+			"(%llx, total blocks %llx)\n",
+			(unsigned long long) sbi->s_bitmap_ino,
+			(unsigned long long) sbi->s_num_blocks);
+		goto out_brelse_bh2;
+	}
+	if (sbi->s_clustersize < 1 ||
+	    sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) {
+		printk(KERN_ERR "omfs: cluster size out of range (%d)",
+			sbi->s_clustersize);
+		goto out_brelse_bh2;
+	}
+
 	ret = omfs_get_imap(sb);
 	if (ret)
 		goto out_brelse_bh2;
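The mount path now sanity-checks on-disk fields (bitmap location and cluster size) before using them, failing the mount instead of indexing out of range. The shape of those checks as a runnable toy; the struct layout and limits below are invented stand-ins, not the omfs definitions:

#include <stdio.h>

struct sb { unsigned long long bitmap_ino, num_blocks; int clustersize; };

static int check_sb(const struct sb *s)
{
	if (s->bitmap_ino != ~0ULL && s->bitmap_ino > s->num_blocks)
		return -1;		/* corrupt bitmap location */
	if (s->clustersize < 1 || s->clustersize > 8)
		return -1;		/* cluster size out of range */
	return 0;
}

int main(void)
{
	struct sb good = { 100, 1000, 4 }, bad = { 5000, 1000, 4 };

	printf("%d %d\n", check_sb(&good), check_sb(&bad));
	return 0;
}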
@@ -532,6 +552,8 @@ out_brelse_bh2:
 out_brelse_bh:
 	brelse(bh);
 end:
+	if (ret)
+		kfree(sbi);
 	return ret;
 }
 
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index ebe2fdbe535e..7d414fef501a 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -58,6 +58,7 @@ extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
 extern int omfs_shrink_inode(struct inode *inode);
 
 /* inode.c */
+extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block);
 extern struct inode *omfs_iget(struct super_block *sb, ino_t inode);
 extern struct inode *omfs_new_inode(struct inode *dir, int mode);
 extern int omfs_reserve_block(struct super_block *sb, sector_t block);
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index 12cca245d6e8..ee5e4327de92 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -17,6 +17,7 @@
 #define OMFS_EXTENT_CONT 0x40
 #define OMFS_XOR_COUNT 19
 #define OMFS_MAX_BLOCK_SIZE 8192
+#define OMFS_MAX_CLUSTER_SIZE 8
 
 struct omfs_super_block {
 	char s_fill1[256];
diff --git a/fs/open.c b/fs/open.c
index 74e5cd9f718e..d74e1983e8dc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -17,7 +17,6 @@
 #include <linux/securebits.h>
 #include <linux/security.h>
 #include <linux/mount.h>
-#include <linux/vfs.h>
 #include <linux/fcntl.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -30,174 +29,10 @@
 #include <linux/falloc.h>
 #include <linux/fs_struct.h>
 #include <linux/ima.h>
+#include <linux/dnotify.h>
 
 #include "internal.h"
 
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int retval = -ENODEV;
-
-	if (dentry) {
-		retval = -ENOSYS;
-		if (dentry->d_sb->s_op->statfs) {
-			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(dentry);
-			if (retval)
-				return retval;
-			retval = dentry->d_sb->s_op->statfs(dentry, buf);
-			if (retval == 0 && buf->f_frsize == 0)
-				buf->f_frsize = buf->f_bsize;
-		}
-	}
-	return retval;
-}
-
-EXPORT_SYMBOL(vfs_statfs);
-
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail |
-			     st.f_bsize | st.f_frsize) &
-			    0xffffffff00000000ULL)
-				return -EOVERFLOW;
-			/*
-			 * f_files and f_ffree may be -1; it's okay to stuff
-			 * that into 32 bits
-			 */
-			if (st.f_files != -1 &&
-			    (st.f_files & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-			if (st.f_ffree != -1 &&
-			    (st.f_ffree & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-		}
-
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
-{
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs tmp;
-		error = vfs_statfs_native(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct path path;
-	long error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs64 tmp;
-		error = vfs_statfs64(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
-{
-	struct file * file;
-	struct statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
-SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct file * file;
-	struct statfs64 tmp;
-	int error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs64(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 		struct file *filp)
 {
@@ -276,7 +111,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error)
-		error = security_path_truncate(&path, length, 0);
+		error = security_path_truncate(&path);
 	if (!error)
 		error = do_truncate(path.dentry, length, 0, NULL);
 
@@ -331,8 +166,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = security_path_truncate(&file->f_path, length,
-					       ATTR_MTIME|ATTR_CTIME);
+		error = security_path_truncate(&file->f_path);
 	if (!error)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
@@ -533,7 +367,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -562,7 +396,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 	if (!S_ISDIR(inode->i_mode))
 		goto out_putf;
 
-	error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 	if (!error)
 		set_fs_pwd(current->fs, &file->f_path);
 out_putf:
@@ -580,7 +414,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -841,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
-	file_move(f, &inode->i_sb->s_files);
+	file_sb_list_add(f, inode->i_sb);
 
 	error = security_dentry_open(f, cred);
 	if (error)
@@ -887,7 +721,7 @@ cleanup_all:
 			mnt_drop_write(mnt);
 		}
 	}
-	file_kill(f);
+	file_sb_list_del(f);
 	f->f_path.dentry = NULL;
 	f->f_path.mnt = NULL;
 cleanup_file:
@@ -1054,7 +888,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 			put_unused_fd(fd);
 			fd = PTR_ERR(f);
 		} else {
-			fsnotify_open(f->f_path.dentry);
+			fsnotify_open(f);
 			fd_install(fd, f);
 		}
 	}
@@ -1197,7 +1031,9 @@ EXPORT_SYMBOL(generic_file_open);
 
 /*
  * This is used by subsystems that don't want seekable
- * file descriptors
+ * file descriptors. The function is not supposed to ever fail, the only
+ * reason it returns an 'int' and not 'void' is so that it can be plugged
+ * directly into file_operations structure.
  */
 int nonseekable_open(struct inode *inode, struct file *filp)
 {
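The statfs family (vfs_statfs, vfs_statfs_native, vfs_statfs64 and the four syscalls) leaves fs/open.c wholesale in this merge. The subtlest part of the removed code is the native 32-bit path: 64-bit kstatfs counters must either fit in 32 bits or fail with EOVERFLOW, with f_files/f_ffree of -1 ("unknown") exempt. A self-contained userspace sketch of that rule — fits_in_32() is an illustrative name, not a kernel function:

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

/* Mirrors the overflow checks of the removed vfs_statfs_native(): any
 * counter with bits above 2^32 is rejected, except the "unknown" value
 * (u64)-1 for file counts, which may legally be stuffed into 32 bits. */
static int fits_in_32(uint64_t blocks, uint64_t files, uint64_t ffree)
{
	if (blocks & 0xffffffff00000000ULL)
		return -EOVERFLOW;
	if (files != (uint64_t)-1 && (files & 0xffffffff00000000ULL))
		return -EOVERFLOW;
	if (ffree != (uint64_t)-1 && (ffree & 0xffffffff00000000ULL))
		return -EOVERFLOW;
	return 0;
}

int main(void)
{
	printf("%d\n", fits_in_32(1u << 20, 1000, 500));	/* 0: fits */
	printf("%d\n", fits_in_32(1ULL << 33, 1000, 500));	/* -EOVERFLOW */
	printf("%d\n", fits_in_32(1u << 20, (uint64_t)-1, 0));	/* 0: -1 allowed */
	return 0;
}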
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..fbeb697374d5 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -45,8 +45,11 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
 	nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
 		   (le32_to_cpu(dr->disc_size) >> 9);
 
-	if (name)
-		printk(" [%s]", name);
+	if (name) {
+		strlcat(state->pp_buf, " [", PAGE_SIZE);
+		strlcat(state->pp_buf, name, PAGE_SIZE);
+		strlcat(state->pp_buf, "]", PAGE_SIZE);
+	}
 	put_partition(state, slot, first_sector, nr_sects);
 	return dr;
 }
@@ -70,25 +73,25 @@ struct riscix_record {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
     defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
-		 unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+			    unsigned long first_sect, int slot,
+			    unsigned long nr_sects)
 {
 	Sector sect;
 	struct riscix_record *rr;
 
-	rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect);
+	rr = read_part_sector(state, first_sect, &sect);
 	if (!rr)
 		return -1;
 
-	printk(" [RISCiX]");
+	strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
 
 
 	if (rr->magic == RISCIX_MAGIC) {
 		unsigned long size = nr_sects > 2 ? 2 : nr_sects;
 		int part;
 
-		printk(" <");
+		strlcat(state->pp_buf, " <", PAGE_SIZE);
 
 		put_partition(state, slot++, first_sect, size);
 		for (part = 0; part < 8; part++) {
@@ -97,11 +100,13 @@ riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
 				put_partition(state, slot++,
 					le32_to_cpu(rr->part[part].start),
 					le32_to_cpu(rr->part[part].length));
-				printk("(%s)", rr->part[part].name);
+				strlcat(state->pp_buf, "(", PAGE_SIZE);
+				strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
+				strlcat(state->pp_buf, ")", PAGE_SIZE);
 			}
 		}
 
-		printk(" >\n");
+		strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 	} else {
 		put_partition(state, slot++, first_sect, nr_sects);
 	}
@@ -123,23 +128,23 @@ struct linux_part {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
     defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
-		unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+			   unsigned long first_sect, int slot,
+			   unsigned long nr_sects)
 {
 	Sector sect;
 	struct linux_part *linuxp;
 	unsigned long size = nr_sects > 2 ? 2 : nr_sects;
 
-	printk(" [Linux]");
+	strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
 
 	put_partition(state, slot++, first_sect, size);
 
-	linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect);
+	linuxp = read_part_sector(state, first_sect, &sect);
 	if (!linuxp)
 		return -1;
 
-	printk(" <");
+	strlcat(state->pp_buf, " <", PAGE_SIZE);
 	while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
 	       linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
 		if (slot == state->limit)
@@ -149,7 +154,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 			      le32_to_cpu(linuxp->nr_sects));
 		linuxp ++;
 	}
-	printk(" >");
+	strlcat(state->pp_buf, " >", PAGE_SIZE);
 
 	put_dev_sector(sect);
 	return slot;
@@ -157,8 +162,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 #endif
 
 #ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
 {
 	unsigned long first_sector = 0;
 	unsigned int start_blk = 0;
@@ -185,7 +189,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 		struct adfs_discrecord *dr;
 		unsigned int nr_sects;
 
-		data = read_dev_sector(bdev, start_blk * 2 + 6, &sect);
+		data = read_part_sector(state, start_blk * 2 + 6, &sect);
 		if (!data)
 			return -1;
 
@@ -217,14 +221,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 			/* RISCiX - we don't know how to find the next one. */
-			slot = riscix_partition(state, bdev, first_sector,
-						slot, nr_sects);
+			slot = riscix_partition(state, first_sector, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, first_sector,
-					       slot, nr_sects);
+			slot = linux_partition(state, first_sector, slot,
+					       nr_sects);
 			break;
 		}
 		put_dev_sector(sect);
@@ -249,8 +253,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
  * hda1 = ADFS partition on first drive.
  * hda2 = non-ADFS partition.
  */
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
 {
 	unsigned long start_sect, nr_sects, sectscyl, heads;
 	Sector sect;
@@ -259,7 +262,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char id;
 	int slot = 1;
 
-	data = read_dev_sector(bdev, 6, &sect);
+	data = read_part_sector(state, 6, &sect);
 	if (!data)
 		return -1;
 
@@ -278,25 +281,25 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Work out start of non-adfs partition.
 	 */
-	nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+	nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
 
 	if (start_sect) {
 		switch (id) {
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 		case PARTITION_RISCIX_MFM:
-			slot = riscix_partition(state, bdev, start_sect,
-						slot, nr_sects);
+			slot = riscix_partition(state, start_sect, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, start_sect,
-					       slot, nr_sects);
+			slot = linux_partition(state, start_sect, slot,
+					       nr_sects);
 			break;
 		}
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -308,10 +311,11 @@ struct ics_part {
 	__le32 size;
 };
 
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+				   unsigned long block)
 {
 	Sector sect;
-	unsigned char *data = read_dev_sector(bdev, block, &sect);
+	unsigned char *data = read_part_sector(state, block, &sect);
 	int result = 0;
 
 	if (data) {
@@ -349,8 +353,7 @@ static inline int valid_ics_sector(const unsigned char *data)
  * hda2 = ADFS partition 1 on first drive.
  * ..etc..
  */
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
 {
 	const unsigned char *data;
 	const struct ics_part *p;
@@ -360,7 +363,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Try ICS style partitions - sector 0 contains partition info.
 	 */
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -369,7 +372,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 		return 0;
 	}
 
-	printk(" [ICS]");
+	strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
 
 	for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
 		u32 start = le32_to_cpu(p->start);
@@ -392,7 +395,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 			 * partition is.  We must not make this visible
 			 * to the filesystem.
 			 */
-			if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+			if (size > 1 && adfspart_check_ICSLinux(state, start)) {
 				start += 1;
 				size -= 1;
 			}
@@ -403,7 +406,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 	}
 
 	put_dev_sector(sect);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -446,8 +449,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
  * hda2 = ADFS partition 1 on first drive.
  * ..etc..
  */
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -455,7 +457,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 	int slot = 1;
 	int i;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -464,7 +466,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 		return 0;
 	}
 
-	printk(" [POWERTEC]");
+	strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
 
 	for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
 		u32 start = le32_to_cpu(p->start);
@@ -475,7 +477,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 	}
 
 	put_dev_sector(sect);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -508,8 +510,7 @@ static const char eesox_name[] = {
  * 1. The individual ADFS boot block entries that are placed on the disk.
  * 2. The start address of the next entry.
  */
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -518,7 +519,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	sector_t start = 0;
 	int i, slot = 1;
 
-	data = read_dev_sector(bdev, 7, &sect);
+	data = read_part_sector(state, 7, &sect);
 	if (!data)
 		return -1;
 
@@ -545,9 +546,9 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	if (i != 0) {
 		sector_t size;
 
-		size = get_capacity(bdev->bd_disk);
+		size = get_capacity(state->bdev->bd_disk);
 		put_partition(state, slot++, start, size - start);
-		printk("\n");
+		strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	}
 
 	return i ? 1 : 0;
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
  * format, and everyone stick to it?
  */
 
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
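Across all of these parsers the bdev argument is folded into parsed_partitions, and the incremental printk() fragments become strlcat()/snprintf() appends to state->pp_buf so check.c can print the whole line in one piece. A userspace sketch of that accumulation pattern — my_strlcat() mirrors the kernel strlcat() semantics since glibc historically lacks it, and the buffer and PAGE_SIZE value are stand-ins:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Same contract as the kernel's strlcat(): append with truncation,
 * always NUL-terminate, return the length it tried to create. */
static size_t my_strlcat(char *dst, const char *src, size_t size)
{
	size_t dlen = strlen(dst);

	if (dlen >= size)
		return dlen + strlen(src);
	snprintf(dst + dlen, size - dlen, "%s", src);
	return dlen + strlen(src);
}

int main(void)
{
	char *pp_buf = calloc(1, PAGE_SIZE);	/* models state->pp_buf */

	if (!pp_buf)
		return 1;
	snprintf(pp_buf, PAGE_SIZE, " %s:", "sda");	/* check_partition() seeds it */
	my_strlcat(pp_buf, " [ICS]", PAGE_SIZE);	/* a parser tags its format */
	my_strlcat(pp_buf, " sda1 sda2", PAGE_SIZE);	/* put_partition() appends slots */
	my_strlcat(pp_buf, "\n", PAGE_SIZE);
	printf("%s", pp_buf);	/* one atomic print at the end, as check.c does */
	free(pp_buf);
	return 0;
}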
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..70cbf44a1560 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
 	return sum;
 }
 
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	for (blk = 0; ; blk++, put_dev_sector(sect)) {
 		if (blk == RDB_ALLOCATION_LIMIT)
 			goto rdb_done;
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read RDB block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
@@ -64,22 +63,28 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 		}
 
 		printk("Dev %s: RDB in block %d has bad checksum\n",
-		       bdevname(bdev, b), blk);
+		       bdevname(state->bdev, b), blk);
 	}
 
 	/* blksize is blocks per 512 byte standard block */
 	blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
 
-	printk(" RDSK (%d)", blksize * 512);	/* Be more informative */
+	{
+		char tmp[7 + 10 + 1 + 1];
+
+		/* Be more informative */
+		snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
+	}
 	blk = be32_to_cpu(rdb->rdb_PartitionList);
 	put_dev_sector(sect);
 	for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
 		blk *= blksize;	/* Read in terms partition table understands */
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read partition block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
@@ -107,23 +112,27 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 		{
 			/* Be even more informative to aid mounting */
 			char dostype[4];
+			char tmp[42];
+
 			__be32 *dt = (__be32 *)dostype;
 			*dt = pb->pb_Environment[16];
 			if (dostype[3] < ' ')
-				printk(" (%c%c%c^%c)",
+				snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
 					dostype[0], dostype[1],
 					dostype[2], dostype[3] + '@' );
 			else
-				printk(" (%c%c%c%c)",
+				snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
 					dostype[0], dostype[1],
 					dostype[2], dostype[3]);
-			printk("(res %d spb %d)",
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
+			snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
 				be32_to_cpu(pb->pb_Environment[6]),
 				be32_to_cpu(pb->pb_Environment[4]));
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 		}
 		res = 1;
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 
 rdb_done:
 	return res;
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
  * fs/partitions/amiga.h
  */
 
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..9875b05e80a2 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
 		memcmp (s, "RAW", 3) == 0 ;
 }
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 	int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
 #endif
 
-	rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect);
+	rs = read_part_sector(state, 0, &sect);
 	if (!rs)
 		return -1;
 
 	/* Verify this is an Atari rootsector: */
-	hd_size = bdev->bd_inode->i_size >> 9;
+	hd_size = state->bdev->bd_inode->i_size >> 9;
 	if (!VALID_PARTITION(&rs->part[0], hd_size) &&
 	    !VALID_PARTITION(&rs->part[1], hd_size) &&
 	    !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -62,7 +62,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 
 	pi = &rs->part[0];
-	printk (" AHDI");
+	strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
 	for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
 		struct rootsector *xrs;
 		Sector sect2;
@@ -81,10 +81,10 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 #ifdef ICD_PARTS
 			part_fmt = 1;
 #endif
-			printk(" XGM<");
+			strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
 			partsect = extensect = be32_to_cpu(pi->st);
 			while (1) {
-				xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2);
+				xrs = read_part_sector(state, partsect, &sect2);
 				if (!xrs) {
 					printk (" block %ld read failed\n", partsect);
 					put_dev_sector(sect);
@@ -120,14 +120,14 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 					break;
 				}
 			}
-			printk(" >");
+			strlcat(state->pp_buf, " >", PAGE_SIZE);
 		}
 #ifdef ICD_PARTS
 	if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
 		pi = &rs->icdpart[0];
 		/* sanity check: no ICD format if first partition invalid */
 		if (OK_id(pi->id)) {
-			printk(" ICD<");
+			strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
 			for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
 				/* accept only GEM,BGM,RAW,LNX,SWP partitions */
 				if (!((pi->flg & 1) && OK_id(pi->id)))
@@ -137,13 +137,13 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 					      be32_to_cpu(pi->st),
 					      be32_to_cpu(pi->siz));
 			}
-			printk(" >");
+			strlcat(state->pp_buf, " >", PAGE_SIZE);
 		}
 	}
 #endif
 	put_dev_sector(sect);
 
-	printk ("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 
 	return 1;
 }
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
 	u16 checksum;			/* checksum for bootable disks */
 } __attribute__((__packed__));
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab23a9e7..79fbf3f390f0 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
 
 int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
 
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
 	/*
 	 * Probe partition formats with tables at disk address 0
 	 * that also have an ADFS boot block at 0xdc0.
@@ -161,12 +161,19 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	struct parsed_partitions *state;
 	int i, res, err;
 
-	state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+	state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
 	if (!state)
 		return NULL;
+	state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
+	if (!state->pp_buf) {
+		kfree(state);
+		return NULL;
+	}
+	state->pp_buf[0] = '\0';
 
+	state->bdev = bdev;
 	disk_name(hd, 0, state->name);
-	printk(KERN_INFO " %s:", state->name);
+	snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
 	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
 
@@ -174,7 +181,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
-		res = check_part[i++](state, bdev);
+		res = check_part[i++](state);
 		if (res < 0) {
 			/* We have hit an I/O error which we don't report now.
 			 * But record it, and let the others do their job.
@@ -184,15 +191,25 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 		}
 
 	}
-	if (res > 0)
+	if (res > 0) {
+		printk(KERN_INFO "%s", state->pp_buf);
+
+		free_page((unsigned long)state->pp_buf);
 		return state;
+	}
+	if (state->access_beyond_eod)
+		err = -ENOSPC;
 	if (err)
 	/* The partition is unrecognized. So report I/O errors if there were any */
 		res = err;
 	if (!res)
-		printk(" unknown partition table\n");
+		strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
 	else if (warn_no_part)
-		printk(" unable to read partition table\n");
+		strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
+
+	printk(KERN_INFO "%s", state->pp_buf);
+
+	free_page((unsigned long)state->pp_buf);
 	kfree(state);
 	return ERR_PTR(res);
 }
@@ -456,7 +473,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	}
 
 	/* everything is up and running, commence */
-	INIT_RCU_HEAD(&p->rcu_head);
 	rcu_assign_pointer(ptbl->part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
@@ -538,12 +554,33 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+	const struct block_device_operations *bdops = disk->fops;
+
+	if (bdops->unlock_native_capacity &&
+	    !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+		printk(KERN_CONT "enabling native capacity\n");
+		bdops->unlock_native_capacity(disk);
+		disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+		return true;
+	} else {
+		printk(KERN_CONT "truncated\n");
+		return false;
+	}
+}
+
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct parsed_partitions *state = NULL;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	struct parsed_partitions *state;
 	int p, highest, res;
+rescan:
+	if (state && !IS_ERR(state)) {
+		kfree(state);
+		state = NULL;
+	}
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -562,8 +599,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
-	if (IS_ERR(state))	/* I/O error reading the partition table */
+	if (IS_ERR(state)) {
+		/*
+		 * I/O error reading the partition table.  If any
+		 * partition code tried to read beyond EOD, retry
+		 * after unlocking native capacity.
+		 */
+		if (PTR_ERR(state) == -ENOSPC) {
+			printk(KERN_WARNING "%s: partition table beyond EOD, ",
+			       disk->disk_name);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
+		}
 		return -EIO;
+	}
+	/*
+	 * If any partition code tried to read beyond EOD, try
+	 * unlocking native capacity even if partition table is
+	 * successfully read as we could be missing some partitions.
+	 */
+	if (state->access_beyond_eod) {
+		printk(KERN_WARNING
+		       "%s: partition table partially beyond EOD, ",
+		       disk->disk_name);
+		if (disk_unlock_native_capacity(disk))
+			goto rescan;
+	}
 
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +642,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
-try_scan:
+
 		size = state->parts[p].size;
 		if (!size)
 			continue;
@@ -589,30 +650,21 @@ try_scan:
 		from = state->parts[p].from;
 		if (from >= get_capacity(disk)) {
 			printk(KERN_WARNING
-			       "%s: p%d ignored, start %llu is behind the end of the disk\n",
+			       "%s: p%d start %llu is beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) from);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
 			continue;
 		}
 
 		if (from + size > get_capacity(disk)) {
-			const struct block_device_operations *bdops = disk->fops;
-			unsigned long long capacity;
-
 			printk(KERN_WARNING
-			       "%s: p%d size %llu exceeds device capacity, ",
+			       "%s: p%d size %llu extends beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) size);
 
-			if (bdops->set_capacity &&
-			    (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
-				printk(KERN_CONT "enabling native capacity\n");
-				capacity = bdops->set_capacity(disk, ~0ULL);
-				disk->flags |= GENHD_FL_NATIVE_CAPACITY;
-				if (capacity > get_capacity(disk)) {
-					set_capacity(disk, capacity);
-					check_disk_size_change(disk, bdev);
-					bdev->bd_invalidated = 0;
-				}
-				goto try_scan;
+			if (disk_unlock_native_capacity(disk)) {
+				/* free state and restart */
+				goto rescan;
 			} else {
 				/*
 				 * we can not ignore partitions of broken tables
@@ -620,7 +672,6 @@ try_scan:
 				 * we limit them to the end of the disk to avoid
 				 * creating invalid block devices
 				 */
-				printk(KERN_CONT "limited to end of disk\n");
 				size = get_capacity(disk) - from;
 			}
 		}
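The old in-place try_scan retry is replaced by a rescan label plus the new disk_unlock_native_capacity() helper: any beyond-EOD condition, whether an I/O error (-ENOSPC), a partial table read, or an oversized partition, funnels into one unlock-and-retry path that fires at most once per disk thanks to the GENHD_FL_NATIVE_CAPACITY flag. A userspace model of that control flow — struct disk and scan() are invented for illustration:

#include <stdio.h>
#include <stdbool.h>

struct disk {
	bool native_capacity;	/* models GENHD_FL_NATIVE_CAPACITY */
	long capacity;		/* sectors currently visible */
	long real_capacity;	/* sectors hidden behind e.g. an HPA */
};

/* Unlock at most once; a second failure reports truncation instead. */
static bool disk_unlock_native_capacity(struct disk *d)
{
	if (!d->native_capacity) {
		printf("enabling native capacity\n");
		d->capacity = d->real_capacity;
		d->native_capacity = true;
		return true;		/* caller should rescan */
	}
	printf("truncated\n");
	return false;
}

/* Stand-in for a partition scan: fails if the table points past EOD. */
static bool scan(const struct disk *d, long table_end)
{
	return table_end <= d->capacity;
}

int main(void)
{
	struct disk d = { false, 1000, 2000 };
	long table_end = 1500;	/* table claims sectors past visible EOD */

rescan:
	if (!scan(&d, table_end)) {
		printf("partition table beyond EOD, ");
		if (disk_unlock_native_capacity(&d))
			goto rescan;
		return 1;
	}
	printf("scan ok at capacity %ld\n", d.capacity);
	return 0;
}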
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..8e4e103ba216 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
  * description.
  */
 struct parsed_partitions {
+	struct block_device *bdev;
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
@@ -14,15 +15,30 @@ struct parsed_partitions {
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
+	bool access_beyond_eod;
+	char *pp_buf;
 };
 
+static inline void *read_part_sector(struct parsed_partitions *state,
+				     sector_t n, Sector *p)
+{
+	if (n >= get_capacity(state->bdev->bd_disk)) {
+		state->access_beyond_eod = true;
+		return NULL;
+	}
+	return read_dev_sector(state->bdev, n, p);
+}
+
 static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {
 	if (n < p->limit) {
+		char tmp[1 + BDEVNAME_SIZE + 10 + 1];
+
 		p->parts[n].from = from;
 		p->parts[n].size = size;
-		printk(" %s%d", p->name, n);
+		snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
+		strlcat(p->pp_buf, tmp, PAGE_SIZE);
 	}
 }
 
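read_part_sector() is the linchpin of the whole series: it refuses any read at or past the device capacity and records the attempt in access_beyond_eod, which is what later lets rescan_partitions() decide to unlock native capacity. A userspace model of just that gate — struct state stands in for parsed_partitions:

#include <stdio.h>
#include <stdbool.h>

struct state {
	long capacity;		/* models get_capacity(bdev->bd_disk) */
	bool access_beyond_eod;
};

/* Refuse beyond-EOD reads but remember that one was attempted. */
static const char *read_part_sector(struct state *s, long n)
{
	static const char sector[512] = "data";

	if (n >= s->capacity) {
		s->access_beyond_eod = true;
		return NULL;
	}
	return sector;
}

int main(void)
{
	struct state s = { 100, false };

	printf("%s\n", read_part_sector(&s, 10) ? "ok" : "null");	/* ok */
	printf("%s\n", read_part_sector(&s, 100) ? "ok" : "null");	/* null */
	printf("beyond_eod=%d\n", s.access_beyond_eod);			/* 1 */
	return 0;
}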
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 91babdae7587..dbb44d4bb8a7 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
  * the part[0] entry for this disk, and is the number of
  * physical sectors available on the disk.
  */
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
 {
 	if (!bdev || !bdev->bd_inode)
 		return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
  * @lba
  * @buffer
  * @size_t
  *
- * Description: Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
  */
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+		       u64 lba, u8 *buffer, size_t count)
 {
 	size_t totalreadcount = 0;
+	struct block_device *bdev = state->bdev;
 	sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
 
-	if (!bdev || !buffer || lba > last_lba(bdev))
+	if (!buffer || lba > last_lba(bdev))
 		return 0;
 
 	while (count) {
 		int copied = 512;
 		Sector sect;
-		unsigned char *data = read_dev_sector(bdev, n++, &sect);
+		unsigned char *data = read_part_sector(state, n++, &sect);
 		if (!data)
 			break;
 		if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
  * @gpt - GPT header
  *
  * Description: Returns ptes on success, NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
  * Notes: remember to free pte when you're done!
  */
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+					 gpt_header *gpt)
 {
 	size_t count;
 	gpt_entry *pte;
-	if (!bdev || !gpt)
+
+	if (!gpt)
 		return NULL;
 
 	count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 	if (!pte)
 		return NULL;
 
-	if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+	if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
 			(u8 *) pte,
 			count) < count) {
 		kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
  * @lba is the Logical Block Address of the partition table
  *
  * Description: returns GPT header on success, NULL on error.   Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @ from @state->bdev.
  * Note: remember to free gpt when finished with it.
  */
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+					 u64 lba)
 {
 	gpt_header *gpt;
-	unsigned ssz = bdev_logical_block_size(bdev);
-
-	if (!bdev)
-		return NULL;
+	unsigned ssz = bdev_logical_block_size(state->bdev);
 
 	gpt = kzalloc(ssz, GFP_KERNEL);
 	if (!gpt)
 		return NULL;
 
-	if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+	if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
 		kfree(gpt);
 		gpt=NULL;
 		return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
  * @lba is the logical block address of the GPT header to test
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  */
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
-	     gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+			gpt_header **gpt, gpt_entry **ptes)
 {
 	u32 crc, origcrc;
 	u64 lastlba;
 
-	if (!bdev || !gpt || !ptes)
+	if (!ptes)
 		return 0;
-	if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+	if (!(*gpt = alloc_read_gpt_header(state, lba)))
 		return 0;
 
 	/* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	/* Check the first_usable_lba and last_usable_lba are
 	 * within the disk.
 	 */
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
 		pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
 			 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 		goto fail;
 	}
 
-	if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+	if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
 		goto fail;
 
 	/* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @bdev
+ * @state
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
  * Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
  * This protects against devices which misreport their size, and forces
  * the user to decide to use the Alternate GPT.
  */
-static int
-find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+			  gpt_entry **ptes)
 {
 	int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
 	gpt_header *pgpt = NULL, *agpt = NULL;
 	gpt_entry *pptes = NULL, *aptes = NULL;
 	legacy_mbr *legacymbr;
 	u64 lastlba;
-	if (!bdev || !gpt || !ptes)
+
+	if (!ptes)
 		return 0;
 
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (!force_gpt) {
 		/* This will be added to the EFI Spec. per Intel after v1.02. */
 		legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
 		if (legacymbr) {
-			read_lba(bdev, 0, (u8 *) legacymbr,
+			read_lba(state, 0, (u8 *) legacymbr,
 				 sizeof (*legacymbr));
 			good_pmbr = is_pmbr_valid(legacymbr);
 			kfree(legacymbr);
 		}
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 			goto fail;
 	}
 
-	good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA,
+	good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
 				 &pgpt, &pptes);
 	if (good_pgpt)
-		good_agpt = is_gpt_valid(bdev,
+		good_agpt = is_gpt_valid(state,
 					 le64_to_cpu(pgpt->alternate_lba),
 					 &agpt, &aptes);
 	if (!good_agpt && force_gpt)
-		good_agpt = is_gpt_valid(bdev, lastlba,
-					 &agpt, &aptes);
+		good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
 
 	/* The obviously unsuccessful case */
 	if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 }
 
 /**
- * efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+ * efi_partition(struct parsed_partitions *state)
  * @state
- * @bdev
  *
  * Description: called from check.c, if the disk contains GPT
  * partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
  *  1 if successful
  *
  */
-int
-efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int efi_partition(struct parsed_partitions *state)
 {
 	gpt_header *gpt = NULL;
 	gpt_entry *ptes = NULL;
 	u32 i;
-	unsigned ssz = bdev_logical_block_size(bdev) / 512;
+	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
 
-	if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
+	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
 		kfree(ptes);
 		return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
 
-		if (!is_pte_valid(&ptes[i], last_lba(bdev)))
+		if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
 			continue;
 
 		put_partition(state, i+1, start * ssz, size * ssz);
@@ -631,10 +626,10 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		/* If this is a RAID volume, tell md */
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
 				 PARTITION_LINUX_RAID_GUID))
-			state->parts[i+1].flags = 1;
+			state->parts[i + 1].flags = ADDPART_FLAG_RAID;
 	}
 	kfree(ptes);
 	kfree(gpt);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
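Worth noting in efi_partition(): GPT LBAs are in logical-block units while put_partition() and read_part_sector() work in 512-byte sectors, hence the ssz = bdev_logical_block_size()/512 scale factor applied to both start and size. A plain arithmetic sketch of that conversion — the 4096-byte logical block size is just an example value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned lbs = 4096;		/* e.g. a 4Kn disk */
	unsigned ssz = lbs / 512;	/* scale factor, as in efi_partition() */
	uint64_t start_lba = 34, size_lba = 2048;

	/* These are the values handed to put_partition(). */
	printf("start sector %llu, size %llu sectors\n",
	       (unsigned long long)(start_lba * ssz),
	       (unsigned long long)(size_lba * ssz));
	return 0;
}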
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
 } __attribute__ ((packed)) legacy_mbr;
 
 /* Functions */
-extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int efi_partition(struct parsed_partitions *state);
 
 #endif
 
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..d513a07f44bb 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 
 /*
  */
-int
-ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ibm_partition(struct parsed_partitions *state)
 {
+	struct block_device *bdev = state->bdev;
 	int blocksize, res;
 	loff_t i_size, offset, size, fmt_size;
 	dasd_information2_t *info;
@@ -74,6 +74,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	} *label;
 	unsigned char *data;
 	Sector sect;
+	sector_t labelsect;
+	char tmp[64];
 
 	res = 0;
 	blocksize = bdev_logical_block_size(bdev);
@@ -98,9 +100,19 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		goto out_freeall;
 
 	/*
+	 * Special case for FBA disks: label sector does not depend on
+	 * blocksize.
+	 */
+	if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+	    (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+		labelsect = info->label_block;
+	else
+		labelsect = info->label_block * (blocksize >> 9);
+
+	/*
 	 * Get volume label, extract name and type.
 	 */
-	data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect);
+	data = read_part_sector(state, labelsect, &sect);
 	if (data == NULL)
 		goto out_readerr;
 
@@ -133,13 +145,15 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		 */
 		blocksize = label->cms.block_size;
 		if (label->cms.disk_offset != 0) {
-			printk("CMS1/%8s(MDSK):", name);
+			snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 			/* disk is reserved minidisk */
 			offset = label->cms.disk_offset;
 			size = (label->cms.block_count - 1)
 				* (blocksize >> 9);
 		} else {
-			printk("CMS1/%8s:", name);
+			snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 			offset = (info->label_block + 1);
 			size = label->cms.block_count
 				* (blocksize >> 9);
@@ -148,7 +162,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
148 size-offset*(blocksize >> 9)); 162 size-offset*(blocksize >> 9));
149 } else { 163 } else {
150 if (strncmp(type, "LNX1", 4) == 0) { 164 if (strncmp(type, "LNX1", 4) == 0) {
151 printk("LNX1/%8s:", name); 165 snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
166 strlcat(state->pp_buf, tmp, PAGE_SIZE);
152 if (label->lnx.ldl_version == 0xf2) { 167 if (label->lnx.ldl_version == 0xf2) {
153 fmt_size = label->lnx.formatted_blocks 168 fmt_size = label->lnx.formatted_blocks
154 * (blocksize >> 9); 169 * (blocksize >> 9);
@@ -167,7 +182,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
167 offset = (info->label_block + 1); 182 offset = (info->label_block + 1);
168 } else { 183 } else {
169 /* unlabeled disk */ 184 /* unlabeled disk */
170 printk("(nonl)"); 185 strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
171 size = i_size >> 9; 186 size = i_size >> 9;
172 offset = (info->label_block + 1); 187 offset = (info->label_block + 1);
173 } 188 }
@@ -186,15 +201,16 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
186 * if not, something is wrong, skipping partition detection 201 * if not, something is wrong, skipping partition detection
187 */ 202 */
188 if (strncmp(type, "VOL1", 4) == 0) { 203 if (strncmp(type, "VOL1", 4) == 0) {
189 printk("VOL1/%8s:", name); 204 snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
205 strlcat(state->pp_buf, tmp, PAGE_SIZE);
190 /* 206 /*
191 * get block number and read then go through format1 207 * get block number and read then go through format1
192 * labels 208 * labels
193 */ 209 */
194 blk = cchhb2blk(&label->vol.vtoc, geo) + 1; 210 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
195 counter = 0; 211 counter = 0;
196 data = read_dev_sector(bdev, blk * (blocksize/512), 212 data = read_part_sector(state, blk * (blocksize/512),
197 &sect); 213 &sect);
198 while (data != NULL) { 214 while (data != NULL) {
199 struct vtoc_format1_label f1; 215 struct vtoc_format1_label f1;
200 216
@@ -208,9 +224,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
208 || f1.DS1FMTID == _ascebc['7'] 224 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) { 225 || f1.DS1FMTID == _ascebc['9']) {
210 blk++; 226 blk++;
211 data = read_dev_sector(bdev, blk * 227 data = read_part_sector(state,
212 (blocksize/512), 228 blk * (blocksize/512), &sect);
213 &sect);
214 continue; 229 continue;
215 } 230 }
216 231
@@ -230,9 +245,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
230 size * (blocksize >> 9)); 245 size * (blocksize >> 9));
231 counter++; 246 counter++;
232 blk++; 247 blk++;
233 data = read_dev_sector(bdev, 248 data = read_part_sector(state,
234 blk * (blocksize/512), 249 blk * (blocksize/512), &sect);
235 &sect);
236 } 250 }
237 251
238 if (!data) 252 if (!data)
@@ -244,7 +258,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
244 258
245 } 259 }
246 260
247 printk("\n"); 261 strlcat(state->pp_buf, "\n", PAGE_SIZE);
248 goto out_freeall; 262 goto out_freeall;
249 263
250 264
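
Every read_dev_sector(bdev, ...) call in these checkers becomes read_part_sector(state, ...). The helper itself is not shown in this diff; presumably it is a thin inline in fs/partitions/check.h that pulls the device out of the state, roughly like the sketch below (the stand-in types exist only to make the fragment self-contained, and the whole shape is an inference, not quoted code):

/* Stand-ins for the real kernel types so this compiles on its own. */
typedef unsigned long long sector_t;
typedef struct { void *v; } Sector;
struct block_device;
struct parsed_partitions {
        struct block_device *bdev;      /* device being scanned */
        char *pp_buf;                   /* PAGE_SIZE output buffer */
};

unsigned char *read_dev_sector(struct block_device *bdev, sector_t n,
                               Sector *p);

/* Assumed shape of the new helper: forward to read_dev_sector() using
 * the device carried in the state, so checkers no longer need a
 * separate bdev argument threaded through every call. */
static inline unsigned char *read_part_sector(struct parsed_partitions *state,
                                              sector_t n, Sector *p)
{
        return read_dev_sector(state->bdev, n, p);
}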
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
int ibm_partition(struct parsed_partitions *, struct block_device *); int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..0ea19312706b 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
@@ -50,7 +50,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
50 } 50 }
51 slot++; 51 slot++;
52 } 52 }
53 printk("\n"); 53 strlcat(state->pp_buf, "\n", PAGE_SIZE);
54 put_dev_sector(sect); 54 put_dev_sector(sect);
55 return 1; 55 return 1;
56} 56}
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
4 4
5#define KARMA_LABEL_MAGIC 0xAB56 5#define KARMA_LABEL_MAGIC 0xAB56
6 6
7int karma_partition(struct parsed_partitions *state, struct block_device *bdev); 7int karma_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..5bf8a04b5d9b 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/stringify.h> 28#include <linux/stringify.h>
29#include <linux/kernel.h>
29#include "ldm.h" 30#include "ldm.h"
30#include "check.h" 31#include "check.h"
31#include "msdos.h" 32#include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
77 int h; 78 int h;
78 79
79 /* high part */ 80 /* high part */
80 if ((x = src[0] - '0') <= '9'-'0') h = x; 81 x = h = hex_to_bin(src[0]);
81 else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; 82 if (h < 0)
82 else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; 83 return -1;
83 else return -1;
84 h <<= 4;
85 84
86 /* low part */ 85 /* low part */
87 if ((x = src[1] - '0') <= '9'-'0') return h | x; 86 h = hex_to_bin(src[1]);
88 if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); 87 if (h < 0)
89 if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); 88 return -1;
90 return -1; 89
90 return (x << 4) + h;
91} 91}
92 92
93/** 93/**
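
The rewritten ldm_parse_hexbyte() drops the open-coded character-range tests in favour of hex_to_bin() from <linux/kernel.h> (hence the include added at the top of the file), which maps '0'-'9', 'a'-'f' and 'A'-'F' to 0-15 and anything else to a negative value. A self-contained userspace equivalent of the new logic, with hex_to_bin() reimplemented:

#include <stdio.h>

/* Userspace stand-in for the kernel's hex_to_bin(). */
static int hex_to_bin(char ch)
{
        if (ch >= '0' && ch <= '9')
                return ch - '0';
        if (ch >= 'a' && ch <= 'f')
                return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F')
                return ch - 'A' + 10;
        return -1;
}

/* Mirrors the new ldm_parse_hexbyte(): high nibble first, and any
 * non-hex character makes the whole byte invalid. */
static int parse_hexbyte(const char *src)
{
        int hi = hex_to_bin(src[0]);
        int lo = hex_to_bin(src[1]);

        if (hi < 0 || lo < 0)
                return -1;
        return (hi << 4) + lo;
}

int main(void)
{
        printf("%d\n", parse_hexbyte("4f"));    /* 79 */
        printf("%d\n", parse_hexbyte("zz"));    /* -1 */
        return 0;
}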
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
309 309
310/** 310/**
311 * ldm_validate_privheads - Compare the primary privhead with its backups 311 * ldm_validate_privheads - Compare the primary privhead with its backups
312 * @bdev: Device holding the LDM Database 312 * @state: Partition check state including device holding the LDM Database
313 * @ph1: Memory struct to fill with ph contents 313 * @ph1: Memory struct to fill with ph contents
314 * 314 *
315 * Read and compare all three privheads from disk. 315 * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
321 * Return: 'true' Success 321 * Return: 'true' Success
322 * 'false' Error 322 * 'false' Error
323 */ 323 */
324static bool ldm_validate_privheads (struct block_device *bdev, 324static bool ldm_validate_privheads(struct parsed_partitions *state,
325 struct privhead *ph1) 325 struct privhead *ph1)
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -643,7 +643,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
643 return false; 643 return false;
644 } 644 }
645 645
646 printk (" [LDM]"); 646 strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
647 647
648 /* Create the data partitions */ 648 /* Create the data partitions */
649 list_for_each (item, &ldb->v_part) { 649 list_for_each (item, &ldb->v_part) {
@@ -658,7 +658,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
658 part_num++; 658 part_num++;
659 } 659 }
660 660
661 printk ("\n"); 661 strlcat(pp->pp_buf, "\n", PAGE_SIZE);
662 return true; 662 return true;
663} 663}
664 664
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..68d6a216ee79 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -59,12 +59,12 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
59 put_dev_sector(sect); 59 put_dev_sector(sect);
60 return 0; /* not a MacOS disk */ 60 return 0; /* not a MacOS disk */
61 } 61 }
62 printk(" [mac]"); 62 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
63 blocks_in_map = be32_to_cpu(part->map_count); 63 blocks_in_map = be32_to_cpu(part->map_count);
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
75 be32_to_cpu(part->block_count) * (secsize/512)); 75 be32_to_cpu(part->block_count) * (secsize/512));
76 76
77 if (!strnicmp(part->type, "Linux_RAID", 10)) 77 if (!strnicmp(part->type, "Linux_RAID", 10))
78 state->parts[slot].flags = 1; 78 state->parts[slot].flags = ADDPART_FLAG_RAID;
79#ifdef CONFIG_PPC_PMAC 79#ifdef CONFIG_PPC_PMAC
80 /* 80 /*
81 * If this is the first bootable partition, tell the 81 * If this is the first bootable partition, tell the
@@ -123,10 +123,11 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
130 printk("\n"); 131 strlcat(state->pp_buf, "\n", PAGE_SIZE);
131 return 1; 132 return 1;
132} 133}
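
Here, as in efi.c above and msdos.c below, the bare flags = 1 becomes ADDPART_FLAG_RAID: the flag travels with the registered partition and is what triggers md's RAID autodetection, so the named constant records the intent the magic number hid. A tiny sketch of the idiom; the flag values are an assumption about the genhd headers of this era, not quoted from this diff:

#include <stdio.h>

/* Assumed values; ADDPART_FLAG_RAID matching the old flags = 1. */
#define ADDPART_FLAG_NONE       0
#define ADDPART_FLAG_RAID       1
#define ADDPART_FLAG_WHOLEDISK  2

struct parsed_part { int flags; };

int main(void)
{
        struct parsed_part p = { ADDPART_FLAG_NONE };

        p.flags = ADDPART_FLAG_RAID;    /* was: p.flags = 1; */
        printf("%d\n", p.flags);
        return 0;
}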
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 90be97f1f5a8..5f79a6677c69 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
64#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{ 68{
69 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect; 70 Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
85 is_extended_partition(pt)) 85 is_extended_partition(pt))
86 return 0; 86 return 0;
87 } 87 }
88 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
89 if (d) { 89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1; 91 ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
105 * only for the actual data partitions. 105 * only for the actual data partitions.
106 */ 106 */
107 107
108static void 108static void parse_extended(struct parsed_partitions *state,
109parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
110 sector_t first_sector, sector_t first_size)
111{ 110{
112 struct partition *p; 111 struct partition *p;
113 Sector sect; 112 Sector sect;
114 unsigned char *data; 113 unsigned char *data;
115 sector_t this_sector, this_size; 114 sector_t this_sector, this_size;
116 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
117 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
118 without finding a data partition */ 117 without finding a data partition */
119 int i; 118 int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
126 return; 125 return;
127 if (state->next == state->limit) 126 if (state->next == state->limit)
128 return; 127 return;
129 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
130 if (!data) 129 if (!data)
131 return; 130 return;
132 131
@@ -198,9 +197,8 @@ done:
198/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
199 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
200 199
201static void 200static void parse_solaris_x86(struct parsed_partitions *state,
202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
203 sector_t offset, sector_t size, int origin)
204{ 202{
205#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
206 Sector sect; 204 Sector sect;
@@ -208,17 +206,25 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
208 int i; 206 int i;
209 short max_nparts; 207 short max_nparts;
210 208
211 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
212 if (!v) 210 if (!v)
213 return; 211 return;
214 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
215 put_dev_sector(sect); 213 put_dev_sector(sect);
216 return; 214 return;
217 } 215 }
218 printk(" %s%d: <solaris:", state->name, origin); 216 {
217 char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
218
219 snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
220 strlcat(state->pp_buf, tmp, PAGE_SIZE);
221 }
219 if (le32_to_cpu(v->v_version) != 1) { 222 if (le32_to_cpu(v->v_version) != 1) {
220 printk(" cannot handle version %d vtoc>\n", 223 char tmp[64];
221 le32_to_cpu(v->v_version)); 224
225 snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
226 le32_to_cpu(v->v_version));
227 strlcat(state->pp_buf, tmp, PAGE_SIZE);
222 put_dev_sector(sect); 228 put_dev_sector(sect);
223 return; 229 return;
224 } 230 }
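
The odd-looking bound char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1] above is worst-case sizing for the snprintf() that fills it: one leading space, up to BDEVNAME_SIZE characters of state->name, up to ten digits for origin, the eleven characters of the ": <solaris:" literal, and the terminating NUL. parse_unixware() and parse_minix() below size their buffers the same way, with 12 and 9 for their literals. A quick check of the arithmetic, taking BDEVNAME_SIZE as the 32 the kernel used at the time (an assumption, not shown in this diff):

#include <stdio.h>
#include <string.h>

#define BDEVNAME_SIZE 32        /* assumed era value */

int main(void)
{
        /* 1 space + name + 10 digits + strlen(": <solaris:") + NUL */
        char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];

        snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", "sda", 1);
        printf("%zu bytes used of %zu\n", strlen(tmp) + 1, sizeof(tmp));
        return 0;
}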
@@ -226,9 +232,12 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
226 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; 232 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
227 for (i=0; i<max_nparts && state->next<state->limit; i++) { 233 for (i=0; i<max_nparts && state->next<state->limit; i++) {
228 struct solaris_x86_slice *s = &v->v_slice[i]; 234 struct solaris_x86_slice *s = &v->v_slice[i];
235 char tmp[3 + 10 + 1 + 1];
236
229 if (s->s_size == 0) 237 if (s->s_size == 0)
230 continue; 238 continue;
231 printk(" [s%d]", i); 239 snprintf(tmp, sizeof(tmp), " [s%d]", i);
240 strlcat(state->pp_buf, tmp, PAGE_SIZE);
232 /* solaris partitions are relative to current MS-DOS 241 /* solaris partitions are relative to current MS-DOS
233 * one; must add the offset of the current partition */ 242 * one; must add the offset of the current partition */
234 put_partition(state, state->next++, 243 put_partition(state, state->next++,
@@ -236,7 +245,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
236 le32_to_cpu(s->s_size)); 245 le32_to_cpu(s->s_size));
237 } 246 }
238 put_dev_sector(sect); 247 put_dev_sector(sect);
239 printk(" >\n"); 248 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
240#endif 249#endif
241} 250}
242 251
@@ -245,23 +254,25 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
245 * Create devices for BSD partitions listed in a disklabel, under a 254 * Create devices for BSD partitions listed in a disklabel, under a
246 * dos-like partition. See parse_extended() for more information. 255 * dos-like partition. See parse_extended() for more information.
247 */ 256 */
248static void 257static void parse_bsd(struct parsed_partitions *state,
249parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 258 sector_t offset, sector_t size, int origin, char *flavour,
250 sector_t offset, sector_t size, int origin, char *flavour, 259 int max_partitions)
251 int max_partitions)
252{ 260{
253 Sector sect; 261 Sector sect;
254 struct bsd_disklabel *l; 262 struct bsd_disklabel *l;
255 struct bsd_partition *p; 263 struct bsd_partition *p;
264 char tmp[64];
256 265
257 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 266 l = read_part_sector(state, offset + 1, &sect);
258 if (!l) 267 if (!l)
259 return; 268 return;
260 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 269 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
261 put_dev_sector(sect); 270 put_dev_sector(sect);
262 return; 271 return;
263 } 272 }
264 printk(" %s%d: <%s:", state->name, origin, flavour); 273
274 snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
275 strlcat(state->pp_buf, tmp, PAGE_SIZE);
265 276
266 if (le16_to_cpu(l->d_npartitions) < max_partitions) 277 if (le16_to_cpu(l->d_npartitions) < max_partitions)
267 max_partitions = le16_to_cpu(l->d_npartitions); 278 max_partitions = le16_to_cpu(l->d_npartitions);
@@ -278,46 +289,43 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
278 /* full parent partition, we have it already */ 289 /* full parent partition, we have it already */
279 continue; 290 continue;
280 if (offset > bsd_start || offset+size < bsd_start+bsd_size) { 291 if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
281 printk("bad subpartition - ignored\n"); 292 strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
282 continue; 293 continue;
283 } 294 }
284 put_partition(state, state->next++, bsd_start, bsd_size); 295 put_partition(state, state->next++, bsd_start, bsd_size);
285 } 296 }
286 put_dev_sector(sect); 297 put_dev_sector(sect);
287 if (le16_to_cpu(l->d_npartitions) > max_partitions) 298 if (le16_to_cpu(l->d_npartitions) > max_partitions) {
288 printk(" (ignored %d more)", 299 snprintf(tmp, sizeof(tmp), " (ignored %d more)",
289 le16_to_cpu(l->d_npartitions) - max_partitions); 300 le16_to_cpu(l->d_npartitions) - max_partitions);
290 printk(" >\n"); 301 strlcat(state->pp_buf, tmp, PAGE_SIZE);
302 }
303 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
291} 304}
292#endif 305#endif
293 306
294static void 307static void parse_freebsd(struct parsed_partitions *state,
295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
296 sector_t offset, sector_t size, int origin)
297{ 309{
298#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
299 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
300 "bsd", BSD_MAXPARTITIONS);
301#endif 312#endif
302} 313}
303 314
304static void 315static void parse_netbsd(struct parsed_partitions *state,
305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 316 sector_t offset, sector_t size, int origin)
306 sector_t offset, sector_t size, int origin)
307{ 317{
308#ifdef CONFIG_BSD_DISKLABEL 318#ifdef CONFIG_BSD_DISKLABEL
309 parse_bsd(state, bdev, offset, size, origin, 319 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
310 "netbsd", BSD_MAXPARTITIONS);
311#endif 320#endif
312} 321}
313 322
314static void 323static void parse_openbsd(struct parsed_partitions *state,
315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 324 sector_t offset, sector_t size, int origin)
316 sector_t offset, sector_t size, int origin)
317{ 325{
318#ifdef CONFIG_BSD_DISKLABEL 326#ifdef CONFIG_BSD_DISKLABEL
319 parse_bsd(state, bdev, offset, size, origin, 327 parse_bsd(state, offset, size, origin, "openbsd",
320 "openbsd", OPENBSD_MAXPARTITIONS); 328 OPENBSD_MAXPARTITIONS);
321#endif 329#endif
322} 330}
323 331
@@ -325,16 +333,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
325 * Create devices for Unixware partitions listed in a disklabel, under a 333 * Create devices for Unixware partitions listed in a disklabel, under a
326 * dos-like partition. See parse_extended() for more information. 334 * dos-like partition. See parse_extended() for more information.
327 */ 335 */
328static void 336static void parse_unixware(struct parsed_partitions *state,
329parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 337 sector_t offset, sector_t size, int origin)
330 sector_t offset, sector_t size, int origin)
331{ 338{
332#ifdef CONFIG_UNIXWARE_DISKLABEL 339#ifdef CONFIG_UNIXWARE_DISKLABEL
333 Sector sect; 340 Sector sect;
334 struct unixware_disklabel *l; 341 struct unixware_disklabel *l;
335 struct unixware_slice *p; 342 struct unixware_slice *p;
336 343
337 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 344 l = read_part_sector(state, offset + 29, &sect);
338 if (!l) 345 if (!l)
339 return; 346 return;
340 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 347 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -342,7 +349,12 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
342 put_dev_sector(sect); 349 put_dev_sector(sect);
343 return; 350 return;
344 } 351 }
345 printk(" %s%d: <unixware:", state->name, origin); 352 {
353 char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
354
355 snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
356 strlcat(state->pp_buf, tmp, PAGE_SIZE);
357 }
346 p = &l->vtoc.v_slice[1]; 358 p = &l->vtoc.v_slice[1];
347 /* I omit the 0th slice as it is the same as whole disk. */ 359 /* I omit the 0th slice as it is the same as whole disk. */
348 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { 360 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
@@ -356,7 +368,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
356 p++; 368 p++;
357 } 369 }
358 put_dev_sector(sect); 370 put_dev_sector(sect);
359 printk(" >\n"); 371 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
360#endif 372#endif
361} 373}
362 374
@@ -365,9 +377,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
365 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 377 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
366 * Rajeev V. Pillai <rajeevvp@yahoo.com> 378 * Rajeev V. Pillai <rajeevvp@yahoo.com>
367 */ 379 */
368static void 380static void parse_minix(struct parsed_partitions *state,
369parse_minix(struct parsed_partitions *state, struct block_device *bdev, 381 sector_t offset, sector_t size, int origin)
370 sector_t offset, sector_t size, int origin)
371{ 382{
372#ifdef CONFIG_MINIX_SUBPARTITION 383#ifdef CONFIG_MINIX_SUBPARTITION
373 Sector sect; 384 Sector sect;
@@ -375,7 +386,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
375 struct partition *p; 386 struct partition *p;
376 int i; 387 int i;
377 388
378 data = read_dev_sector(bdev, offset, &sect); 389 data = read_part_sector(state, offset, &sect);
379 if (!data) 390 if (!data)
380 return; 391 return;
381 392
@@ -386,8 +397,10 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
386 * the normal boot sector. */ 397 * the normal boot sector. */
387 if (msdos_magic_present (data + 510) && 398 if (msdos_magic_present (data + 510) &&
388 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ 399 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
400 char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
389 401
390 printk(" %s%d: <minix:", state->name, origin); 402 snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
403 strlcat(state->pp_buf, tmp, PAGE_SIZE);
391 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { 404 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
392 if (state->next == state->limit) 405 if (state->next == state->limit)
393 break; 406 break;
@@ -396,7 +409,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
396 put_partition(state, state->next++, 409 put_partition(state, state->next++,
397 start_sect(p), nr_sects(p)); 410 start_sect(p), nr_sects(p));
398 } 411 }
399 printk(" >\n"); 412 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
400 } 413 }
401 put_dev_sector(sect); 414 put_dev_sector(sect);
402#endif /* CONFIG_MINIX_SUBPARTITION */ 415#endif /* CONFIG_MINIX_SUBPARTITION */
@@ -404,8 +417,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
404 417
405static struct { 418static struct {
406 unsigned char id; 419 unsigned char id;
407 void (*parse)(struct parsed_partitions *, struct block_device *, 420 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
408 sector_t, sector_t, int);
409} subtypes[] = { 421} subtypes[] = {
410 {FREEBSD_PARTITION, parse_freebsd}, 422 {FREEBSD_PARTITION, parse_freebsd},
411 {NETBSD_PARTITION, parse_netbsd}, 423 {NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +429,16 @@ static struct {
417 {0, NULL}, 429 {0, NULL},
418}; 430};
419 431
420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 432int msdos_partition(struct parsed_partitions *state)
421{ 433{
422 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 434 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
423 Sector sect; 435 Sector sect;
424 unsigned char *data; 436 unsigned char *data;
425 struct partition *p; 437 struct partition *p;
426 struct fat_boot_sector *fb; 438 struct fat_boot_sector *fb;
427 int slot; 439 int slot;
428 440
429 data = read_dev_sector(bdev, 0, &sect); 441 data = read_part_sector(state, 0, &sect);
430 if (!data) 442 if (!data)
431 return -1; 443 return -1;
432 if (!msdos_magic_present(data + 510)) { 444 if (!msdos_magic_present(data + 510)) {
@@ -434,9 +446,9 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
434 return 0; 446 return 0;
435 } 447 }
436 448
437 if (aix_magic_present(data, bdev)) { 449 if (aix_magic_present(state, data)) {
438 put_dev_sector(sect); 450 put_dev_sector(sect);
439 printk( " [AIX]"); 451 strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
440 return 0; 452 return 0;
441 } 453 }
442 454
@@ -457,7 +469,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
457 fb = (struct fat_boot_sector *) data; 469 fb = (struct fat_boot_sector *) data;
458 if (slot == 1 && fb->reserved && fb->fats 470 if (slot == 1 && fb->reserved && fb->fats
459 && fat_valid_media(fb->media)) { 471 && fat_valid_media(fb->media)) {
460 printk("\n"); 472 strlcat(state->pp_buf, "\n", PAGE_SIZE);
461 put_dev_sector(sect); 473 put_dev_sector(sect);
462 return 1; 474 return 1;
463 } else { 475 } else {
@@ -502,21 +514,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
502 n = min(size, max(sector_size, n)); 514 n = min(size, max(sector_size, n));
503 put_partition(state, slot, start, n); 515 put_partition(state, slot, start, n);
504 516
505 printk(" <"); 517 strlcat(state->pp_buf, " <", PAGE_SIZE);
506 parse_extended(state, bdev, start, size); 518 parse_extended(state, start, size);
507 printk(" >"); 519 strlcat(state->pp_buf, " >", PAGE_SIZE);
508 continue; 520 continue;
509 } 521 }
510 put_partition(state, slot, start, size); 522 put_partition(state, slot, start, size);
511 if (SYS_IND(p) == LINUX_RAID_PARTITION) 523 if (SYS_IND(p) == LINUX_RAID_PARTITION)
512 state->parts[slot].flags = 1; 524 state->parts[slot].flags = ADDPART_FLAG_RAID;
513 if (SYS_IND(p) == DM6_PARTITION) 525 if (SYS_IND(p) == DM6_PARTITION)
514 printk("[DM]"); 526 strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
515 if (SYS_IND(p) == EZD_PARTITION) 527 if (SYS_IND(p) == EZD_PARTITION)
516 printk("[EZD]"); 528 strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
517 } 529 }
518 530
519 printk("\n"); 531 strlcat(state->pp_buf, "\n", PAGE_SIZE);
520 532
521 /* second pass - output for each on a separate line */ 533 /* second pass - output for each on a separate line */
522 p = (struct partition *) (0x1be + data); 534 p = (struct partition *) (0x1be + data);
@@ -532,8 +544,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
532 544
533 if (!subtypes[n].parse) 545 if (!subtypes[n].parse)
534 continue; 546 continue;
535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size, 547 subtypes[n].parse(state, start_sect(p) * sector_size,
536 nr_sects(p)*sector_size, slot); 548 nr_sects(p) * sector_size, slot);
537 } 549 }
538 put_dev_sector(sect); 550 put_dev_sector(sect);
539 return 1; 551 return 1;
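
With the device folded into the parse state, the subtypes[] function-pointer type above shrinks to (state, offset, size, origin) and each parse_* helper reads sectors through read_part_sector(state, ...) on its own. A reduced, compilable model of that table-driven dispatch (the struct layout, handler body and the 0xa5 FreeBSD type ID are illustrative, not quoted from the diff):

#include <stdio.h>

typedef unsigned long long sector_t;

struct parsed_partitions { const char *name; };

static void parse_freebsd(struct parsed_partitions *state,
                          sector_t offset, sector_t size, int origin)
{
        printf(" %s%d: <bsd @%llu+%llu>", state->name, origin, offset, size);
}

/* Same shape as the new subtypes[] table: no block_device in sight. */
static struct {
        unsigned char id;
        void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
} subtypes[] = {
        { 0xa5, parse_freebsd },        /* FREEBSD_PARTITION */
        { 0, NULL },
};

int main(void)
{
        struct parsed_partitions state = { "sda" };
        unsigned char id = 0xa5;

        for (int n = 0; subtypes[n].parse; n++)
                if (subtypes[n].id == id)
                        subtypes[n].parse(&state, 63, 1024, 1);
        printf("\n");
        return 0;
}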
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..48cec7cbca17 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
@@ -72,7 +72,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
72 le32_to_cpu(partition->p_size)); 72 le32_to_cpu(partition->p_size));
73 slot++; 73 slot++;
74 } 74 }
75 printk("\n"); 75 strlcat(state->pp_buf, "\n", PAGE_SIZE);
76 put_dev_sector(sect); 76 put_dev_sector(sect);
77 return 1; 77 return 1;
78} 78}
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..ea8a86dceaf4 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
@@ -76,7 +76,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
76 } 76 }
77 slot++; 77 slot++;
78 } 78 }
79 printk("\n"); 79 strlcat(state->pp_buf, "\n", PAGE_SIZE);
80 put_dev_sector(sect); 80 put_dev_sector(sect);
81 return 1; 81 return 1;
82} 82}
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..b5b6fcfb3d36 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
@@ -116,7 +116,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
116 } 116 }
117 slot++; 117 slot++;
118 } 118 }
119 printk("\n"); 119 strlcat(state->pp_buf, "\n", PAGE_SIZE);
120 put_dev_sector(sect); 120 put_dev_sector(sect);
121 return 1; 121 return 1;
122} 122}
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9627ccffc1c4 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -54,8 +54,9 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
54 unsigned char *data; 54 unsigned char *data;
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 char tmp[64];
57 58
58 data = read_dev_sector(bdev, 0, &sect); 59 data = read_part_sector(state, 0, &sect);
59 if (!data) 60 if (!data)
60 return -1; 61 return -1;
61 62
@@ -68,12 +69,13 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 69 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 70 put_dev_sector(sect);
70 71
71 data = read_dev_sector(bdev, i, &sect); 72 data = read_part_sector(state, i, &sect);
72 if (!data) 73 if (!data)
73 return -1; 74 return -1;
74 75
75 slices -= 1; /* last slice is the whole disk */ 76 slices -= 1; /* last slice is the whole disk */
76 printk("sysV68: %s(s%u)", state->name, slices); 77 snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
78 strlcat(state->pp_buf, tmp, PAGE_SIZE);
77 slice = (struct slice *)data; 79 slice = (struct slice *)data;
78 for (i = 0; i < slices; i++, slice++) { 80 for (i = 0; i < slices; i++, slice++) {
79 if (slot == state->limit) 81 if (slot == state->limit)
@@ -82,11 +84,12 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
82 put_partition(state, slot, 84 put_partition(state, slot,
83 be32_to_cpu(slice->blkoff), 85 be32_to_cpu(slice->blkoff),
84 be32_to_cpu(slice->nblocks)); 86 be32_to_cpu(slice->nblocks));
85 printk("(s%u)", i); 87 snprintf(tmp, sizeof(tmp), "(s%u)", i);
88 strlcat(state->pp_buf, tmp, PAGE_SIZE);
86 } 89 }
87 slot++; 90 slot++;
88 } 91 }
89 printk("\n"); 92 strlcat(state->pp_buf, "\n", PAGE_SIZE);
90 put_dev_sector(sect); 93 put_dev_sector(sect);
91 return 1; 94 return 1;
92} 95}
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..8dbaf9f77a99 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
@@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
39 label->pt_part[i].pi_blkoff, 39 label->pt_part[i].pi_blkoff,
40 label->pt_part[i].pi_nblocks); 40 label->pt_part[i].pi_nblocks);
41 put_dev_sector(sect); 41 put_dev_sector(sect);
42 printk ("\n"); 42 strlcat(state->pp_buf, "\n", PAGE_SIZE);
43 return 1; 43 return 1;
44 } else { 44 } else {
45 put_dev_sector(sect); 45 put_dev_sector(sect);
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..279eef96c51c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/log2.h>
 #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
@@ -18,11 +19,23 @@
 #include <linux/pagemap.h>
 #include <linux/audit.h>
 #include <linux/syscalls.h>
+#include <linux/fcntl.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
 /*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-size
+ */
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
+
+/*
  * We use a start+len construction, which provides full use of the
  * allocated memory.
  * -- Florian Coosmann (FGC)
@@ -222,6 +235,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
 
 	return kmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_map);
 
 /**
  * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -241,6 +255,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 	} else
 		kunmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_unmap);
 
 /**
  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -271,6 +286,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 	return 1;
 }
+EXPORT_SYMBOL(generic_pipe_buf_steal);
 
 /**
  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -286,6 +302,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
 	page_cache_get(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_get);
 
 /**
  * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -301,6 +318,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 {
 	return 0;
 }
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
 
 /**
  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -315,6 +333,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 {
 	page_cache_release(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_release);
 
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
@@ -390,7 +409,7 @@ redo:
 			if (!buf->len) {
 				buf->ops = NULL;
 				ops->release(pipe, buf);
-				curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+				curbuf = (curbuf + 1) & (pipe->buffers - 1);
 				pipe->curbuf = curbuf;
 				pipe->nrbufs = --bufs;
 				do_wakeup = 1;
@@ -472,7 +491,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 	if (pipe->nrbufs && chars != 0) {
 		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
-							(PIPE_BUFFERS-1);
+							(pipe->buffers - 1);
 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
 		const struct pipe_buf_operations *ops = buf->ops;
 		int offset = buf->offset + buf->len;
@@ -518,8 +537,8 @@ redo1:
 			break;
 		}
 		bufs = pipe->nrbufs;
-		if (bufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
+		if (bufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct page *page = pipe->tmp_page;
 			char *src;
@@ -580,7 +599,7 @@ redo2:
 			if (!total_len)
 				break;
 		}
-		if (bufs < PIPE_BUFFERS)
+		if (bufs < pipe->buffers)
 			continue;
 		if (filp->f_flags & O_NONBLOCK) {
 			if (!ret)
@@ -640,7 +659,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		nrbufs = pipe->nrbufs;
 		while (--nrbufs >= 0) {
 			count += pipe->bufs[buf].len;
-			buf = (buf+1) & (PIPE_BUFFERS-1);
+			buf = (buf+1) & (pipe->buffers - 1);
 		}
 		mutex_unlock(&inode->i_mutex);
 
@@ -671,7 +690,7 @@ pipe_poll(struct file *filp, poll_table *wait)
 	}
 
 	if (filp->f_mode & FMODE_WRITE) {
-		mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
 		/*
 		 * Most Unices do not set POLLERR for FIFOs but on Linux they
 		 * behave exactly like pipes for poll().
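Every masking site above moves from the compile-time PIPE_BUFFERS to the per-pipe pipe->buffers; the `index & (buffers - 1)` wrap is only equivalent to a modulo when the capacity is a power of two, which round_pipe_size() (added further down) guarantees. A standalone illustration of that invariant:

#include <assert.h>
#include <stdio.h>

/* Advance a ring index by masking; correct only for power-of-two capacities. */
static unsigned int next_slot(unsigned int cur, unsigned int buffers)
{
	assert((buffers & (buffers - 1)) == 0);
	return (cur + 1) & (buffers - 1);
}

int main(void)
{
	unsigned int slot = 15;

	slot = next_slot(slot, 16);	/* wraps from 15 back to 0 */
	printf("%u\n", slot);
	return 0;
}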
@@ -877,25 +896,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 	if (pipe) {
-		init_waitqueue_head(&pipe->wait);
-		pipe->r_counter = pipe->w_counter = 1;
-		pipe->inode = inode;
+		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+		if (pipe->bufs) {
+			init_waitqueue_head(&pipe->wait);
+			pipe->r_counter = pipe->w_counter = 1;
+			pipe->inode = inode;
+			pipe->buffers = PIPE_DEF_BUFFERS;
+			return pipe;
+		}
+		kfree(pipe);
 	}
 
-	return pipe;
+	return NULL;
 }
 
 void __free_pipe_info(struct pipe_inode_info *pipe)
 {
 	int i;
 
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 		if (buf->ops)
 			buf->ops->release(pipe, buf);
 	}
 	if (pipe->tmp_page)
 		__free_page(pipe->tmp_page);
+	kfree(pipe->bufs);
 	kfree(pipe);
 }
@@ -1094,6 +1120,126 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
 }
 
 /*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
+{
+	struct pipe_buffer *bufs;
+
+	/*
+	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
+	 * expect a lot of shrink+grow operations, just free and allocate
+	 * again like we would do for growing. If the pipe currently
+	 * contains more buffers than arg, then return busy.
+	 */
+	if (nr_pages < pipe->nrbufs)
+		return -EBUSY;
+
+	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+	if (unlikely(!bufs))
+		return -ENOMEM;
+
+	/*
+	 * The pipe array wraps around, so just start the new one at zero
+	 * and adjust the indexes.
+	 */
+	if (pipe->nrbufs) {
+		unsigned int tail;
+		unsigned int head;
+
+		tail = pipe->curbuf + pipe->nrbufs;
+		if (tail < pipe->buffers)
+			tail = 0;
+		else
+			tail &= (pipe->buffers - 1);
+
+		head = pipe->nrbufs - tail;
+		if (head)
+			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
+		if (tail)
+			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
+	}
+
+	pipe->curbuf = 0;
+	kfree(pipe->bufs);
+	pipe->bufs = bufs;
+	pipe->buffers = nr_pages;
+	return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+	unsigned long nr_pages;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+		 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	pipe_max_size = round_pipe_size(pipe_max_size);
+	return ret;
+}
+
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pipe_inode_info *pipe;
+	long ret;
+
+	pipe = file->f_path.dentry->d_inode->i_pipe;
+	if (!pipe)
+		return -EBADF;
+
+	mutex_lock(&pipe->inode->i_mutex);
+
+	switch (cmd) {
+	case F_SETPIPE_SZ: {
+		unsigned int size, nr_pages;
+
+		size = round_pipe_size(arg);
+		nr_pages = size >> PAGE_SHIFT;
+
+		ret = -EINVAL;
+		if (!nr_pages)
+			goto out;
+
+		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+			ret = -EPERM;
+			goto out;
+		}
+		ret = pipe_set_size(pipe, nr_pages);
+		break;
+	}
+	case F_GETPIPE_SZ:
+		ret = pipe->buffers * PAGE_SIZE;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+out:
+	mutex_unlock(&pipe->inode->i_mutex);
+	return ret;
+}
+
+/*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
  * any operations on the root directory. However, we need a non-trivial
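pipe_fcntl() above is the kernel half of the new F_SETPIPE_SZ/F_GETPIPE_SZ commands: the request is rounded up to a power-of-two number of pages, unprivileged callers are capped at pipe_max_size (tunable via /proc/sys/fs/pipe-max-size through pipe_proc_fn), and shrinking below the data currently queued fails with EBUSY. A minimal userspace sketch, assuming a glibc/header combination that exposes the two constants (error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	long size;

	if (pipe(fds) < 0)
		return 1;

	/* Ask for 1 MiB; the kernel rounds to pages and enforces pipe_max_size
	 * for callers without CAP_SYS_RESOURCE. */
	if (fcntl(fds[1], F_SETPIPE_SZ, 1048576) < 0)
		perror("F_SETPIPE_SZ");

	size = fcntl(fds[1], F_GETPIPE_SZ);	/* pipe->buffers * PAGE_SIZE */
	printf("pipe capacity: %ld bytes\n", size);

	close(fds[0]);
	close(fds[1]);
	return 0;
}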
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a83149..8066b8dd748f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
 	return 0;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 void change_mnt_propagation(struct vfsmount *mnt, int type)
 {
 	if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
 		prev_src_mnt = child;
 	}
 out:
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
  * other mounts its parent propagates to.
  * Check if any of these mounts that **do not have submounts**
  * have more references than 'refcnt'. If so return busy.
+ *
+ * vfsmount lock must be held for read or write
  */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
 * collect all mounts that receive propagation from the mount in @list,
 * and return these additional mounts in the same list.
 * @list: the list of mounts to be unmounted.
+ *
+ * vfsmount lock must be held for write
 */
 int propagate_umount(struct list_head *list)
 {
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 11a7b5c68153..2758e2afc518 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the Linux proc filesystem routines.
 #
 
-obj-$(CONFIG_PROC_FS) += proc.o
+obj-y   += proc.o
 
 proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..fff6572676ae 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
-	cred = get_cred((struct cred *) __task_cred(p));
+	cred = get_task_cred(p);
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		shpending = p->signal->shared_pending.signal;
 		blocked = p->blocked;
 		collect_sigign_sigcatch(p, &ignored, &caught);
-		num_threads = atomic_read(&p->signal->count);
+		num_threads = get_nr_threads(p);
 		rcu_read_lock();  /* FIXME: is this correct? */
 		qsize = atomic_read(&__task_cred(p)->user->sigpending);
 		rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			tty_nr = new_encode_dev(tty_devnum(sig->tty));
 		}
 
-		num_threads = atomic_read(&sig->count);
+		num_threads = get_nr_threads(task);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
 		cmin_flt = sig->cmin_flt;
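Both task_sig() and do_task_stat() stop reading signal->count directly and use the shared get_nr_threads() helper (whose private duplicate is deleted from fs/proc/base.c below). Its presumed definition in <linux/sched.h> in this series is a plain field read:

/* assumed shape of the core helper; its definition is not part of this diff */
static inline int get_nr_threads(struct task_struct *tsk)
{
	return tsk->signal->nr_threads;
}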
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8418fcc0a6ab..8e4addaa5424 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -63,6 +63,7 @@
 #include <linux/namei.h>
 #include <linux/mnt_namespace.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
@@ -148,43 +149,31 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
 	return count;
 }
 
-static int get_fs_path(struct task_struct *task, struct path *path, bool root)
+static int get_task_root(struct task_struct *task, struct path *root)
 {
-	struct fs_struct *fs;
 	int result = -ENOENT;
 
 	task_lock(task);
-	fs = task->fs;
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = root ? fs->root : fs->pwd;
-		path_get(path);
-		read_unlock(&fs->lock);
+	if (task->fs) {
+		get_fs_root(task->fs, root);
 		result = 0;
 	}
 	task_unlock(task);
 	return result;
 }
 
-static int get_nr_threads(struct task_struct *tsk)
-{
-	unsigned long flags;
-	int count = 0;
-
-	if (lock_task_sighand(tsk, &flags)) {
-		count = atomic_read(&tsk->signal->count);
-		unlock_task_sighand(tsk, &flags);
-	}
-	return count;
-}
-
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 0);
+		task_lock(task);
+		if (task->fs) {
+			get_fs_pwd(task->fs, path);
+			result = 0;
+		}
+		task_unlock(task);
 		put_task_struct(task);
 	}
 	return result;
@@ -196,7 +185,7 @@ static int proc_root_link(struct inode *inode, struct path *path)
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 1);
+		result = get_task_root(task, path);
 		put_task_struct(task);
 	}
 	return result;
@@ -439,17 +428,14 @@ static const struct file_operations proc_lstats_operations = {
 
 #endif
 
-/* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points = 0;
-	struct timespec uptime;
 
-	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = badness(task, uptime.tv_sec);
+		points = oom_badness(task, NULL, NULL,
+					totalram_pages + total_swap_pages);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -573,9 +559,19 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
-	return error;
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct inode_operations proc_def_inode_operations = {
@@ -601,7 +597,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 		get_mnt_ns(ns);
 	}
 	rcu_read_unlock();
-	if (ns && get_fs_path(task, &root, 1) == 0)
+	if (ns && get_task_root(task, &root) == 0)
 		ret = 0;
 	put_task_struct(task);
 }
@@ -730,6 +726,7 @@ out_no_task:
 
 static const struct file_operations proc_info_file_operations = {
 	.read		= proc_info_read,
+	.llseek		= generic_file_llseek,
 };
 
 static int proc_single_show(struct seq_file *m, void *v)
@@ -987,6 +984,7 @@ out_no_task:
 
 static const struct file_operations proc_environ_operations = {
 	.read		= environ_read,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1049,8 +1047,24 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EACCES;
 	}
 
+	/*
+	 * Warn that /proc/pid/oom_adj is deprecated, see
+	 * Documentation/feature-removal-schedule.txt.
+	 */
+	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, "
+			"please use /proc/%d/oom_score_adj instead.\n",
+			current->comm, task_pid_nr(current),
+			task_pid_nr(task), task_pid_nr(task));
 	task->signal->oom_adj = oom_adjust;
-
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (task->signal->oom_adj == OOM_ADJUST_MAX)
+		task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
+	else
+		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
+								-OOM_DISABLE;
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 
@@ -1060,6 +1074,83 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 static const struct file_operations proc_oom_adjust_operations = {
 	.read		= oom_adjust_read,
 	.write		= oom_adjust_write,
+	.llseek		= generic_file_llseek,
+};
+
+static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_score_adj = OOM_SCORE_ADJ_MIN;
+	unsigned long flags;
+	size_t len;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		oom_score_adj = task->signal->oom_score_adj;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	unsigned long flags;
+	long oom_score_adj;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
+	if (err)
+		return -EINVAL;
+	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			oom_score_adj > OOM_SCORE_ADJ_MAX)
+		return -EINVAL;
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	if (!lock_task_sighand(task, &flags)) {
+		put_task_struct(task);
+		return -ESRCH;
+	}
+	if (oom_score_adj < task->signal->oom_score_adj &&
+			!capable(CAP_SYS_RESOURCE)) {
+		unlock_task_sighand(task, &flags);
+		put_task_struct(task);
+		return -EACCES;
+	}
+
+	task->signal->oom_score_adj = oom_score_adj;
+	/*
+	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
+	 * always attainable.
+	 */
+	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		task->signal->oom_adj = OOM_DISABLE;
+	else
+		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
+							OOM_SCORE_ADJ_MAX;
+	unlock_task_sighand(task, &flags);
+	put_task_struct(task);
+	return count;
+}
+
+static const struct file_operations proc_oom_score_adj_operations = {
+	.read		= oom_score_adj_read,
+	.write		= oom_score_adj_write,
 };
 
 #ifdef CONFIG_AUDITSYSCALL
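The two write paths above keep the legacy oom_adj range (OOM_DISABLE..OOM_ADJUST_MAX, i.e. -17..15) and the new oom_score_adj range (-1000..1000) consistent by linear scaling, special-casing the top value because integer division alone cannot reach it. An illustrative standalone program using the kernel's constants:

#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MAX	1000

/* Forward mapping as in oom_adjust_write(): 8 -> 470, -17 -> -1000;
 * 15 would scale to only 882, hence the explicit maximum case. */
static int adj_to_score_adj(int oom_adj)
{
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;
	return oom_adj * OOM_SCORE_ADJ_MAX / -OOM_DISABLE;
}

int main(void)
{
	int adj;

	for (adj = OOM_DISABLE; adj <= OOM_ADJUST_MAX; adj++)
		printf("oom_adj %3d -> oom_score_adj %5d\n",
		       adj, adj_to_score_adj(adj));
	return 0;
}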
@@ -1131,6 +1222,7 @@ out_free_page:
 static const struct file_operations proc_loginuid_operations = {
 	.read		= proc_loginuid_read,
 	.write		= proc_loginuid_write,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
@@ -1151,6 +1243,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
 
 static const struct file_operations proc_sessionid_operations = {
 	.read		= proc_sessionid_read,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1202,6 +1295,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
 static const struct file_operations proc_fault_inject_operations = {
 	.read		= proc_fault_inject_read,
 	.write		= proc_fault_inject_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1432,7 +1526,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
 	if (!tmp)
 		return -ENOMEM;
 
-	pathname = d_path(path, tmp, PAGE_SIZE);
+	pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
 	len = PTR_ERR(pathname);
 	if (IS_ERR(pathname))
 		goto out;
@@ -1943,7 +2037,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 }
 
 static const struct file_operations proc_fdinfo_file_operations = {
-	.open		= nonseekable_open,
+	.open           = nonseekable_open,
 	.read		= proc_fdinfo_read,
 };
 
@@ -2227,6 +2321,7 @@ out_no_task:
 static const struct file_operations proc_pid_attr_operations = {
 	.read		= proc_pid_attr_read,
 	.write		= proc_pid_attr_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct pid_entry attr_dir_stuff[] = {
@@ -2347,6 +2442,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 static const struct file_operations proc_coredump_filter_operations = {
 	.read		= proc_coredump_filter_read,
 	.write		= proc_coredump_filter_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -2436,7 +2532,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-EINVAL);
+	struct dentry *error;
 
 	/* Allocate the inode */
 	error = ERR_PTR(-ENOMEM);
@@ -2579,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	INF("auxv",       S_IRUSR, proc_pid_auxv),
 	ONE("status",     S_IRUGO, proc_pid_status),
 	ONE("personality", S_IRUSR, proc_pid_personality),
-	INF("limits",     S_IRUSR, proc_pid_limits),
+	INF("limits",     S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
@@ -2629,6 +2725,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 	INF("oom_score",  S_IRUGO, proc_oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
@@ -2786,7 +2883,7 @@ out:
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct dentry *result = ERR_PTR(-ENOENT);
+	struct dentry *result;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
@@ -2914,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	INF("auxv",      S_IRUSR, proc_pid_auxv),
 	ONE("status",    S_IRUGO, proc_pid_status),
 	ONE("personality", S_IRUSR, proc_pid_personality),
-	INF("limits",    S_IRUSR, proc_pid_limits),
+	INF("limits",    S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
@@ -2963,6 +3060,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid", S_IRUSR, proc_sessionid_operations),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..dd29f0337661 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -12,6 +12,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
@@ -258,17 +259,22 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 
 	error = inode_change_ok(inode, iattr);
 	if (error)
-		goto out;
+		return error;
 
-	error = inode_setattr(inode, iattr);
-	if (error)
-		goto out;
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
 
 	de->uid = inode->i_uid;
 	de->gid = inode->i_gid;
 	de->mode = inode->i_mode;
-out:
-	return error;
+	return 0;
 }
 
 static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
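This is the same inode_setattr() removal pattern applied to proc_setattr() and proc_sys_setattr() elsewhere in this merge: validate, truncate explicitly when ATTR_SIZE actually changes, then copy the remaining attributes and dirty the inode. The resulting skeleton, extracted for reference:

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}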
@@ -343,21 +349,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000		reserved
- * 00000001-00000fff	static entries	(goners)
- *      001		root-ino
- *
- * 00001000-00001fff	unused
- * 0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
- * 80000000-efffffff	unused
- * f0000000-ffffffff	dynamic entries
- *
- * Goal:
- *	Once we split the thing into several virtual filesystems,
- *	we will get rid of magical ranges (and this comment, BTW).
  */
 static unsigned int get_inode_number(void)
 {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d35b23238fb1..9c2b5f484879 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -25,11 +25,12 @@
 
 #include "internal.h"
 
-static void proc_delete_inode(struct inode *inode)
+static void proc_evict_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
 
 	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 
 	/* Stop tracking associated processes */
 	put_pid(PROC_I(inode)->pid);
@@ -40,7 +41,6 @@ static void proc_delete_inode(struct inode *inode)
 		pde_put(de);
 	if (PROC_I(inode)->sysctl)
 		sysctl_head_put(PROC_I(inode)->sysctl);
-	clear_inode(inode);
 }
 
 struct vfsmount *proc_mnt;
@@ -91,7 +91,7 @@ static const struct super_operations proc_sops = {
 	.alloc_inode	= proc_alloc_inode,
 	.destroy_inode	= proc_destroy_inode,
 	.drop_inode	= generic_delete_inode,
-	.delete_inode	= proc_delete_inode,
+	.evict_inode	= proc_evict_inode,
 	.statfs		= simple_statfs,
 };
 
@@ -214,8 +214,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 {
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	long rv = -ENOTTY;
-	long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
-	int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
+	long (*ioctl)(struct file *, unsigned int, unsigned long);
 
 	spin_lock(&pde->pde_unload_lock);
 	if (!pde->proc_fops) {
@@ -223,19 +222,11 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 		return rv;
 	}
 	pde->pde_users++;
-	unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
-	ioctl = pde->proc_fops->ioctl;
+	ioctl = pde->proc_fops->unlocked_ioctl;
 	spin_unlock(&pde->pde_unload_lock);
 
-	if (unlocked_ioctl) {
-		rv = unlocked_ioctl(file, cmd, arg);
-		if (rv == -ENOIOCTLCMD)
-			rv = -EINVAL;
-	} else if (ioctl) {
-		lock_kernel();
-		rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
-		unlock_kernel();
-	}
+	if (ioctl)
+		rv = ioctl(file, cmd, arg);
 
 	pde_users_dec(pde);
 	return rv;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 19979a2ce272..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
 static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -587,7 +588,7 @@ static struct kcore_list kcore_text;
  */
 static void __init proc_kcore_text_init(void)
 {
-	kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
 }
 #else
 static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index cfe90a48a6e8..bd4b5a740ff1 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
 	.release	= kmsg_release,
+	.llseek		= generic_file_llseek,
 };
 
 static int __init proc_kmsg_init(void)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 180cf5a0bd67..3b8b45660331 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page)
 	u |= kpf_copy_bit(k, KPF_HWPOISON,	PG_hwpoison);
 #endif
 
-#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
 	u |= kpf_copy_bit(k, KPF_UNCACHED,	PG_uncached);
 #endif
 
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index ce94801f48ca..d9396a4fc7ff 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -209,6 +209,9 @@ void proc_device_tree_add_node(struct device_node *np,
 	for (pp = np->properties; pp != NULL; pp = pp->next) {
 		p = pp->name;
 
+		if (strchr(p, '/'))
+			continue;
+
 		if (duplicate_name(de, p))
 			p = fixup_name(np, de, p);
 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 6ff9981f0a18..5be436ea088e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -329,10 +329,19 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
 
-	return error;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
 	if (err)
 		return;
 	proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
-	err = PTR_ERR(proc_mnt);
 	if (IS_ERR(proc_mnt)) {
 		unregister_filesystem(&proc_fs_type);
 		return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..1dbca4e8cc16 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,6 +210,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	int flags = vma->vm_flags;
 	unsigned long ino = 0;
 	unsigned long long pgoff = 0;
+	unsigned long start;
 	dev_t dev = 0;
 	int len;
 
@@ -220,8 +221,14 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
 	}
 
+	/* We don't show the stack guard page in /proc/maps */
+	start = vma->vm_start;
+	if (vma->vm_flags & VM_GROWSDOWN)
+		if (!vma_stack_continue(vma->vm_prev, vma->vm_start))
+			start += PAGE_SIZE;
+
 	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
-			vma->vm_start,
+			start,
 			vma->vm_end,
 			flags & VM_READ ? 'r' : '-',
 			flags & VM_WRITE ? 'w' : '-',
@@ -356,13 +363,13 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			mss->referenced += PAGE_SIZE;
 		mapcount = page_mapcount(page);
 		if (mapcount >= 2) {
-			if (pte_dirty(ptent))
+			if (pte_dirty(ptent) || PageDirty(page))
 				mss->shared_dirty += PAGE_SIZE;
 			else
 				mss->shared_clean += PAGE_SIZE;
 			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
 		} else {
-			if (pte_dirty(ptent))
+			if (pte_dirty(ptent) || PageDirty(page))
 				mss->private_dirty += PAGE_SIZE;
 			else
 				mss->private_clean += PAGE_SIZE;
@@ -634,6 +641,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
 static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 {
 	u64 pme = 0;
@@ -664,6 +672,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 
 	return err;
 }
+#endif /* HUGETLB_PAGE */
 
 /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +742,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
+#ifdef CONFIG_HUGETLB_PAGE
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
+#endif
 	pagemap_walk.mm = mm;
 	pagemap_walk.private = &pm;
 
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 46d4b5d72bd3..cb6306e63843 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -122,11 +122,20 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	return size;
 }
 
+static void pad_len_spaces(struct seq_file *m, int len)
+{
+	len = 25 + sizeof(void*) * 6 - len;
+	if (len < 1)
+		len = 1;
+	seq_printf(m, "%*c", len, ' ');
+}
+
 /*
  * display a single VMA to a sequenced file
  */
 static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
@@ -155,11 +164,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
-		len = 25 + sizeof(void *) * 6 - len;
-		if (len < 1)
-			len = 1;
-		seq_printf(m, "%*c", len, ' ');
+		pad_len_spaces(m, len);
 		seq_path(m, &file->f_path, "");
+	} else if (mm) {
+		if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+			pad_len_spaces(m, len);
+			seq_puts(m, "[stack]");
+		}
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9fbc99ec799a..2367fb3f70bc 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -163,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 
 static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
+	.llseek		= default_llseek,
 };
 
 static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
 
 const struct file_operations qnx4_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 277575ddc05c..16829722be93 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -320,10 +320,19 @@ static int qnx4_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host);
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				qnx4_get_block,
 				&qnx4_inode->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 788b5802a7ce..aad1316a977f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -82,7 +82,7 @@
 
 /*
  * There are three quota SMP locks. dq_list_lock protects all lists with quotas
- * and quota formats, dqstats structure containing statistics about the lists
+ * and quota formats.
  * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
@@ -132,7 +132,25 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 
+void __quota_error(struct super_block *sb, const char *func,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	if (printk_ratelimit()) {
+		va_start(args, fmt);
+		printk(KERN_ERR "Quota error (device %s): %s: ",
+		       sb->s_id, func);
+		vprintk(fmt, args);
+		printk("\n");
+		va_end(args);
+	}
+}
+EXPORT_SYMBOL(__quota_error);
+
+#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
 static char *quotatypes[] = INITQFNAMES;
+#endif
 static struct quota_format_type *quota_formats;	/* List of registered formats */
 static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
 
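__quota_error() takes the function name as an explicit argument, while the call sites further down pass only (sb, fmt, ...); presumably a companion macro supplies __func__. A hedged sketch of what that macro must look like (its header is not part of this diff):

#define quota_error(sb, fmt, args...) \
	__quota_error((sb), __func__, fmt , ## args)

/* typical call site, as in dqput() below: */
quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
	    quotatypes[dquot->dq_type], dquot->dq_id);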
@@ -273,7 +291,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
 static inline void put_dquot_last(struct dquot *dquot)
 {
 	list_add_tail(&dquot->dq_free, &free_dquots);
-	dqstats.free_dquots++;
+	dqstats_inc(DQST_FREE_DQUOTS);
 }
 
 static inline void remove_free_dquot(struct dquot *dquot)
@@ -281,7 +299,7 @@ static inline void remove_free_dquot(struct dquot *dquot)
 	if (list_empty(&dquot->dq_free))
 		return;
 	list_del_init(&dquot->dq_free);
-	dqstats.free_dquots--;
+	dqstats_dec(DQST_FREE_DQUOTS);
 }
 
 static inline void put_inuse(struct dquot *dquot)
@@ -289,12 +307,12 @@ static inline void put_inuse(struct dquot *dquot)
 	/* We add to the back of inuse list so we don't have to restart
 	 * when traversing this list and we block */
 	list_add_tail(&dquot->dq_inuse, &inuse_list);
-	dqstats.allocated_dquots++;
+	dqstats_inc(DQST_ALLOC_DQUOTS);
 }
 
 static inline void remove_inuse(struct dquot *dquot)
 {
-	dqstats.allocated_dquots--;
+	dqstats_dec(DQST_ALLOC_DQUOTS);
 	list_del(&dquot->dq_inuse);
 }
 /*
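The dqstats.* counter fields become per-cpu counters indexed by DQST_* constants; the shrink_dqcache_memory() hunk below reads one back with percpu_counter_read_positive(). The presumed accessors behind these conversions (defined outside this diff) reduce to:

/* assumed shape of the helpers; not shown in this diff */
static inline void dqstats_inc(unsigned int type)
{
	percpu_counter_inc(&dqstats.counter[type]);
}

static inline void dqstats_dec(unsigned int type)
{
	percpu_counter_dec(&dqstats.counter[type]);
}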
@@ -317,14 +335,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
 	return dquot->dq_sb->dq_op->mark_dirty(dquot);
 }
 
+/* Mark dquot dirty in an atomic manner, and return its old dirty flag state */
 int dquot_mark_dquot_dirty(struct dquot *dquot)
 {
+	int ret = 1;
+
+	/* If quota is dirty already, we don't have to acquire dq_list_lock */
+	if (test_bit(DQ_MOD_B, &dquot->dq_flags))
+		return 1;
+
 	spin_lock(&dq_list_lock);
-	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
+	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
 		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
 				info[dquot->dq_type].dqi_dirty_list);
+		ret = 0;
+	}
 	spin_unlock(&dq_list_lock);
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 
@@ -550,8 +577,8 @@ int dquot_scan_active(struct super_block *sb,
 			continue;
 		/* Now we have active dquot so we can just increase use count */
 		atomic_inc(&dquot->dq_count);
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_LOOKUPS);
 		dqput(old_dquot);
 		old_dquot = dquot;
 		ret = fn(dquot, priv);
@@ -569,7 +596,7 @@ out:
 }
 EXPORT_SYMBOL(dquot_scan_active);
 
-int vfs_quota_sync(struct super_block *sb, int type, int wait)
+int dquot_quota_sync(struct super_block *sb, int type, int wait)
 {
 	struct list_head *dirty;
 	struct dquot *dquot;
@@ -596,8 +623,8 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
 			 * holding reference so we can safely just increase
 			 * use count */
 			atomic_inc(&dquot->dq_count);
-			dqstats.lookups++;
 			spin_unlock(&dq_list_lock);
+			dqstats_inc(DQST_LOOKUPS);
 			sb->dq_op->write_dquot(dquot);
 			dqput(dquot);
 			spin_lock(&dq_list_lock);
@@ -609,9 +636,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
 		if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
 		    && info_dirty(&dqopt->info[cnt]))
 			sb->dq_op->write_info(sb, cnt);
-	spin_lock(&dq_list_lock);
-	dqstats.syncs++;
-	spin_unlock(&dq_list_lock);
+	dqstats_inc(DQST_SYNCS);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
 	if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
@@ -643,7 +668,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
 
 	return 0;
 }
-EXPORT_SYMBOL(vfs_quota_sync);
+EXPORT_SYMBOL(dquot_quota_sync);
 
 /* Free unused dquots from cache */
 static void prune_dqcache(int count)
@@ -667,15 +692,16 @@ static void prune_dqcache(int count)
  * This is called from kswapd when we think we need some
  * more memory
  */
-
-static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		spin_lock(&dq_list_lock);
 		prune_dqcache(nr);
 		spin_unlock(&dq_list_lock);
 	}
-	return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure;
+	return ((unsigned)
+		percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
+		/100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dqcache_shrinker = {
@@ -695,18 +721,12 @@ void dqput(struct dquot *dquot)
 		return;
 #ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
-		printk("VFS: dqput: trying to free free dquot\n");
-		printk("VFS: device %s, dquot of %s %d\n",
-			dquot->dq_sb->s_id,
-			quotatypes[dquot->dq_type],
-			dquot->dq_id);
+		quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
+			    quotatypes[dquot->dq_type], dquot->dq_id);
 		BUG();
 	}
 #endif
-
-	spin_lock(&dq_list_lock);
-	dqstats.drops++;
-	spin_unlock(&dq_list_lock);
+	dqstats_inc(DQST_DROPS);
 we_slept:
 	spin_lock(&dq_list_lock);
 	if (atomic_read(&dquot->dq_count) > 1) {
@@ -725,9 +745,9 @@ we_slept:
 			/* Commit dquot before releasing */
 			ret = dquot->dq_sb->dq_op->write_dquot(dquot);
 			if (ret < 0) {
-				printk(KERN_ERR "VFS: cannot write quota structure on "
-					"device %s (error %d). Quota may get out of "
-					"sync!\n", dquot->dq_sb->s_id, ret);
+				quota_error(dquot->dq_sb, "Can't write quota structure"
+					    " (error %d). Quota may get out of sync!",
+					    ret);
 				/*
 				 * We clear dirty bit anyway, so that we avoid
 				 * infinite loop here
@@ -823,15 +843,15 @@ we_slept:
 		put_inuse(dquot);
 		/* hash it first so it can be found */
 		insert_dquot_hash(dquot);
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_LOOKUPS);
 	} else {
 		if (!atomic_read(&dquot->dq_count))
 			remove_free_dquot(dquot);
 		atomic_inc(&dquot->dq_count);
-		dqstats.cache_hits++;
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_CACHE_HITS);
+		dqstats_inc(DQST_LOOKUPS);
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is
 	 * already finished or it will be canceled due to dq_count > 1 test */
@@ -878,7 +898,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -907,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 #ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
-		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
-			" was turned on thus quota information is probably "
+		quota_error(sb, "Writes happened before quota was turned on "
+			"thus quota information is probably inconsistent. "
912 "inconsistent. Please run quotacheck(8).\n", sb->s_id); 932 "Please run quotacheck(8)");
913 } 933 }
914#endif 934#endif
915} 935}
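From here on, printk(KERN_ERR "VFS: ...") messages are converted to quota_error(), which prefixes the device and function name automatically. A sketch of the helper pair introduced alongside these conversions, assuming the __quota_error()/quota_error() shape from this series (ratelimiting and exact formatting may differ):

#define quota_error(sb, fmt, args...) \
	__quota_error((sb), __func__, fmt, ##args)

void __quota_error(struct super_block *sb, const char *func,
		   const char *fmt, ...)
{
	va_list args;

	if (printk_ratelimit()) {
		va_start(args, fmt);
		printk(KERN_ERR "Quota error (device %s): %s: ",
		       sb->s_id, func);
		vprintk(fmt, args);	/* caller's format string */
		printk("\n");
		va_end(args);
	}
}

This is why the converted call sites drop the "VFS:" prefix, the sb->s_id argument, and the trailing newline.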
@@ -940,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
940 if (dqput_blocks(dquot)) { 960 if (dqput_blocks(dquot)) {
941#ifdef CONFIG_QUOTA_DEBUG 961#ifdef CONFIG_QUOTA_DEBUG
942 if (atomic_read(&dquot->dq_count) != 1) 962 if (atomic_read(&dquot->dq_count) != 1)
943 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); 963 quota_error(inode->i_sb, "Adding dquot with "
964 "dq_count %d to dispose list",
965 atomic_read(&dquot->dq_count));
944#endif 966#endif
945 spin_lock(&dq_list_lock); 967 spin_lock(&dq_list_lock);
946 /* As dquot must have currently users it can't be on 968 /* As dquot must have currently users it can't be on
@@ -979,6 +1001,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
979 struct list_head *tofree_head) 1001 struct list_head *tofree_head)
980{ 1002{
981 struct inode *inode; 1003 struct inode *inode;
1004 int reserved = 0;
982 1005
983 spin_lock(&inode_lock); 1006 spin_lock(&inode_lock);
984 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1007 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -988,10 +1011,20 @@ static void remove_dquot_ref(struct super_block *sb, int type,
988 * only quota pointers and these have separate locking 1011 * only quota pointers and these have separate locking
989 * (dqptr_sem). 1012 * (dqptr_sem).
990 */ 1013 */
991 if (!IS_NOQUOTA(inode)) 1014 if (!IS_NOQUOTA(inode)) {
1015 if (unlikely(inode_get_rsv_space(inode) > 0))
1016 reserved = 1;
992 remove_inode_dquot_ref(inode, type, tofree_head); 1017 remove_inode_dquot_ref(inode, type, tofree_head);
1018 }
993 } 1019 }
994 spin_unlock(&inode_lock); 1020 spin_unlock(&inode_lock);
1021#ifdef CONFIG_QUOTA_DEBUG
1022 if (reserved) {
1023 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
1024 " was disabled thus quota information is probably "
1025 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
1026 }
1027#endif
995} 1028}
996 1029
997/* Gather all references from inodes and drop them */ 1030/* Gather all references from inodes and drop them */
@@ -1297,6 +1330,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1297 return QUOTA_NL_NOWARN; 1330 return QUOTA_NL_NOWARN;
1298} 1331}
1299 1332
1333static int dquot_active(const struct inode *inode)
1334{
1335 struct super_block *sb = inode->i_sb;
1336
1337 if (IS_NOQUOTA(inode))
1338 return 0;
1339 return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
1340}
1341
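dquot_active() above deliberately uses bitwise operators: sb_any_quota_loaded() and sb_any_quota_suspended() return per-type flag masks, not booleans, so the expression is nonzero exactly when some quota type is loaded and not suspended. A worked example (bit values illustrative only):

/*
 * loaded	= 0b11	(user and group quota loaded)
 * suspended	= 0b10	(group quota suspended by a ro remount)
 * loaded & ~suspended = 0b01 -> nonzero: user quota still active
 */

The subsequent hunks then shrink every `!sb_any_quota_active(...) || IS_NOQUOTA(...)` test to `!dquot_active(inode)`.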
1300/* 1342/*
1301 * Initialize quota pointers in inode 1343 * Initialize quota pointers in inode
1302 * 1344 *
@@ -1316,7 +1358,7 @@ static void __dquot_initialize(struct inode *inode, int type)
1316 1358
1317 /* First test before acquiring mutex - solves deadlocks when we 1359 /* First test before acquiring mutex - solves deadlocks when we
1318 * re-enter the quota code and are already holding the mutex */ 1360 * re-enter the quota code and are already holding the mutex */
1319 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1361 if (!dquot_active(inode))
1320 return; 1362 return;
1321 1363
1322 /* First get references to structures we might need. */ 1364 /* First get references to structures we might need. */
@@ -1488,17 +1530,19 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1488/* 1530/*
1489 * This operation can block, but only after everything is updated 1531 * This operation can block, but only after everything is updated
1490 */ 1532 */
1491int __dquot_alloc_space(struct inode *inode, qsize_t number, 1533int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1492 int warn, int reserve)
1493{ 1534{
1494 int cnt, ret = 0; 1535 int cnt, ret = 0;
1495 char warntype[MAXQUOTAS]; 1536 char warntype[MAXQUOTAS];
1537 int warn = flags & DQUOT_SPACE_WARN;
1538 int reserve = flags & DQUOT_SPACE_RESERVE;
1539 int nofail = flags & DQUOT_SPACE_NOFAIL;
1496 1540
1497 /* 1541 /*
1498 * First test before acquiring mutex - solves deadlocks when we 1542 * First test before acquiring mutex - solves deadlocks when we
1499 * re-enter the quota code and are already holding the mutex 1543 * re-enter the quota code and are already holding the mutex
1500 */ 1544 */
1501 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1545 if (!dquot_active(inode)) {
1502 inode_incr_space(inode, number, reserve); 1546 inode_incr_space(inode, number, reserve);
1503 goto out; 1547 goto out;
1504 } 1548 }
@@ -1513,7 +1557,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1513 continue; 1557 continue;
1514 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1558 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1515 warntype+cnt); 1559 warntype+cnt);
1516 if (ret) { 1560 if (ret && !nofail) {
1517 spin_unlock(&dq_data_lock); 1561 spin_unlock(&dq_data_lock);
1518 goto out_flush_warn; 1562 goto out_flush_warn;
1519 } 1563 }
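The warn/reserve boolean pair of __dquot_alloc_space() collapses into a single flags word, and the new DQUOT_SPACE_NOFAIL bit lets a journaling filesystem push an allocation through even when check_bdq() reports over-quota (the `ret && !nofail` test above). A sketch of the flag values and two of the inline wrappers, as defined in quotaops.h in this series:

#define DQUOT_SPACE_WARN	0x1
#define DQUOT_SPACE_RESERVE	0x2
#define DQUOT_SPACE_NOFAIL	0x4

static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
{
	return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN);
}

static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
{
	return __dquot_alloc_space(inode, nr << inode->i_blkbits,
				   DQUOT_SPACE_WARN | DQUOT_SPACE_RESERVE);
}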
@@ -1550,7 +1594,7 @@ int dquot_alloc_inode(const struct inode *inode)
1550 1594
1551 /* First test before acquiring mutex - solves deadlocks when we 1595 /* First test before acquiring mutex - solves deadlocks when we
1552 * re-enter the quota code and are already holding the mutex */ 1596 * re-enter the quota code and are already holding the mutex */
1553 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1597 if (!dquot_active(inode))
1554 return 0; 1598 return 0;
1555 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1599 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1556 warntype[cnt] = QUOTA_NL_NOWARN; 1600 warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1587,7 +1631,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1587{ 1631{
1588 int cnt; 1632 int cnt;
1589 1633
1590 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1634 if (!dquot_active(inode)) {
1591 inode_claim_rsv_space(inode, number); 1635 inode_claim_rsv_space(inode, number);
1592 return 0; 1636 return 0;
1593 } 1637 }
@@ -1612,14 +1656,15 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1612/* 1656/*
1613 * This operation can block, but only after everything is updated 1657 * This operation can block, but only after everything is updated
1614 */ 1658 */
1615void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1659void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1616{ 1660{
1617 unsigned int cnt; 1661 unsigned int cnt;
1618 char warntype[MAXQUOTAS]; 1662 char warntype[MAXQUOTAS];
1663 int reserve = flags & DQUOT_SPACE_RESERVE;
1619 1664
1620 /* First test before acquiring mutex - solves deadlocks when we 1665 /* First test before acquiring mutex - solves deadlocks when we
1621 * re-enter the quota code and are already holding the mutex */ 1666 * re-enter the quota code and are already holding the mutex */
1622 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1667 if (!dquot_active(inode)) {
1623 inode_decr_space(inode, number, reserve); 1668 inode_decr_space(inode, number, reserve);
1624 return; 1669 return;
1625 } 1670 }
@@ -1657,7 +1702,7 @@ void dquot_free_inode(const struct inode *inode)
1657 1702
1658 /* First test before acquiring mutex - solves deadlocks when we 1703 /* First test before acquiring mutex - solves deadlocks when we
1659 * re-enter the quota code and are already holding the mutex */ 1704 * re-enter the quota code and are already holding the mutex */
1660 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1705 if (!dquot_active(inode))
1661 return; 1706 return;
1662 1707
1663 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1708 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1677,16 +1722,19 @@ EXPORT_SYMBOL(dquot_free_inode);
1677 1722
1678/* 1723/*
1679 * Transfer the number of inodes and blocks from one diskquota to another. 1724 * Transfer the number of inodes and blocks from one diskquota to another.
1725 * On success, dquot references in transfer_to are consumed and references
1726 * to original dquots that need to be released are placed there. On failure,
1727 * references are kept untouched.
1680 * 1728 *
1681 * This operation can block, but only after everything is updated 1729 * This operation can block, but only after everything is updated
1682 * A transaction must be started when entering this function. 1730 * A transaction must be started when entering this function.
1731 *
1683 */ 1732 */
1684static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) 1733int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1685{ 1734{
1686 qsize_t space, cur_space; 1735 qsize_t space, cur_space;
1687 qsize_t rsv_space = 0; 1736 qsize_t rsv_space = 0;
1688 struct dquot *transfer_from[MAXQUOTAS]; 1737 struct dquot *transfer_from[MAXQUOTAS] = {};
1689 struct dquot *transfer_to[MAXQUOTAS];
1690 int cnt, ret = 0; 1738 int cnt, ret = 0;
1691 char warntype_to[MAXQUOTAS]; 1739 char warntype_to[MAXQUOTAS];
1692 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1740 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1696,19 +1744,12 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1696 if (IS_NOQUOTA(inode)) 1744 if (IS_NOQUOTA(inode))
1697 return 0; 1745 return 0;
1698 /* Initialize the arrays */ 1746 /* Initialize the arrays */
1699 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1747 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1700 transfer_from[cnt] = NULL;
1701 transfer_to[cnt] = NULL;
1702 warntype_to[cnt] = QUOTA_NL_NOWARN; 1748 warntype_to[cnt] = QUOTA_NL_NOWARN;
1703 }
1704 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1705 if (mask & (1 << cnt))
1706 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1707 }
1708 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1749 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1709 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1750 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1710 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1751 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1711 goto put_all; 1752 return 0;
1712 } 1753 }
1713 spin_lock(&dq_data_lock); 1754 spin_lock(&dq_data_lock);
1714 cur_space = inode_get_bytes(inode); 1755 cur_space = inode_get_bytes(inode);
@@ -1760,47 +1801,41 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1760 1801
1761 mark_all_dquot_dirty(transfer_from); 1802 mark_all_dquot_dirty(transfer_from);
1762 mark_all_dquot_dirty(transfer_to); 1803 mark_all_dquot_dirty(transfer_to);
1763 /* The reference we got is transferred to the inode */ 1804 /* Pass back references to put */
1764 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1805 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1765 transfer_to[cnt] = NULL; 1806 transfer_to[cnt] = transfer_from[cnt];
1766warn_put_all: 1807warn:
1767 flush_warnings(transfer_to, warntype_to); 1808 flush_warnings(transfer_to, warntype_to);
1768 flush_warnings(transfer_from, warntype_from_inodes); 1809 flush_warnings(transfer_from, warntype_from_inodes);
1769 flush_warnings(transfer_from, warntype_from_space); 1810 flush_warnings(transfer_from, warntype_from_space);
1770put_all:
1771 dqput_all(transfer_from);
1772 dqput_all(transfer_to);
1773 return ret; 1811 return ret;
1774over_quota: 1812over_quota:
1775 spin_unlock(&dq_data_lock); 1813 spin_unlock(&dq_data_lock);
1776 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1814 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1777 /* Clear dquot pointers we don't want to dqput() */ 1815 goto warn;
1778 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1779 transfer_from[cnt] = NULL;
1780 goto warn_put_all;
1781} 1816}
1817EXPORT_SYMBOL(__dquot_transfer);
1782 1818
1783/* Wrapper for transferring ownership of an inode for uid/gid only 1819/* Wrapper for transferring ownership of an inode for uid/gid only
1784 * Called from FSXXX_setattr() 1820 * Called from FSXXX_setattr()
1785 */ 1821 */
1786int dquot_transfer(struct inode *inode, struct iattr *iattr) 1822int dquot_transfer(struct inode *inode, struct iattr *iattr)
1787{ 1823{
1788 qid_t chid[MAXQUOTAS]; 1824 struct dquot *transfer_to[MAXQUOTAS] = {};
1789 unsigned long mask = 0; 1825 struct super_block *sb = inode->i_sb;
1826 int ret;
1790 1827
1791 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { 1828 if (!dquot_active(inode))
1792 mask |= 1 << USRQUOTA; 1829 return 0;
1793 chid[USRQUOTA] = iattr->ia_uid; 1830
1794 } 1831 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1795 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { 1832 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1796 mask |= 1 << GRPQUOTA; 1833 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1797 chid[GRPQUOTA] = iattr->ia_gid; 1834 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
1798 } 1835
1799 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1836 ret = __dquot_transfer(inode, transfer_to);
1800 dquot_initialize(inode); 1837 dqput_all(transfer_to);
1801 return __dquot_transfer(inode, chid, mask); 1838 return ret;
1802 }
1803 return 0;
1804} 1839}
1805EXPORT_SYMBOL(dquot_transfer); 1840EXPORT_SYMBOL(dquot_transfer);
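The rewritten transfer path changes reference ownership: the caller now fills transfer_to[] with dqget() references, and on success __dquot_transfer() swaps in the old dquots that must be dropped, so one release loop covers both outcomes. A usage sketch modeled on the new dquot_transfer() above (new_uid is a placeholder):

struct dquot *transfer_to[MAXQUOTAS] = {};
int cnt, ret;

transfer_to[USRQUOTA] = dqget(sb, new_uid, USRQUOTA);
ret = __dquot_transfer(inode, transfer_to);
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
	dqput(transfer_to[cnt]);	/* dqput(NULL) is a no-op */

Exporting __dquot_transfer() with this convention lets callers such as ocfs2 obtain their dquot references under their own locks before performing the transfer.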
1806 1841
@@ -1831,6 +1866,7 @@ const struct dquot_operations dquot_operations = {
1831 .alloc_dquot = dquot_alloc, 1866 .alloc_dquot = dquot_alloc,
1832 .destroy_dquot = dquot_destroy, 1867 .destroy_dquot = dquot_destroy,
1833}; 1868};
1869EXPORT_SYMBOL(dquot_operations);
1834 1870
1835/* 1871/*
1836 * Generic helper for ->open on filesystems supporting disk quotas. 1872 * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1849,7 +1885,7 @@ EXPORT_SYMBOL(dquot_file_open);
1849/* 1885/*
1850 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1886 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1851 */ 1887 */
1852int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1888int dquot_disable(struct super_block *sb, int type, unsigned int flags)
1853{ 1889{
1854 int cnt, ret = 0; 1890 int cnt, ret = 0;
1855 struct quota_info *dqopt = sb_dqopt(sb); 1891 struct quota_info *dqopt = sb_dqopt(sb);
@@ -1956,7 +1992,7 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1956 truncate_inode_pages(&toputinode[cnt]->i_data, 1992 truncate_inode_pages(&toputinode[cnt]->i_data,
1957 0); 1993 0);
1958 mutex_unlock(&toputinode[cnt]->i_mutex); 1994 mutex_unlock(&toputinode[cnt]->i_mutex);
1959 mark_inode_dirty(toputinode[cnt]); 1995 mark_inode_dirty_sync(toputinode[cnt]);
1960 } 1996 }
1961 mutex_unlock(&dqopt->dqonoff_mutex); 1997 mutex_unlock(&dqopt->dqonoff_mutex);
1962 } 1998 }
@@ -1979,14 +2015,15 @@ put_inodes:
1979 } 2015 }
1980 return ret; 2016 return ret;
1981} 2017}
1982EXPORT_SYMBOL(vfs_quota_disable); 2018EXPORT_SYMBOL(dquot_disable);
1983 2019
1984int vfs_quota_off(struct super_block *sb, int type, int remount) 2020int dquot_quota_off(struct super_block *sb, int type)
1985{ 2021{
1986 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : 2022 return dquot_disable(sb, type,
1987 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); 2023 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
1988} 2024}
1989EXPORT_SYMBOL(vfs_quota_off); 2025EXPORT_SYMBOL(dquot_quota_off);
2026
1990/* 2027/*
1991 * Turn quotas on on a device 2028 * Turn quotas on on a device
1992 */ 2029 */
@@ -2104,36 +2141,43 @@ out_fmt:
2104} 2141}
2105 2142
2106/* Reenable quotas on remount RW */ 2143/* Reenable quotas on remount RW */
2107static int vfs_quota_on_remount(struct super_block *sb, int type) 2144int dquot_resume(struct super_block *sb, int type)
2108{ 2145{
2109 struct quota_info *dqopt = sb_dqopt(sb); 2146 struct quota_info *dqopt = sb_dqopt(sb);
2110 struct inode *inode; 2147 struct inode *inode;
2111 int ret; 2148 int ret = 0, cnt;
2112 unsigned int flags; 2149 unsigned int flags;
2113 2150
2114 mutex_lock(&dqopt->dqonoff_mutex); 2151 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2115 if (!sb_has_quota_suspended(sb, type)) { 2152 if (type != -1 && cnt != type)
2153 continue;
2154
2155 mutex_lock(&dqopt->dqonoff_mutex);
2156 if (!sb_has_quota_suspended(sb, cnt)) {
2157 mutex_unlock(&dqopt->dqonoff_mutex);
2158 continue;
2159 }
2160 inode = dqopt->files[cnt];
2161 dqopt->files[cnt] = NULL;
2162 spin_lock(&dq_state_lock);
2163 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2164 DQUOT_LIMITS_ENABLED,
2165 cnt);
2166 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
2167 spin_unlock(&dq_state_lock);
2116 mutex_unlock(&dqopt->dqonoff_mutex); 2168 mutex_unlock(&dqopt->dqonoff_mutex);
2117 return 0;
2118 }
2119 inode = dqopt->files[type];
2120 dqopt->files[type] = NULL;
2121 spin_lock(&dq_state_lock);
2122 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2123 DQUOT_LIMITS_ENABLED, type);
2124 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
2125 spin_unlock(&dq_state_lock);
2126 mutex_unlock(&dqopt->dqonoff_mutex);
2127 2169
2128 flags = dquot_generic_flag(flags, type); 2170 flags = dquot_generic_flag(flags, cnt);
2129 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, 2171 ret = vfs_load_quota_inode(inode, cnt,
2130 flags); 2172 dqopt->info[cnt].dqi_fmt_id, flags);
2131 iput(inode); 2173 iput(inode);
2174 }
2132 2175
2133 return ret; 2176 return ret;
2134} 2177}
2178EXPORT_SYMBOL(dquot_resume);
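dquot_resume() replaces the file-static vfs_quota_on_remount(): it iterates over the quota types itself (type == -1 means every suspended type), which is why the vfs_dq_quota_on_remount() wrapper is deleted further down. A hedged sketch of the intended remount pairing, assuming the companion dquot_suspend() helper from the same series:

/* in a filesystem's ->remount_fs, simplified */
if (*flags & MS_RDONLY)
	dquot_suspend(sb, -1);	/* going read-only: park quotas */
else
	dquot_resume(sb, -1);	/* going read-write: re-enable them */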
2135 2179
2136int vfs_quota_on_path(struct super_block *sb, int type, int format_id, 2180int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2137 struct path *path) 2181 struct path *path)
2138{ 2182{
2139 int error = security_quota_on(path->dentry); 2183 int error = security_quota_on(path->dentry);
@@ -2148,40 +2192,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
2148 DQUOT_LIMITS_ENABLED); 2192 DQUOT_LIMITS_ENABLED);
2149 return error; 2193 return error;
2150} 2194}
2151EXPORT_SYMBOL(vfs_quota_on_path); 2195EXPORT_SYMBOL(dquot_quota_on_path);
2152 2196
2153int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 2197int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2154 int remount)
2155{ 2198{
2156 struct path path; 2199 struct path path;
2157 int error; 2200 int error;
2158 2201
2159 if (remount)
2160 return vfs_quota_on_remount(sb, type);
2161
2162 error = kern_path(name, LOOKUP_FOLLOW, &path); 2202 error = kern_path(name, LOOKUP_FOLLOW, &path);
2163 if (!error) { 2203 if (!error) {
2164 error = vfs_quota_on_path(sb, type, format_id, &path); 2204 error = dquot_quota_on_path(sb, type, format_id, &path);
2165 path_put(&path); 2205 path_put(&path);
2166 } 2206 }
2167 return error; 2207 return error;
2168} 2208}
2169EXPORT_SYMBOL(vfs_quota_on); 2209EXPORT_SYMBOL(dquot_quota_on);
2170 2210
2171/* 2211/*
2172 * More powerful function for turning on quotas allowing setting 2212 * More powerful function for turning on quotas allowing setting
2173 * of individual quota flags 2213 * of individual quota flags
2174 */ 2214 */
2175int vfs_quota_enable(struct inode *inode, int type, int format_id, 2215int dquot_enable(struct inode *inode, int type, int format_id,
2176 unsigned int flags) 2216 unsigned int flags)
2177{ 2217{
2178 int ret = 0; 2218 int ret = 0;
2179 struct super_block *sb = inode->i_sb; 2219 struct super_block *sb = inode->i_sb;
2180 struct quota_info *dqopt = sb_dqopt(sb); 2220 struct quota_info *dqopt = sb_dqopt(sb);
2181 2221
2182 /* Just unsuspend quotas? */ 2222 /* Just unsuspend quotas? */
2183 if (flags & DQUOT_SUSPENDED) 2223 BUG_ON(flags & DQUOT_SUSPENDED);
2184 return vfs_quota_on_remount(sb, type); 2224
2185 if (!flags) 2225 if (!flags)
2186 return 0; 2226 return 0;
2187 /* Just updating flags needed? */ 2227 /* Just updating flags needed? */
@@ -2213,13 +2253,13 @@ out_lock:
2213load_quota: 2253load_quota:
2214 return vfs_load_quota_inode(inode, type, format_id, flags); 2254 return vfs_load_quota_inode(inode, type, format_id, flags);
2215} 2255}
2216EXPORT_SYMBOL(vfs_quota_enable); 2256EXPORT_SYMBOL(dquot_enable);
2217 2257
2218/* 2258/*
2219 * This function is used when filesystem needs to initialize quotas 2259 * This function is used when filesystem needs to initialize quotas
2220 * during mount time. 2260 * during mount time.
2221 */ 2261 */
2222int vfs_quota_on_mount(struct super_block *sb, char *qf_name, 2262int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
2223 int format_id, int type) 2263 int format_id, int type)
2224{ 2264{
2225 struct dentry *dentry; 2265 struct dentry *dentry;
@@ -2245,24 +2285,7 @@ out:
2245 dput(dentry); 2285 dput(dentry);
2246 return error; 2286 return error;
2247} 2287}
2248EXPORT_SYMBOL(vfs_quota_on_mount); 2288EXPORT_SYMBOL(dquot_quota_on_mount);
2249
2250/* Wrapper to turn on quotas when remounting rw */
2251int vfs_dq_quota_on_remount(struct super_block *sb)
2252{
2253 int cnt;
2254 int ret = 0, err;
2255
2256 if (!sb->s_qcop || !sb->s_qcop->quota_on)
2257 return -ENOSYS;
2258 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2259 err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
2260 if (err < 0 && !ret)
2261 ret = err;
2262 }
2263 return ret;
2264}
2265EXPORT_SYMBOL(vfs_dq_quota_on_remount);
2266 2289
2267static inline qsize_t qbtos(qsize_t blocks) 2290static inline qsize_t qbtos(qsize_t blocks)
2268{ 2291{
@@ -2275,25 +2298,30 @@ static inline qsize_t stoqb(qsize_t space)
2275} 2298}
2276 2299
2277/* Generic routine for getting common part of quota structure */ 2300/* Generic routine for getting common part of quota structure */
2278static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2301static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2279{ 2302{
2280 struct mem_dqblk *dm = &dquot->dq_dqb; 2303 struct mem_dqblk *dm = &dquot->dq_dqb;
2281 2304
2305 memset(di, 0, sizeof(*di));
2306 di->d_version = FS_DQUOT_VERSION;
2307 di->d_flags = dquot->dq_type == USRQUOTA ?
2308 FS_USER_QUOTA : FS_GROUP_QUOTA;
2309 di->d_id = dquot->dq_id;
2310
2282 spin_lock(&dq_data_lock); 2311 spin_lock(&dq_data_lock);
2283 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); 2312 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
2284 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); 2313 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
2285 di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; 2314 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2286 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2315 di->d_ino_softlimit = dm->dqb_isoftlimit;
2287 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2316 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
2288 di->dqb_curinodes = dm->dqb_curinodes; 2317 di->d_icount = dm->dqb_curinodes;
2289 di->dqb_btime = dm->dqb_btime; 2318 di->d_btimer = dm->dqb_btime;
2290 di->dqb_itime = dm->dqb_itime; 2319 di->d_itimer = dm->dqb_itime;
2291 di->dqb_valid = QIF_ALL;
2292 spin_unlock(&dq_data_lock); 2320 spin_unlock(&dq_data_lock);
2293} 2321}
2294 2322
2295int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2323int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
2296 struct if_dqblk *di) 2324 struct fs_disk_quota *di)
2297{ 2325{
2298 struct dquot *dquot; 2326 struct dquot *dquot;
2299 2327
@@ -2305,53 +2333,72 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2305 2333
2306 return 0; 2334 return 0;
2307} 2335}
2308EXPORT_SYMBOL(vfs_get_dqblk); 2336EXPORT_SYMBOL(dquot_get_dqblk);
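From this point the generic ->get_dqblk/->set_dqblk operations speak struct fs_disk_quota, the format the XFS-style Q_XGETQUOTA interface already used; do_get_dqblk() therefore fills the identification fields (d_version, d_flags, d_id) itself. The old if_dqblk interface survives as a conversion layer in quota.c, shown later in this patch:

/* Q_GETQUOTA flow after this change, per the quota.c hunks below */
struct fs_disk_quota fdq;
struct if_dqblk idq;
int ret;

ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
if (!ret)
	copy_to_if_dqblk(&idq, &fdq);	/* then copy_to_user() of idq */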
2337
2338#define VFS_FS_DQ_MASK \
2339 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
2340 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
2341 FS_DQ_BTIMER | FS_DQ_ITIMER)
2309 2342
2310/* Generic routine for setting common part of quota structure */ 2343/* Generic routine for setting common part of quota structure */
2311static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) 2344static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2312{ 2345{
2313 struct mem_dqblk *dm = &dquot->dq_dqb; 2346 struct mem_dqblk *dm = &dquot->dq_dqb;
2314 int check_blim = 0, check_ilim = 0; 2347 int check_blim = 0, check_ilim = 0;
2315 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 2348 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
2316 2349
2317 if ((di->dqb_valid & QIF_BLIMITS && 2350 if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
2318 (di->dqb_bhardlimit > dqi->dqi_maxblimit || 2351 return -EINVAL;
2319 di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || 2352
2320 (di->dqb_valid & QIF_ILIMITS && 2353 if (((di->d_fieldmask & FS_DQ_BSOFT) &&
2321 (di->dqb_ihardlimit > dqi->dqi_maxilimit || 2354 (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
2322 di->dqb_isoftlimit > dqi->dqi_maxilimit))) 2355 ((di->d_fieldmask & FS_DQ_BHARD) &&
2356 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
2357 ((di->d_fieldmask & FS_DQ_ISOFT) &&
2358 (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
2359 ((di->d_fieldmask & FS_DQ_IHARD) &&
2360 (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
2323 return -ERANGE; 2361 return -ERANGE;
2324 2362
2325 spin_lock(&dq_data_lock); 2363 spin_lock(&dq_data_lock);
2326 if (di->dqb_valid & QIF_SPACE) { 2364 if (di->d_fieldmask & FS_DQ_BCOUNT) {
2327 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; 2365 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
2328 check_blim = 1; 2366 check_blim = 1;
2329 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2367 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2330 } 2368 }
2331 if (di->dqb_valid & QIF_BLIMITS) { 2369
2332 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); 2370 if (di->d_fieldmask & FS_DQ_BSOFT)
2333 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); 2371 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
2372 if (di->d_fieldmask & FS_DQ_BHARD)
2373 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
2374 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
2334 check_blim = 1; 2375 check_blim = 1;
2335 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2376 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2336 } 2377 }
2337 if (di->dqb_valid & QIF_INODES) { 2378
2338 dm->dqb_curinodes = di->dqb_curinodes; 2379 if (di->d_fieldmask & FS_DQ_ICOUNT) {
2380 dm->dqb_curinodes = di->d_icount;
2339 check_ilim = 1; 2381 check_ilim = 1;
2340 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2382 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2341 } 2383 }
2342 if (di->dqb_valid & QIF_ILIMITS) { 2384
2343 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2385 if (di->d_fieldmask & FS_DQ_ISOFT)
2344 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2386 dm->dqb_isoftlimit = di->d_ino_softlimit;
2387 if (di->d_fieldmask & FS_DQ_IHARD)
2388 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2389 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
2345 check_ilim = 1; 2390 check_ilim = 1;
2346 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2391 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2347 } 2392 }
2348 if (di->dqb_valid & QIF_BTIME) { 2393
2349 dm->dqb_btime = di->dqb_btime; 2394 if (di->d_fieldmask & FS_DQ_BTIMER) {
2395 dm->dqb_btime = di->d_btimer;
2350 check_blim = 1; 2396 check_blim = 1;
2351 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2397 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2352 } 2398 }
2353 if (di->dqb_valid & QIF_ITIME) { 2399
2354 dm->dqb_itime = di->dqb_itime; 2400 if (di->d_fieldmask & FS_DQ_ITIMER) {
2401 dm->dqb_itime = di->d_itimer;
2355 check_ilim = 1; 2402 check_ilim = 1;
2356 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2403 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2357 } 2404 }
@@ -2361,7 +2408,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2361 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2408 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2362 dm->dqb_btime = 0; 2409 dm->dqb_btime = 0;
2363 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2410 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2364 } else if (!(di->dqb_valid & QIF_BTIME)) 2411 } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
2365 /* Set grace only if user hasn't provided his own... */ 2412 /* Set grace only if user hasn't provided his own... */
2366 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2413 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2367 } 2414 }
@@ -2370,7 +2417,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2370 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2417 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2371 dm->dqb_itime = 0; 2418 dm->dqb_itime = 0;
2372 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2419 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2373 } else if (!(di->dqb_valid & QIF_ITIME)) 2420 } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
2374 /* Set grace only if user hasn't provided his own... */ 2421 /* Set grace only if user hasn't provided his own... */
2375 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2422 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2376 } 2423 }
@@ -2385,8 +2432,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2385 return 0; 2432 return 0;
2386} 2433}
2387 2434
2388int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2435int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
2389 struct if_dqblk *di) 2436 struct fs_disk_quota *di)
2390{ 2437{
2391 struct dquot *dquot; 2438 struct dquot *dquot;
2392 int rc; 2439 int rc;
@@ -2401,10 +2448,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2401out: 2448out:
2402 return rc; 2449 return rc;
2403} 2450}
2404EXPORT_SYMBOL(vfs_set_dqblk); 2451EXPORT_SYMBOL(dquot_set_dqblk);
2405 2452
2406/* Generic routine for getting common part of quota file information */ 2453/* Generic routine for getting common part of quota file information */
2407int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2454int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2408{ 2455{
2409 struct mem_dqinfo *mi; 2456 struct mem_dqinfo *mi;
2410 2457
@@ -2423,10 +2470,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2423 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2470 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2424 return 0; 2471 return 0;
2425} 2472}
2426EXPORT_SYMBOL(vfs_get_dqinfo); 2473EXPORT_SYMBOL(dquot_get_dqinfo);
2427 2474
2428/* Generic routine for setting common part of quota file information */ 2475/* Generic routine for setting common part of quota file information */
2429int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2476int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2430{ 2477{
2431 struct mem_dqinfo *mi; 2478 struct mem_dqinfo *mi;
2432 int err = 0; 2479 int err = 0;
@@ -2453,74 +2500,86 @@ out:
2453 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2500 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2454 return err; 2501 return err;
2455} 2502}
2456EXPORT_SYMBOL(vfs_set_dqinfo); 2503EXPORT_SYMBOL(dquot_set_dqinfo);
2457 2504
2458const struct quotactl_ops vfs_quotactl_ops = { 2505const struct quotactl_ops dquot_quotactl_ops = {
2459 .quota_on = vfs_quota_on, 2506 .quota_on = dquot_quota_on,
2460 .quota_off = vfs_quota_off, 2507 .quota_off = dquot_quota_off,
2461 .quota_sync = vfs_quota_sync, 2508 .quota_sync = dquot_quota_sync,
2462 .get_info = vfs_get_dqinfo, 2509 .get_info = dquot_get_dqinfo,
2463 .set_info = vfs_set_dqinfo, 2510 .set_info = dquot_set_dqinfo,
2464 .get_dqblk = vfs_get_dqblk, 2511 .get_dqblk = dquot_get_dqblk,
2465 .set_dqblk = vfs_set_dqblk 2512 .set_dqblk = dquot_set_dqblk
2466}; 2513};
2514EXPORT_SYMBOL(dquot_quotactl_ops);
2515
2516static int do_proc_dqstats(struct ctl_table *table, int write,
2517 void __user *buffer, size_t *lenp, loff_t *ppos)
2518{
2519 unsigned int type = (int *)table->data - dqstats.stat;
2520
2521 /* Update global table */
2522 dqstats.stat[type] =
2523 percpu_counter_sum_positive(&dqstats.counter[type]);
2524 return proc_dointvec(table, write, buffer, lenp, ppos);
2525}
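do_proc_dqstats() recovers which statistic is being read purely from the ctl_table entry, then folds the per-CPU counter into the legacy int that proc_dointvec() exposes. A worked example of the pointer arithmetic:

/*
 * For the "cache_hits" entry below, .data == &dqstats.stat[DQST_CACHE_HITS],
 * so (int *)table->data - dqstats.stat == DQST_CACHE_HITS, and the handler
 * snapshots percpu_counter_sum_positive() into that slot before the read.
 */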
2467 2526
2468static ctl_table fs_dqstats_table[] = { 2527static ctl_table fs_dqstats_table[] = {
2469 { 2528 {
2470 .procname = "lookups", 2529 .procname = "lookups",
2471 .data = &dqstats.lookups, 2530 .data = &dqstats.stat[DQST_LOOKUPS],
2472 .maxlen = sizeof(int), 2531 .maxlen = sizeof(int),
2473 .mode = 0444, 2532 .mode = 0444,
2474 .proc_handler = proc_dointvec, 2533 .proc_handler = do_proc_dqstats,
2475 }, 2534 },
2476 { 2535 {
2477 .procname = "drops", 2536 .procname = "drops",
2478 .data = &dqstats.drops, 2537 .data = &dqstats.stat[DQST_DROPS],
2479 .maxlen = sizeof(int), 2538 .maxlen = sizeof(int),
2480 .mode = 0444, 2539 .mode = 0444,
2481 .proc_handler = proc_dointvec, 2540 .proc_handler = do_proc_dqstats,
2482 }, 2541 },
2483 { 2542 {
2484 .procname = "reads", 2543 .procname = "reads",
2485 .data = &dqstats.reads, 2544 .data = &dqstats.stat[DQST_READS],
2486 .maxlen = sizeof(int), 2545 .maxlen = sizeof(int),
2487 .mode = 0444, 2546 .mode = 0444,
2488 .proc_handler = proc_dointvec, 2547 .proc_handler = do_proc_dqstats,
2489 }, 2548 },
2490 { 2549 {
2491 .procname = "writes", 2550 .procname = "writes",
2492 .data = &dqstats.writes, 2551 .data = &dqstats.stat[DQST_WRITES],
2493 .maxlen = sizeof(int), 2552 .maxlen = sizeof(int),
2494 .mode = 0444, 2553 .mode = 0444,
2495 .proc_handler = proc_dointvec, 2554 .proc_handler = do_proc_dqstats,
2496 }, 2555 },
2497 { 2556 {
2498 .procname = "cache_hits", 2557 .procname = "cache_hits",
2499 .data = &dqstats.cache_hits, 2558 .data = &dqstats.stat[DQST_CACHE_HITS],
2500 .maxlen = sizeof(int), 2559 .maxlen = sizeof(int),
2501 .mode = 0444, 2560 .mode = 0444,
2502 .proc_handler = proc_dointvec, 2561 .proc_handler = do_proc_dqstats,
2503 }, 2562 },
2504 { 2563 {
2505 .procname = "allocated_dquots", 2564 .procname = "allocated_dquots",
2506 .data = &dqstats.allocated_dquots, 2565 .data = &dqstats.stat[DQST_ALLOC_DQUOTS],
2507 .maxlen = sizeof(int), 2566 .maxlen = sizeof(int),
2508 .mode = 0444, 2567 .mode = 0444,
2509 .proc_handler = proc_dointvec, 2568 .proc_handler = do_proc_dqstats,
2510 }, 2569 },
2511 { 2570 {
2512 .procname = "free_dquots", 2571 .procname = "free_dquots",
2513 .data = &dqstats.free_dquots, 2572 .data = &dqstats.stat[DQST_FREE_DQUOTS],
2514 .maxlen = sizeof(int), 2573 .maxlen = sizeof(int),
2515 .mode = 0444, 2574 .mode = 0444,
2516 .proc_handler = proc_dointvec, 2575 .proc_handler = do_proc_dqstats,
2517 }, 2576 },
2518 { 2577 {
2519 .procname = "syncs", 2578 .procname = "syncs",
2520 .data = &dqstats.syncs, 2579 .data = &dqstats.stat[DQST_SYNCS],
2521 .maxlen = sizeof(int), 2580 .maxlen = sizeof(int),
2522 .mode = 0444, 2581 .mode = 0444,
2523 .proc_handler = proc_dointvec, 2582 .proc_handler = do_proc_dqstats,
2524 }, 2583 },
2525#ifdef CONFIG_PRINT_QUOTA_WARNING 2584#ifdef CONFIG_PRINT_QUOTA_WARNING
2526 { 2585 {
@@ -2554,7 +2613,7 @@ static ctl_table sys_table[] = {
2554 2613
2555static int __init dquot_init(void) 2614static int __init dquot_init(void)
2556{ 2615{
2557 int i; 2616 int i, ret;
2558 unsigned long nr_hash, order; 2617 unsigned long nr_hash, order;
2559 2618
2560 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); 2619 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2572,6 +2631,12 @@ static int __init dquot_init(void)
2572 if (!dquot_hash) 2631 if (!dquot_hash)
2573 panic("Cannot create dquot hash table"); 2632 panic("Cannot create dquot hash table");
2574 2633
2634 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2635 ret = percpu_counter_init(&dqstats.counter[i], 0);
2636 if (ret)
2637 panic("Cannot create dquot stat counters");
2638 }
2639
2575 /* Find power-of-two hlist_heads which can fit into allocation */ 2640 /* Find power-of-two hlist_heads which can fit into allocation */
2576 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2641 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
2577 dq_hash_bits = 0; 2642 dq_hash_bits = 0;
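The new per-CPU counters must be set up before any quota activity; like the hash-table allocation next to it, a failure in dquot_init() simply panics. In code that can unwind, the same initialization would look roughly like this (a sketch mirroring the loop added above):

int i, err = 0;

for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
	err = percpu_counter_init(&dqstats.counter[i], 0);
	if (err)
		break;
}
if (err)			/* roll back the ones that succeeded */
	while (i-- > 0)
		percpu_counter_destroy(&dqstats.counter[i]);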
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95388f9b7356..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
45 return security_quotactl(cmd, type, id, sb); 45 return security_quotactl(cmd, type, id, sb);
46} 46}
47 47
48static void quota_sync_one(struct super_block *sb, void *arg)
49{
50 if (sb->s_qcop && sb->s_qcop->quota_sync)
51 sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
52}
53
48static int quota_sync_all(int type) 54static int quota_sync_all(int type)
49{ 55{
50 struct super_block *sb;
51 int ret; 56 int ret;
52 57
53 if (type >= MAXQUOTAS) 58 if (type >= MAXQUOTAS)
54 return -EINVAL; 59 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL); 60 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret) 61 if (!ret)
57 return ret; 62 iterate_supers(quota_sync_one, &type);
58 63 return ret;
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
74 }
75 spin_unlock(&sb_lock);
76
77 return 0;
78} 64}
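quota_sync_all() drops its hand-rolled walk of super_blocks, with the s_count reference games and the restart label, in favor of iterate_supers(), which encapsulates exactly that pattern. Roughly what the helper does for its callback, simplified from fs/super.c of this era (the real one also guards against the list mutating while sb_lock is dropped):

void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
	struct super_block *sb, *n;

	spin_lock(&sb_lock);
	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		sb->s_count++;
		spin_unlock(&sb_lock);
		down_read(&sb->s_umount);
		if (sb->s_root)		/* skip superblocks being torn down */
			f(sb, arg);
		up_read(&sb->s_umount);
		spin_lock(&sb_lock);
		__put_super(sb);
	}
	spin_unlock(&sb_lock);
}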
79 65
80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
@@ -87,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
87 if (IS_ERR(pathname)) 73 if (IS_ERR(pathname))
88 return PTR_ERR(pathname); 74 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on) 75 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
91 putname(pathname); 77 putname(pathname);
92 return ret; 78 return ret;
93} 79}
@@ -113,8 +99,6 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
113 struct if_dqinfo info; 99 struct if_dqinfo info;
114 int ret; 100 int ret;
115 101
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info) 102 if (!sb->s_qcop->get_info)
119 return -ENOSYS; 103 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info); 104 ret = sb->s_qcop->get_info(sb, type, &info);
@@ -129,43 +113,80 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
129 113
130 if (copy_from_user(&info, addr, sizeof(info))) 114 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT; 115 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info) 116 if (!sb->s_qcop->set_info)
135 return -ENOSYS; 117 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info); 118 return sb->s_qcop->set_info(sb, type, &info);
137} 119}
138 120
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
122{
123 dst->dqb_bhardlimit = src->d_blk_hardlimit;
124 dst->dqb_bsoftlimit = src->d_blk_softlimit;
125 dst->dqb_curspace = src->d_bcount;
126 dst->dqb_ihardlimit = src->d_ino_hardlimit;
127 dst->dqb_isoftlimit = src->d_ino_softlimit;
128 dst->dqb_curinodes = src->d_icount;
129 dst->dqb_btime = src->d_btimer;
130 dst->dqb_itime = src->d_itimer;
131 dst->dqb_valid = QIF_ALL;
132}
133
139static int quota_getquota(struct super_block *sb, int type, qid_t id, 134static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr) 135 void __user *addr)
141{ 136{
137 struct fs_disk_quota fdq;
142 struct if_dqblk idq; 138 struct if_dqblk idq;
143 int ret; 139 int ret;
144 140
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk) 141 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS; 142 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); 143 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
150 if (ret) 144 if (ret)
151 return ret; 145 return ret;
146 copy_to_if_dqblk(&idq, &fdq);
152 if (copy_to_user(addr, &idq, sizeof(idq))) 147 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT; 148 return -EFAULT;
154 return 0; 149 return 0;
155} 150}
156 151
152static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
153{
154 dst->d_blk_hardlimit = src->dqb_bhardlimit;
155 dst->d_blk_softlimit = src->dqb_bsoftlimit;
156 dst->d_bcount = src->dqb_curspace;
157 dst->d_ino_hardlimit = src->dqb_ihardlimit;
158 dst->d_ino_softlimit = src->dqb_isoftlimit;
159 dst->d_icount = src->dqb_curinodes;
160 dst->d_btimer = src->dqb_btime;
161 dst->d_itimer = src->dqb_itime;
162
163 dst->d_fieldmask = 0;
164 if (src->dqb_valid & QIF_BLIMITS)
165 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
166 if (src->dqb_valid & QIF_SPACE)
167 dst->d_fieldmask |= FS_DQ_BCOUNT;
168 if (src->dqb_valid & QIF_ILIMITS)
169 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
170 if (src->dqb_valid & QIF_INODES)
171 dst->d_fieldmask |= FS_DQ_ICOUNT;
172 if (src->dqb_valid & QIF_BTIME)
173 dst->d_fieldmask |= FS_DQ_BTIMER;
174 if (src->dqb_valid & QIF_ITIME)
175 dst->d_fieldmask |= FS_DQ_ITIMER;
176}
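copy_from_if_dqblk() maps the coarse QIF_* validity bits onto the finer FS_DQ_* field mask, with the limit bits splitting into separate soft/hard flags. A worked example:

/*
 * Q_SETQUOTA with dqb_valid = QIF_BLIMITS | QIF_ITIME becomes
 * d_fieldmask = FS_DQ_BSOFT | FS_DQ_BHARD | FS_DQ_ITIMER,
 * which do_set_dqblk() accepts since it is within VFS_FS_DQ_MASK.
 */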
177
157static int quota_setquota(struct super_block *sb, int type, qid_t id, 178static int quota_setquota(struct super_block *sb, int type, qid_t id,
158 void __user *addr) 179 void __user *addr)
159{ 180{
181 struct fs_disk_quota fdq;
160 struct if_dqblk idq; 182 struct if_dqblk idq;
161 183
162 if (copy_from_user(&idq, addr, sizeof(idq))) 184 if (copy_from_user(&idq, addr, sizeof(idq)))
163 return -EFAULT; 185 return -EFAULT;
164 if (!sb_has_quota_active(sb, type))
165 return -ESRCH;
166 if (!sb->s_qcop->set_dqblk) 186 if (!sb->s_qcop->set_dqblk)
167 return -ENOSYS; 187 return -ENOSYS;
168 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 188 copy_from_if_dqblk(&fdq, &idq);
189 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
169} 190}
170 191
171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 192static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
@@ -199,9 +220,9 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
199 220
200 if (copy_from_user(&fdq, addr, sizeof(fdq))) 221 if (copy_from_user(&fdq, addr, sizeof(fdq)))
201 return -EFAULT; 222 return -EFAULT;
202 if (!sb->s_qcop->set_xquota) 223 if (!sb->s_qcop->set_dqblk)
203 return -ENOSYS; 224 return -ENOSYS;
204 return sb->s_qcop->set_xquota(sb, type, id, &fdq); 225 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
205} 226}
206 227
207static int quota_getxquota(struct super_block *sb, int type, qid_t id, 228static int quota_getxquota(struct super_block *sb, int type, qid_t id,
@@ -210,9 +231,9 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
210 struct fs_disk_quota fdq; 231 struct fs_disk_quota fdq;
211 int ret; 232 int ret;
212 233
213 if (!sb->s_qcop->get_xquota) 234 if (!sb->s_qcop->get_dqblk)
214 return -ENOSYS; 235 return -ENOSYS;
215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); 236 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 237 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
217 return -EFAULT; 238 return -EFAULT;
218 return ret; 239 return ret;
@@ -239,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
239 case Q_QUOTAOFF: 260 case Q_QUOTAOFF:
240 if (!sb->s_qcop->quota_off) 261 if (!sb->s_qcop->quota_off)
241 return -ENOSYS; 262 return -ENOSYS;
242 return sb->s_qcop->quota_off(sb, type, 0); 263 return sb->s_qcop->quota_off(sb, type);
243 case Q_GETFMT: 264 case Q_GETFMT:
244 return quota_getfmt(sb, type, addr); 265 return quota_getfmt(sb, type, addr);
245 case Q_GETINFO: 266 case Q_GETINFO:
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index f81f4bcfb178..9e48874eabcc 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -60,9 +60,16 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) 60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
61{ 61{
62 struct super_block *sb = info->dqi_sb; 62 struct super_block *sb = info->dqi_sb;
63 ssize_t ret;
63 64
64 return sb->s_op->quota_write(sb, info->dqi_type, buf, 65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
65 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
67 if (ret != info->dqi_usable_bs) {
68 quota_error(sb, "dquota write failed");
69 if (ret >= 0)
70 ret = -EIO;
71 }
72 return ret;
66} 73}
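write_blk() now detects short writes itself and maps any non-negative shortfall to -EIO, so every caller can key off the sign of the return value alone, as the later hunks in this file do:

ret = write_blk(info, blk, buf);
if (ret < 0) {
	quota_error(dquot->dq_sb, "Can't write quota data block %u", blk);
	goto out_buf;
}

(qtree_write_dquot() keeps its own check because it writes dqi_entry_size bytes rather than a whole block, and reports -ENOSPC instead.)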
67 74
68/* Remove empty block from list and return it */ 75/* Remove empty block from list and return it */
@@ -152,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
152 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 159 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
153 /* No matter whether write succeeds block is out of list */ 160 /* No matter whether write succeeds block is out of list */
154 if (write_blk(info, blk, buf) < 0) 161 if (write_blk(info, blk, buf) < 0)
155 printk(KERN_ERR 162 quota_error(info->dqi_sb, "Can't write block (%u) "
156 "VFS: Can't write block (%u) with free entries.\n", 163 "with free entries", blk);
157 blk);
158 return 0; 164 return 0;
159out_buf: 165out_buf:
160 kfree(tmpbuf); 166 kfree(tmpbuf);
@@ -244,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
244 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 250 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
245 *err = remove_free_dqentry(info, buf, blk); 251 *err = remove_free_dqentry(info, buf, blk);
246 if (*err < 0) { 252 if (*err < 0) {
247 printk(KERN_ERR "VFS: find_free_dqentry(): Can't " 253 quota_error(dquot->dq_sb, "Can't remove block (%u) "
248 "remove block (%u) from entry free list.\n", 254 "from entry free list", blk);
249 blk);
250 goto out_buf; 255 goto out_buf;
251 } 256 }
252 } 257 }
@@ -260,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
260 } 265 }
261#ifdef __QUOTA_QT_PARANOIA 266#ifdef __QUOTA_QT_PARANOIA
262 if (i == qtree_dqstr_in_blk(info)) { 267 if (i == qtree_dqstr_in_blk(info)) {
263 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " 268 quota_error(dquot->dq_sb, "Data block full but it shouldn't");
264 "but it shouldn't.\n");
265 *err = -EIO; 269 *err = -EIO;
266 goto out_buf; 270 goto out_buf;
267 } 271 }
268#endif 272#endif
269 *err = write_blk(info, blk, buf); 273 *err = write_blk(info, blk, buf);
270 if (*err < 0) { 274 if (*err < 0) {
271 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " 275 quota_error(dquot->dq_sb, "Can't write quota data block %u",
272 "data block %u.\n", blk); 276 blk);
273 goto out_buf; 277 goto out_buf;
274 } 278 }
275 dquot->dq_off = (blk << info->dqi_blocksize_bits) + 279 dquot->dq_off = (blk << info->dqi_blocksize_bits) +
@@ -303,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
303 } else { 307 } else {
304 ret = read_blk(info, *treeblk, buf); 308 ret = read_blk(info, *treeblk, buf);
305 if (ret < 0) { 309 if (ret < 0) {
306 printk(KERN_ERR "VFS: Can't read tree quota block " 310 quota_error(dquot->dq_sb, "Can't read tree quota "
307 "%u.\n", *treeblk); 311 "block %u", *treeblk);
308 goto out_buf; 312 goto out_buf;
309 } 313 }
310 } 314 }
@@ -315,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
315 if (depth == info->dqi_qtree_depth - 1) { 319 if (depth == info->dqi_qtree_depth - 1) {
316#ifdef __QUOTA_QT_PARANOIA 320#ifdef __QUOTA_QT_PARANOIA
317 if (newblk) { 321 if (newblk) {
318 printk(KERN_ERR "VFS: Inserting already present quota " 322 quota_error(dquot->dq_sb, "Inserting already present "
319 "entry (block %u).\n", 323 "quota entry (block %u)",
320 le32_to_cpu(ref[get_index(info, 324 le32_to_cpu(ref[get_index(info,
321 dquot->dq_id, depth)])); 325 dquot->dq_id, depth)]));
322 ret = -EIO; 326 ret = -EIO;
323 goto out_buf; 327 goto out_buf;
@@ -365,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
365 if (!dquot->dq_off) { 369 if (!dquot->dq_off) {
366 ret = dq_insert_tree(info, dquot); 370 ret = dq_insert_tree(info, dquot);
367 if (ret < 0) { 371 if (ret < 0) {
368 printk(KERN_ERR "VFS: Error %zd occurred while " 372 quota_error(sb, "Error %zd occurred while creating "
369 "creating quota.\n", ret); 373 "quota", ret);
370 kfree(ddquot); 374 kfree(ddquot);
371 return ret; 375 return ret;
372 } 376 }
@@ -377,14 +381,13 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
377 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, 381 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
378 dquot->dq_off); 382 dquot->dq_off);
379 if (ret != info->dqi_entry_size) { 383 if (ret != info->dqi_entry_size) {
380 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 384 quota_error(sb, "dquota write failed");
381 sb->s_id);
382 if (ret >= 0) 385 if (ret >= 0)
383 ret = -ENOSPC; 386 ret = -ENOSPC;
384 } else { 387 } else {
385 ret = 0; 388 ret = 0;
386 } 389 }
387 dqstats.writes++; 390 dqstats_inc(DQST_WRITES);
388 kfree(ddquot); 391 kfree(ddquot);
389 392
390 return ret; 393 return ret;
@@ -402,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
402 if (!buf) 405 if (!buf)
403 return -ENOMEM; 406 return -ENOMEM;
404 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { 407 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
405 printk(KERN_ERR "VFS: Quota structure has offset to other " 408 quota_error(dquot->dq_sb, "Quota structure has offset to "
406 "block (%u) than it should (%u).\n", blk, 409 "other block (%u) than it should (%u)", blk,
407 (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); 410 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
408 goto out_buf; 411 goto out_buf;
409 } 412 }
410 ret = read_blk(info, blk, buf); 413 ret = read_blk(info, blk, buf);
411 if (ret < 0) { 414 if (ret < 0) {
412 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); 415 quota_error(dquot->dq_sb, "Can't read quota data block %u",
416 blk);
413 goto out_buf; 417 goto out_buf;
414 } 418 }
415 dh = (struct qt_disk_dqdbheader *)buf; 419 dh = (struct qt_disk_dqdbheader *)buf;
@@ -419,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
419 if (ret >= 0) 423 if (ret >= 0)
420 ret = put_free_dqblk(info, buf, blk); 424 ret = put_free_dqblk(info, buf, blk);
421 if (ret < 0) { 425 if (ret < 0) {
422 printk(KERN_ERR "VFS: Can't move quota data block (%u) " 426 quota_error(dquot->dq_sb, "Can't move quota data block "
423 "to free list.\n", blk); 427 "(%u) to free list", blk);
424 goto out_buf; 428 goto out_buf;
425 } 429 }
426 } else { 430 } else {
@@ -432,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
432 /* Insert will write block itself */ 436 /* Insert will write block itself */
433 ret = insert_free_dqentry(info, buf, blk); 437 ret = insert_free_dqentry(info, buf, blk);
434 if (ret < 0) { 438 if (ret < 0) {
435 printk(KERN_ERR "VFS: Can't insert quota data " 439 quota_error(dquot->dq_sb, "Can't insert quota "
436 "block (%u) to free entry list.\n", blk); 440 "data block (%u) to free entry list", blk);
437 goto out_buf; 441 goto out_buf;
438 } 442 }
439 } else { 443 } else {
440 ret = write_blk(info, blk, buf); 444 ret = write_blk(info, blk, buf);
441 if (ret < 0) { 445 if (ret < 0) {
442 printk(KERN_ERR "VFS: Can't write quota data " 446 quota_error(dquot->dq_sb, "Can't write quota "
443 "block %u\n", blk); 447 "data block %u", blk);
444 goto out_buf; 448 goto out_buf;
445 } 449 }
446 } 450 }
@@ -464,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
464 return -ENOMEM; 468 return -ENOMEM;
465 ret = read_blk(info, *blk, buf); 469 ret = read_blk(info, *blk, buf);
466 if (ret < 0) { 470 if (ret < 0) {
467 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); 471 quota_error(dquot->dq_sb, "Can't read quota data "
472 "block %u", blk);
468 goto out_buf; 473 goto out_buf;
469 } 474 }
470 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -488,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
488 } else { 493 } else {
489 ret = write_blk(info, *blk, buf); 494 ret = write_blk(info, *blk, buf);
490 if (ret < 0) 495 if (ret < 0)
491 printk(KERN_ERR "VFS: Can't write quota tree " 496 quota_error(dquot->dq_sb, "Can't write quota "
492 "block %u.\n", *blk); 497 "tree block %u", blk);
493 } 498 }
494 } 499 }
495out_buf: 500out_buf:
@@ -521,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
521 return -ENOMEM; 526 return -ENOMEM;
522 ret = read_blk(info, blk, buf); 527 ret = read_blk(info, blk, buf);
523 if (ret < 0) { 528 if (ret < 0) {
524 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 529 quota_error(dquot->dq_sb, "Can't read quota tree "
530 "block %u", blk);
525 goto out_buf; 531 goto out_buf;
526 } 532 }
527 ddquot = buf + sizeof(struct qt_disk_dqdbheader); 533 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -531,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
531 ddquot += info->dqi_entry_size; 537 ddquot += info->dqi_entry_size;
532 } 538 }
533 if (i == qtree_dqstr_in_blk(info)) { 539 if (i == qtree_dqstr_in_blk(info)) {
534 printk(KERN_ERR "VFS: Quota for id %u referenced " 540 quota_error(dquot->dq_sb, "Quota for id %u referenced "
535 "but not present.\n", dquot->dq_id); 541 "but not present", dquot->dq_id);
536 ret = -EIO; 542 ret = -EIO;
537 goto out_buf; 543 goto out_buf;
538 } else { 544 } else {
@@ -556,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
556 return -ENOMEM; 562 return -ENOMEM;
557 ret = read_blk(info, blk, buf); 563 ret = read_blk(info, blk, buf);
558 if (ret < 0) { 564 if (ret < 0) {
559 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 565 quota_error(dquot->dq_sb, "Can't read quota tree block %u",
566 blk);
560 goto out_buf; 567 goto out_buf;
561 } 568 }
562 ret = 0; 569 ret = 0;
@@ -590,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
590#ifdef __QUOTA_QT_PARANOIA 597#ifdef __QUOTA_QT_PARANOIA
591 /* Invalidated quota? */ 598 /* Invalidated quota? */
592 if (!sb_dqopt(dquot->dq_sb)->files[type]) { 599 if (!sb_dqopt(dquot->dq_sb)->files[type]) {
593 printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); 600 quota_error(sb, "Quota invalidated while reading!");
594 return -EIO; 601 return -EIO;
595 } 602 }
596#endif 603#endif
@@ -599,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
599 offset = find_dqentry(info, dquot); 606 offset = find_dqentry(info, dquot);
600 if (offset <= 0) { /* Entry not present? */ 607 if (offset <= 0) { /* Entry not present? */
601 if (offset < 0) 608 if (offset < 0)
602 printk(KERN_ERR "VFS: Can't read quota " 609 quota_error(sb, "Can't read quota structure "
603 "structure for id %u.\n", dquot->dq_id); 610 "for id %u", dquot->dq_id);
604 dquot->dq_off = 0; 611 dquot->dq_off = 0;
605 set_bit(DQ_FAKE_B, &dquot->dq_flags); 612 set_bit(DQ_FAKE_B, &dquot->dq_flags);
606 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 613 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -617,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
617 if (ret != info->dqi_entry_size) { 624 if (ret != info->dqi_entry_size) {
618 if (ret >= 0) 625 if (ret >= 0)
619 ret = -EIO; 626 ret = -EIO;
620 printk(KERN_ERR "VFS: Error while reading quota " 627 quota_error(sb, "Error while reading quota structure for id %u",
621 "structure for id %u.\n", dquot->dq_id); 628 dquot->dq_id);
622 set_bit(DQ_FAKE_B, &dquot->dq_flags); 629 set_bit(DQ_FAKE_B, &dquot->dq_flags);
623 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 630 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
624 kfree(ddquot); 631 kfree(ddquot);
@@ -634,7 +641,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
634 spin_unlock(&dq_data_lock); 641 spin_unlock(&dq_data_lock);
635 kfree(ddquot); 642 kfree(ddquot);
636out: 643out:
637 dqstats.reads++; 644 dqstats_inc(DQST_READS);
638 return ret; 645 return ret;
639} 646}
640EXPORT_SYMBOL(qtree_read_dquot); 647EXPORT_SYMBOL(qtree_read_dquot);
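
The quota_tree.c conversions above replace bare printk(KERN_ERR "VFS: ...") calls with quota_error(), dropping the hand-rolled "VFS:" prefix, the per-callsite device name, and the trailing "\n". That only works if quota_error() supplies that boilerplate itself. Below is a minimal userspace sketch of such a helper; the quota_error()/__quota_error() names mirror the kernel's, but the body is illustrative, not the in-tree implementation:

#include <stdarg.h>
#include <stdio.h>

struct super_block { const char *s_id; };	/* stand-in for the kernel type */

static void __quota_error(struct super_block *sb, const char *func,
			  const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fprintf(stderr, "Quota error (device %s): %s: ", sb->s_id, func);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

/* __func__ lets every call site report its own function name for free. */
#define quota_error(sb, fmt, ...) \
	__quota_error((sb), __func__, fmt, ##__VA_ARGS__)
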
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 2ae757e9c008..34b37a67bb16 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot)
71 dquot->dq_dqb.dqb_ihardlimit == 0 && 71 dquot->dq_dqb.dqb_ihardlimit == 0 &&
72 dquot->dq_dqb.dqb_isoftlimit == 0) 72 dquot->dq_dqb.dqb_isoftlimit == 0)
73 set_bit(DQ_FAKE_B, &dquot->dq_flags); 73 set_bit(DQ_FAKE_B, &dquot->dq_flags);
74 dqstats.reads++; 74 dqstats_inc(DQST_READS);
75 75
76 return 0; 76 return 0;
77} 77}
@@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
95 (char *)&dqblk, sizeof(struct v1_disk_dqblk), 95 (char *)&dqblk, sizeof(struct v1_disk_dqblk),
96 v1_dqoff(dquot->dq_id)); 96 v1_dqoff(dquot->dq_id));
97 if (ret != sizeof(struct v1_disk_dqblk)) { 97 if (ret != sizeof(struct v1_disk_dqblk)) {
98 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 98 quota_error(dquot->dq_sb, "dquota write failed");
99 dquot->dq_sb->s_id);
100 if (ret >= 0) 99 if (ret >= 0)
101 ret = -EIO; 100 ret = -EIO;
102 goto out; 101 goto out;
@@ -104,7 +103,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
104 ret = 0; 103 ret = 0;
105 104
106out: 105out:
107 dqstats.writes++; 106 dqstats_inc(DQST_WRITES);
108 107
109 return ret; 108 return ret;
110} 109}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e3da02f4986f..65444d29406b 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type,
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead, 63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0); 64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) { 65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:" 66 quota_error(sb, "Failed header read: expected=%zd got=%zd",
67 " expected=%zd got=%zd\n", 67 sizeof(struct v2_disk_dqheader), size);
68 sizeof(struct v2_disk_dqheader), size);
69 return 0; 68 return 0;
70 } 69 }
71 return 1; 70 return 1;
@@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 105 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 106 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
108 if (size != sizeof(struct v2_disk_dqinfo)) { 107 if (size != sizeof(struct v2_disk_dqinfo)) {
109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", 108 quota_error(sb, "Can't read info structure");
110 sb->s_id);
111 return -1; 109 return -1;
112 } 110 }
113 info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); 111 info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
@@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 165 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 166 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
169 if (size != sizeof(struct v2_disk_dqinfo)) { 167 if (size != sizeof(struct v2_disk_dqinfo)) {
170 printk(KERN_WARNING "Can't write info structure on device %s.\n", 168 quota_error(sb, "Can't write info structure");
171 sb->s_id);
172 return -1; 169 return -1;
173 } 170 }
174 return 0; 171 return 0;
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .mmap = generic_file_mmap, 45 .mmap = generic_file_mmap,
46 .fsync = simple_sync_file, 46 .fsync = noop_fsync,
47 .splice_read = generic_file_splice_read, 47 .splice_read = generic_file_splice_read,
48 .splice_write = generic_file_splice_write, 48 .splice_write = generic_file_splice_write,
49 .llseek = generic_file_llseek, 49 .llseek = generic_file_llseek,
50}; 50};
51 51
52const struct inode_operations ramfs_file_inode_operations = { 52const struct inode_operations ramfs_file_inode_operations = {
53 .setattr = simple_setattr,
53 .getattr = simple_getattr, 54 .getattr = simple_getattr,
54}; 55};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad81a429..9eead2c796b7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
42 .aio_read = generic_file_aio_read, 42 .aio_read = generic_file_aio_read,
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .fsync = simple_sync_file, 45 .fsync = noop_fsync,
46 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
47 .splice_write = generic_file_splice_write, 47 .splice_write = generic_file_splice_write,
48 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
@@ -146,9 +146,8 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
146 return ret; 146 return ret;
147 } 147 }
148 148
149 ret = vmtruncate(inode, newsize); 149 truncate_setsize(inode, newsize);
150 150 return 0;
151 return ret;
152} 151}
153 152
154/*****************************************************************************/ 153/*****************************************************************************/
@@ -169,7 +168,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
169 168
170 /* pick out size-changing events */ 169 /* pick out size-changing events */
171 if (ia->ia_valid & ATTR_SIZE) { 170 if (ia->ia_valid & ATTR_SIZE) {
172 loff_t size = i_size_read(inode); 171 loff_t size = inode->i_size;
172
173 if (ia->ia_size != size) { 173 if (ia->ia_size != size) {
174 ret = ramfs_nommu_resize(inode, ia->ia_size, size); 174 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
175 if (ret < 0 || ia->ia_valid == ATTR_SIZE) 175 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -182,7 +182,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
182 } 182 }
183 } 183 }
184 184
185 ret = inode_setattr(inode, ia); 185 setattr_copy(inode, ia);
186 out: 186 out:
187 ia->ia_valid = old_ia_valid; 187 ia->ia_valid = old_ia_valid;
188 return ret; 188 return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c94853473ca9..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, 52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
53}; 53};
54 54
55struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) 55struct inode *ramfs_get_inode(struct super_block *sb,
56 const struct inode *dir, int mode, dev_t dev)
56{ 57{
57 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
58 59
59 if (inode) { 60 if (inode) {
60 inode->i_mode = mode; 61 inode_init_owner(inode, dir, mode);
61 inode->i_uid = current_fsuid();
62 inode->i_gid = current_fsgid();
63 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
65 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
@@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
95static int 94static int
96ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
97{ 96{
98 struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); 97 struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
99 int error = -ENOSPC; 98 int error = -ENOSPC;
100 99
101 if (inode) { 100 if (inode) {
102 if (dir->i_mode & S_ISGID) {
103 inode->i_gid = dir->i_gid;
104 if (S_ISDIR(mode))
105 inode->i_mode |= S_ISGID;
106 }
107 d_instantiate(dentry, inode); 101 d_instantiate(dentry, inode);
108 dget(dentry); /* Extra count - pin the dentry in core */ 102 dget(dentry); /* Extra count - pin the dentry in core */
109 error = 0; 103 error = 0;
@@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
130 struct inode *inode; 124 struct inode *inode;
131 int error = -ENOSPC; 125 int error = -ENOSPC;
132 126
133 inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 127 inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
134 if (inode) { 128 if (inode) {
135 int l = strlen(symname)+1; 129 int l = strlen(symname)+1;
136 error = page_symlink(inode, symname, l); 130 error = page_symlink(inode, symname, l);
137 if (!error) { 131 if (!error) {
138 if (dir->i_mode & S_ISGID)
139 inode->i_gid = dir->i_gid;
140 d_instantiate(dentry, inode); 132 d_instantiate(dentry, inode);
141 dget(dentry); 133 dget(dentry);
142 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 134 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -214,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
214 return 0; 206 return 0;
215} 207}
216 208
217static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 209int ramfs_fill_super(struct super_block *sb, void *data, int silent)
218{ 210{
219 struct ramfs_fs_info *fsi; 211 struct ramfs_fs_info *fsi;
220 struct inode *inode = NULL; 212 struct inode *inode = NULL;
@@ -241,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
241 sb->s_op = &ramfs_ops; 233 sb->s_op = &ramfs_ops;
242 sb->s_time_gran = 1; 234 sb->s_time_gran = 1;
243 235
244 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 236 inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
245 if (!inode) { 237 if (!inode) {
246 err = -ENOMEM; 238 err = -ENOMEM;
247 goto fail; 239 goto fail;
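
Both ramfs_mknod() above and new_inode_init() further down (in reiserfs/namei.c) delete the same open-coded setgid-directory logic in favour of inode_init_owner(). A self-contained sketch of what that helper does, using POSIX types and getuid()/getgid() as stand-ins for the kernel's types and current_fsuid()/current_fsgid():

#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

struct inode { mode_t i_mode; uid_t i_uid; gid_t i_gid; };

static void inode_init_owner(struct inode *inode, const struct inode *dir,
			     mode_t mode)
{
	inode->i_uid = getuid();		/* kernel: current_fsuid() */
	if (dir && (dir->i_mode & S_ISGID)) {
		inode->i_gid = dir->i_gid;	/* inherit the directory's group */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* new subdirectories stay setgid */
	} else {
		inode->i_gid = getgid();	/* kernel: current_fsgid() */
	}
	inode->i_mode = mode;
}

Passing dir == NULL, as ramfs_fill_super() now does for the root inode, simply falls through to the caller's own fsuid/fsgid.
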
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..74e36586e4d3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
97} 97}
98EXPORT_SYMBOL(generic_file_llseek); 98EXPORT_SYMBOL(generic_file_llseek);
99 99
100/**
101 * noop_llseek - No Operation Performed llseek implementation
102 * @file: file structure to seek on
103 * @offset: file offset to seek to
104 * @origin: type of seek
105 *
106 * This is an implementation of ->llseek useable for the rare special case when
107 * userspace expects the seek to succeed but the (device) file is actually not
108 * able to perform the seek. In this case you use noop_llseek() instead of
109 * falling back to the default implementation of ->llseek.
110 */
111loff_t noop_llseek(struct file *file, loff_t offset, int origin)
112{
113 return file->f_pos;
114}
115EXPORT_SYMBOL(noop_llseek);
116
100loff_t no_llseek(struct file *file, loff_t offset, int origin) 117loff_t no_llseek(struct file *file, loff_t offset, int origin)
101{ 118{
102 return -ESPIPE; 119 return -ESPIPE;
@@ -294,7 +311,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
294 else 311 else
295 ret = do_sync_read(file, buf, count, pos); 312 ret = do_sync_read(file, buf, count, pos);
296 if (ret > 0) { 313 if (ret > 0) {
297 fsnotify_access(file->f_path.dentry); 314 fsnotify_access(file);
298 add_rchar(current, ret); 315 add_rchar(current, ret);
299 } 316 }
300 inc_syscr(current); 317 inc_syscr(current);
@@ -350,7 +367,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
350 else 367 else
351 ret = do_sync_write(file, buf, count, pos); 368 ret = do_sync_write(file, buf, count, pos);
352 if (ret > 0) { 369 if (ret > 0) {
353 fsnotify_modify(file->f_path.dentry); 370 fsnotify_modify(file);
354 add_wchar(current, ret); 371 add_wchar(current, ret);
355 } 372 }
356 inc_syscw(current); 373 inc_syscw(current);
@@ -658,9 +675,9 @@ out:
658 kfree(iov); 675 kfree(iov);
659 if ((ret + (type == READ)) > 0) { 676 if ((ret + (type == READ)) > 0) {
660 if (type == READ) 677 if (type == READ)
661 fsnotify_access(file->f_path.dentry); 678 fsnotify_access(file);
662 else 679 else
663 fsnotify_modify(file->f_path.dentry); 680 fsnotify_modify(file);
664 } 681 }
665 return ret; 682 return ret;
666} 683}
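
The new noop_llseek() differs from the existing no_llseek() only in its return value, but the distinction matters to userspace: one reports success without moving the position, the other refuses the seek outright. A stand-alone restatement of the two semantics (the _demo suffix marks these as illustrations, not the kernel functions):

/* Seek "succeeds" but the file position never changes. */
long long noop_llseek_demo(long long *f_pos, long long offset, int origin)
{
	(void)offset;
	(void)origin;
	return *f_pos;		/* caller sees its current position */
}

/* Seek is not supported at all (pipes, sockets, ...). */
long long no_llseek_demo(long long *f_pos, long long offset, int origin)
{
	(void)f_pos;
	(void)offset;
	(void)origin;
	return -29;		/* -ESPIPE */
}
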
diff --git a/fs/readdir.c b/fs/readdir.c
index 7723401f8d8b..356f71528ad6 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/stddef.h>
7#include <linux/kernel.h> 8#include <linux/kernel.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/time.h> 10#include <linux/time.h>
@@ -54,7 +55,6 @@ EXPORT_SYMBOL(vfs_readdir);
54 * anyway. Thus the special "fillonedir()" function for that 55 * anyway. Thus the special "fillonedir()" function for that
55 * case (the low-level handlers don't need to care about this). 56 * case (the low-level handlers don't need to care about this).
56 */ 57 */
57#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
58 58
59#ifdef __ARCH_WANT_OLD_READDIR 59#ifdef __ARCH_WANT_OLD_READDIR
60 60
@@ -152,7 +152,8 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
152 struct linux_dirent __user * dirent; 152 struct linux_dirent __user * dirent;
153 struct getdents_callback * buf = (struct getdents_callback *) __buf; 153 struct getdents_callback * buf = (struct getdents_callback *) __buf;
154 unsigned long d_ino; 154 unsigned long d_ino;
155 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long)); 155 int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
156 sizeof(long));
156 157
157 buf->error = -EINVAL; /* only used if we fail.. */ 158 buf->error = -EINVAL; /* only used if we fail.. */
158 if (reclen > buf->count) 159 if (reclen > buf->count)
@@ -237,7 +238,8 @@ static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
237{ 238{
238 struct linux_dirent64 __user *dirent; 239 struct linux_dirent64 __user *dirent;
239 struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; 240 struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
240 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64)); 241 int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
242 sizeof(u64));
241 243
242 buf->error = -EINVAL; /* only used if we fail.. */ 244 buf->error = -EINVAL; /* only used if we fail.. */
243 if (reclen > buf->count) 245 if (reclen > buf->count)
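
The readdir.c change swaps a macro that computed the name offset by subtracting a possibly-invalid __user pointer from its own member -- technically undefined behaviour -- for the standard offsetof(), which needs no object at all. A compilable illustration of the record-length arithmetic (the struct layout matches the exported linux_dirent64 ABI; ALIGN is redefined locally for the sketch):

#include <stddef.h>
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

struct linux_dirent64 {
	unsigned long long	d_ino;
	long long		d_off;
	unsigned short		d_reclen;
	unsigned char		d_type;
	char			d_name[];	/* flexible array member */
};

int main(void)
{
	size_t namlen = sizeof("hello_world") - 1;
	/* +1 for the terminating NUL, rounded up to a u64 boundary */
	size_t reclen = ALIGN(offsetof(struct linux_dirent64, d_name)
			      + namlen + 1, sizeof(unsigned long long));

	printf("reclen for an 11-byte name: %zu\n", reclen);	/* prints 32 */
	return 0;
}
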
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,10 +14,10 @@
14extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
15 15
16static int reiserfs_readdir(struct file *, void *, filldir_t); 16static int reiserfs_readdir(struct file *, void *, filldir_t);
17static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, 17static int reiserfs_dir_fsync(struct file *filp, int datasync);
18 int datasync);
19 18
20const struct file_operations reiserfs_dir_operations = { 19const struct file_operations reiserfs_dir_operations = {
20 .llseek = generic_file_llseek,
21 .read = generic_read_dir, 21 .read = generic_read_dir,
22 .readdir = reiserfs_readdir, 22 .readdir = reiserfs_readdir,
23 .fsync = reiserfs_dir_fsync, 23 .fsync = reiserfs_dir_fsync,
@@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
27#endif 27#endif
28}; 28};
29 29
30static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, 30static int reiserfs_dir_fsync(struct file *filp, int datasync)
31 int datasync)
32{ 31{
33 struct inode *inode = dentry->d_inode; 32 struct inode *inode = filp->f_mapping->host;
34 int err; 33 int err;
35 reiserfs_write_lock(inode->i_sb); 34 reiserfs_write_lock(inode->i_sb);
36 err = reiserfs_commit_for_inode(inode); 35 err = reiserfs_commit_for_inode(inode);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..6846371498b6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -38,20 +38,24 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
38 38
39 BUG_ON(!S_ISREG(inode->i_mode)); 39 BUG_ON(!S_ISREG(inode->i_mode));
40 40
41 if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
42 return 0;
43
44 mutex_lock(&(REISERFS_I(inode)->tailpack));
45
46 if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
47 mutex_unlock(&(REISERFS_I(inode)->tailpack));
48 return 0;
49 }
50
41 /* fast out for when nothing needs to be done */ 51 /* fast out for when nothing needs to be done */
42 if ((atomic_read(&inode->i_count) > 1 || 52 if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
43 !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
44 !tail_has_to_be_packed(inode)) && 53 !tail_has_to_be_packed(inode)) &&
45 REISERFS_I(inode)->i_prealloc_count <= 0) { 54 REISERFS_I(inode)->i_prealloc_count <= 0) {
55 mutex_unlock(&(REISERFS_I(inode)->tailpack));
46 return 0; 56 return 0;
47 } 57 }
48 58
49 mutex_lock(&inode->i_mutex);
50
51 mutex_lock(&(REISERFS_I(inode)->i_mmap));
52 if (REISERFS_I(inode)->i_flags & i_ever_mapped)
53 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
54
55 reiserfs_write_lock(inode->i_sb); 59 reiserfs_write_lock(inode->i_sb);
56 /* freeing preallocation only involves relogging blocks that 60 /* freeing preallocation only involves relogging blocks that
57 * are already in the current transaction. preallocation gets 61 * are already in the current transaction. preallocation gets
@@ -94,9 +98,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
94 if (!err) 98 if (!err)
95 err = jbegin_failure; 99 err = jbegin_failure;
96 100
97 if (!err && atomic_read(&inode->i_count) <= 1 && 101 if (!err &&
98 (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && 102 (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
99 tail_has_to_be_packed(inode)) { 103 tail_has_to_be_packed(inode)) {
104
100 /* if regular file is released by last holder and it has been 105 /* if regular file is released by last holder and it has been
101 appended (we append by unformatted node only) or its direct 106 appended (we append by unformatted node only) or its direct
102 item(s) had to be converted, then it may have to be 107 item(s) had to be converted, then it may have to be
@@ -104,27 +109,28 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
104 err = reiserfs_truncate_file(inode, 0); 109 err = reiserfs_truncate_file(inode, 0);
105 } 110 }
106 out: 111 out:
107 mutex_unlock(&(REISERFS_I(inode)->i_mmap));
108 mutex_unlock(&inode->i_mutex);
109 reiserfs_write_unlock(inode->i_sb); 112 reiserfs_write_unlock(inode->i_sb);
113 mutex_unlock(&(REISERFS_I(inode)->tailpack));
110 return err; 114 return err;
111} 115}
112 116
113static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma) 117static int reiserfs_file_open(struct inode *inode, struct file *file)
114{ 118{
115 struct inode *inode; 119 int err = dquot_file_open(inode, file);
116 120 if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
117 inode = file->f_path.dentry->d_inode; 121 /* somebody might be tailpacking on final close; wait for it */
118 mutex_lock(&(REISERFS_I(inode)->i_mmap)); 122 mutex_lock(&(REISERFS_I(inode)->tailpack));
119 REISERFS_I(inode)->i_flags |= i_ever_mapped; 123 atomic_inc(&REISERFS_I(inode)->openers);
120 mutex_unlock(&(REISERFS_I(inode)->i_mmap)); 124 mutex_unlock(&(REISERFS_I(inode)->tailpack));
121 125 }
122 return generic_file_mmap(file, vma); 126 return err;
123} 127}
124 128
125static void reiserfs_vfs_truncate_file(struct inode *inode) 129static void reiserfs_vfs_truncate_file(struct inode *inode)
126{ 130{
131 mutex_lock(&(REISERFS_I(inode)->tailpack));
127 reiserfs_truncate_file(inode, 1); 132 reiserfs_truncate_file(inode, 1);
133 mutex_unlock(&(REISERFS_I(inode)->tailpack));
128} 134}
129 135
130/* Sync a reiserfs file. */ 136/* Sync a reiserfs file. */
@@ -134,10 +140,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
134 * be removed... 140 * be removed...
135 */ 141 */
136 142
137static int reiserfs_sync_file(struct file *filp, 143static int reiserfs_sync_file(struct file *filp, int datasync)
138 struct dentry *dentry, int datasync)
139{ 144{
140 struct inode *inode = dentry->d_inode; 145 struct inode *inode = filp->f_mapping->host;
141 int err; 146 int err;
142 int barrier_done; 147 int barrier_done;
143 148
@@ -147,7 +152,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 152 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 153 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 154 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 155 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
156 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 157 if (barrier_done < 0)
152 return barrier_done; 158 return barrier_done;
153 return (err < 0) ? -EIO : 0; 159 return (err < 0) ? -EIO : 0;
@@ -288,8 +294,8 @@ const struct file_operations reiserfs_file_operations = {
288#ifdef CONFIG_COMPAT 294#ifdef CONFIG_COMPAT
289 .compat_ioctl = reiserfs_compat_ioctl, 295 .compat_ioctl = reiserfs_compat_ioctl,
290#endif 296#endif
291 .mmap = reiserfs_file_mmap, 297 .mmap = generic_file_mmap,
292 .open = dquot_file_open, 298 .open = reiserfs_file_open,
293 .release = reiserfs_file_release, 299 .release = reiserfs_file_release,
294 .fsync = reiserfs_sync_file, 300 .fsync = reiserfs_sync_file,
295 .aio_read = generic_file_aio_read, 301 .aio_read = generic_file_aio_read,
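
The reiserfs_file_open()/reiserfs_file_release() rework above replaces the old i_mmap mutex and i_count heuristics with an atomic openers count plus a tailpack mutex: the last closer does its tail-packing under the mutex, and a racing open that sees the count at zero must wait for that close to finish. A userspace sketch of the same protocol using C11 atomics and pthreads (ifile and its fields are stand-ins, not the kernel types):

#include <pthread.h>
#include <stdatomic.h>

struct ifile {
	atomic_int	openers;
	pthread_mutex_t	tailpack;
};

static void ifile_open(struct ifile *f)
{
	int cur = atomic_load(&f->openers);

	/* atomic_inc_not_zero(): bump the count unless it is zero */
	while (cur != 0)
		if (atomic_compare_exchange_weak(&f->openers, &cur, cur + 1))
			return;

	/* somebody might be tailpacking on final close; wait for it */
	pthread_mutex_lock(&f->tailpack);
	atomic_fetch_add(&f->openers, 1);
	pthread_mutex_unlock(&f->tailpack);
}

static void ifile_release(struct ifile *f)
{
	int cur = atomic_load(&f->openers);

	/* atomic_add_unless(..., -1, 1): fast out unless we are the last */
	while (cur != 1)
		if (atomic_compare_exchange_weak(&f->openers, &cur, cur - 1))
			return;

	pthread_mutex_lock(&f->tailpack);
	if (atomic_fetch_sub(&f->openers, 1) != 1) {
		/* lost a race with a concurrent open; not the last after all */
		pthread_mutex_unlock(&f->tailpack);
		return;
	}
	/* last closer: tail packing / truncation would happen here */
	pthread_mutex_unlock(&f->tailpack);
}
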
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index dc2c65e04853..caa758377d66 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -25,7 +25,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
25int reiserfs_prepare_write(struct file *f, struct page *page, 25int reiserfs_prepare_write(struct file *f, struct page *page,
26 unsigned from, unsigned to); 26 unsigned from, unsigned to);
27 27
28void reiserfs_delete_inode(struct inode *inode) 28void reiserfs_evict_inode(struct inode *inode)
29{ 29{
30 /* We need blocks for transaction + (user+group) quota update (possibly delete) */ 30 /* We need blocks for transaction + (user+group) quota update (possibly delete) */
31 int jbegin_count = 31 int jbegin_count =
@@ -35,10 +35,12 @@ void reiserfs_delete_inode(struct inode *inode)
35 int depth; 35 int depth;
36 int err; 36 int err;
37 37
38 if (!is_bad_inode(inode)) 38 if (!inode->i_nlink && !is_bad_inode(inode))
39 dquot_initialize(inode); 39 dquot_initialize(inode);
40 40
41 truncate_inode_pages(&inode->i_data, 0); 41 truncate_inode_pages(&inode->i_data, 0);
42 if (inode->i_nlink)
43 goto no_delete;
42 44
43 depth = reiserfs_write_lock_once(inode->i_sb); 45 depth = reiserfs_write_lock_once(inode->i_sb);
44 46
@@ -77,9 +79,15 @@ void reiserfs_delete_inode(struct inode *inode)
77 ; 79 ;
78 } 80 }
79 out: 81 out:
80 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 82 end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */
83 dquot_drop(inode);
81 inode->i_blocks = 0; 84 inode->i_blocks = 0;
82 reiserfs_write_unlock_once(inode->i_sb, depth); 85 reiserfs_write_unlock_once(inode->i_sb, depth);
86 return;
87
88no_delete:
89 end_writeback(inode);
90 dquot_drop(inode);
83} 91}
84 92
85static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, 93static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
@@ -1138,7 +1146,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
1138 REISERFS_I(inode)->i_prealloc_count = 0; 1146 REISERFS_I(inode)->i_prealloc_count = 0;
1139 REISERFS_I(inode)->i_trans_id = 0; 1147 REISERFS_I(inode)->i_trans_id = 0;
1140 REISERFS_I(inode)->i_jl = NULL; 1148 REISERFS_I(inode)->i_jl = NULL;
1141 mutex_init(&(REISERFS_I(inode)->i_mmap));
1142 reiserfs_init_xattr_rwsem(inode); 1149 reiserfs_init_xattr_rwsem(inode);
1143 1150
1144 if (stat_data_v1(ih)) { 1151 if (stat_data_v1(ih)) {
@@ -1221,7 +1228,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
1221 inode_set_bytes(inode, 1228 inode_set_bytes(inode,
1222 to_real_used_space(inode, inode->i_blocks, 1229 to_real_used_space(inode, inode->i_blocks,
1223 SD_V2_SIZE)); 1230 SD_V2_SIZE));
1224 /* read persistent inode attributes from sd and initalise 1231 /* read persistent inode attributes from sd and initialise
1225 generic inode flags from them */ 1232 generic inode flags from them */
1226 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); 1233 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
1227 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); 1234 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
@@ -1841,7 +1848,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1841 REISERFS_I(inode)->i_attrs = 1848 REISERFS_I(inode)->i_attrs =
1842 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; 1849 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1843 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); 1850 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1844 mutex_init(&(REISERFS_I(inode)->i_mmap));
1845 reiserfs_init_xattr_rwsem(inode); 1851 reiserfs_init_xattr_rwsem(inode);
1846 1852
1847 /* key to search for correct place for new stat data */ 1853 /* key to search for correct place for new stat data */
@@ -2587,8 +2593,7 @@ static int reiserfs_write_begin(struct file *file,
2587 old_ref = th->t_refcount; 2593 old_ref = th->t_refcount;
2588 th->t_refcount++; 2594 th->t_refcount++;
2589 } 2595 }
2590 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2596 ret = __block_write_begin(page, pos, len, reiserfs_get_block);
2591 reiserfs_get_block);
2592 if (ret && reiserfs_transaction_running(inode->i_sb)) { 2597 if (ret && reiserfs_transaction_running(inode->i_sb)) {
2593 struct reiserfs_transaction_handle *th = current->journal_info; 2598 struct reiserfs_transaction_handle *th = current->journal_info;
2594 /* this gets a little ugly. If reiserfs_get_block returned an 2599 /* this gets a little ugly. If reiserfs_get_block returned an
@@ -3059,10 +3064,25 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3059{ 3064{
3060 struct file *file = iocb->ki_filp; 3065 struct file *file = iocb->ki_filp;
3061 struct inode *inode = file->f_mapping->host; 3066 struct inode *inode = file->f_mapping->host;
3067 ssize_t ret;
3062 3068
3063 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3069 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3064 offset, nr_segs, 3070 offset, nr_segs,
3065 reiserfs_get_blocks_direct_io, NULL); 3071 reiserfs_get_blocks_direct_io, NULL);
3072
3073 /*
3074 * In case of error extending write may have instantiated a few
3075 * blocks outside i_size. Trim these off again.
3076 */
3077 if (unlikely((rw & WRITE) && ret < 0)) {
3078 loff_t isize = i_size_read(inode);
3079 loff_t end = offset + iov_length(iov, nr_segs);
3080
3081 if (end > isize)
3082 vmtruncate(inode, isize);
3083 }
3084
3085 return ret;
3066} 3086}
3067 3087
3068int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) 3088int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3072,13 +3092,18 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3072 int depth; 3092 int depth;
3073 int error; 3093 int error;
3074 3094
3095 error = inode_change_ok(inode, attr);
3096 if (error)
3097 return error;
3098
3075 /* must be turned off for recursive notify_change calls */ 3099 /* must be turned off for recursive notify_change calls */
3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3100 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3077 3101
3078 depth = reiserfs_write_lock_once(inode->i_sb); 3102 depth = reiserfs_write_lock_once(inode->i_sb);
3079 if (attr->ia_valid & ATTR_SIZE) { 3103 if (is_quota_modification(inode, attr))
3080 dquot_initialize(inode); 3104 dquot_initialize(inode);
3081 3105
3106 if (attr->ia_valid & ATTR_SIZE) {
3082 /* version 2 items will be caught by the s_maxbytes check 3107 /* version 2 items will be caught by the s_maxbytes check
3083 ** done for us in vmtruncate 3108 ** done for us in vmtruncate
3084 */ 3109 */
@@ -3120,56 +3145,59 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3120 goto out; 3145 goto out;
3121 } 3146 }
3122 3147
3123 error = inode_change_ok(inode, attr); 3148 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3124 if (!error) { 3149 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3125 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3150 struct reiserfs_transaction_handle th;
3126 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3151 int jbegin_count =
3127 error = reiserfs_chown_xattrs(inode, attr); 3152 2 *
3153 (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
3154 REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
3155 2;
3128 3156
3129 if (!error) { 3157 error = reiserfs_chown_xattrs(inode, attr);
3130 struct reiserfs_transaction_handle th; 3158
3131 int jbegin_count = 3159 if (error)
3132 2 * 3160 return error;
3133 (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + 3161
 3134 REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + 3162 /* (user+group)*(old+new) structure - we count quota info and inode write (sb, inode) */
3135 2; 3163 error = journal_begin(&th, inode->i_sb, jbegin_count);
3136 3164 if (error)
 3137 /* (user+group)*(old+new) structure - we count quota info and inode write (sb, inode) */ 3165 goto out;
3138 error = 3166 error = dquot_transfer(inode, attr);
3139 journal_begin(&th, inode->i_sb, 3167 if (error) {
3140 jbegin_count); 3168 journal_end(&th, inode->i_sb, jbegin_count);
3141 if (error) 3169 goto out;
3142 goto out;
3143 error = dquot_transfer(inode, attr);
3144 if (error) {
3145 journal_end(&th, inode->i_sb,
3146 jbegin_count);
3147 goto out;
3148 }
3149 /* Update corresponding info in inode so that everything is in
3150 * one transaction */
3151 if (attr->ia_valid & ATTR_UID)
3152 inode->i_uid = attr->ia_uid;
3153 if (attr->ia_valid & ATTR_GID)
3154 inode->i_gid = attr->ia_gid;
3155 mark_inode_dirty(inode);
3156 error =
3157 journal_end(&th, inode->i_sb, jbegin_count);
3158 }
3159 }
3160 if (!error) {
3161 /*
3162 * Relax the lock here, as it might truncate the
3163 * inode pages and wait for inode pages locks.
3164 * To release such page lock, the owner needs the
3165 * reiserfs lock
3166 */
3167 reiserfs_write_unlock_once(inode->i_sb, depth);
3168 error = inode_setattr(inode, attr);
3169 depth = reiserfs_write_lock_once(inode->i_sb);
3170 } 3170 }
3171
3172 /* Update corresponding info in inode so that everything is in
3173 * one transaction */
3174 if (attr->ia_valid & ATTR_UID)
3175 inode->i_uid = attr->ia_uid;
3176 if (attr->ia_valid & ATTR_GID)
3177 inode->i_gid = attr->ia_gid;
3178 mark_inode_dirty(inode);
3179 error = journal_end(&th, inode->i_sb, jbegin_count);
3180 if (error)
3181 goto out;
3171 } 3182 }
3172 3183
3184 /*
3185 * Relax the lock here, as it might truncate the
3186 * inode pages and wait for inode pages locks.
3187 * To release such page lock, the owner needs the
3188 * reiserfs lock
3189 */
3190 reiserfs_write_unlock_once(inode->i_sb, depth);
3191 if ((attr->ia_valid & ATTR_SIZE) &&
3192 attr->ia_size != i_size_read(inode))
3193 error = vmtruncate(inode, attr->ia_size);
3194
3195 if (!error) {
3196 setattr_copy(inode, attr);
3197 mark_inode_dirty(inode);
3198 }
3199 depth = reiserfs_write_lock_once(inode->i_sb);
3200
3173 if (!error && reiserfs_posixacl(inode->i_sb)) { 3201 if (!error && reiserfs_posixacl(inode->i_sb)) {
3174 if (attr->ia_valid & ATTR_MODE) 3202 if (attr->ia_valid & ATTR_MODE)
3175 error = reiserfs_acl_chmod(inode); 3203 error = reiserfs_acl_chmod(inode);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de0712..5cbb81e134ac 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
170int reiserfs_unpack(struct inode *inode, struct file *filp) 170int reiserfs_unpack(struct inode *inode, struct file *filp)
171{ 171{
172 int retval = 0; 172 int retval = 0;
173 int depth;
173 int index; 174 int index;
174 struct page *page; 175 struct page *page;
175 struct address_space *mapping; 176 struct address_space *mapping;
@@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
188 /* we need to make sure nobody is changing the file size beneath 189 /* we need to make sure nobody is changing the file size beneath
189 ** us 190 ** us
190 */ 191 */
191 mutex_lock(&inode->i_mutex); 192 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
192 reiserfs_write_lock(inode->i_sb); 193 depth = reiserfs_write_lock_once(inode->i_sb);
193 194
194 write_from = inode->i_size & (blocksize - 1); 195 write_from = inode->i_size & (blocksize - 1);
195 /* if we are on a block boundary, we are already unpacked. */ 196 /* if we are on a block boundary, we are already unpacked. */
@@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
224 225
225 out: 226 out:
226 mutex_unlock(&inode->i_mutex); 227 mutex_unlock(&inode->i_mutex);
227 reiserfs_write_unlock(inode->i_sb); 228 reiserfs_write_unlock_once(inode->i_sb, depth);
228 return retval; 229 return retval;
229} 230}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 19fbc810e8e7..812e2c05aa29 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -983,7 +983,6 @@ static int flush_older_commits(struct super_block *s,
983 983
984static int reiserfs_async_progress_wait(struct super_block *s) 984static int reiserfs_async_progress_wait(struct super_block *s)
985{ 985{
986 DEFINE_WAIT(wait);
987 struct reiserfs_journal *j = SB_JOURNAL(s); 986 struct reiserfs_journal *j = SB_JOURNAL(s);
988 987
989 if (atomic_read(&j->j_async_throttle)) { 988 if (atomic_read(&j->j_async_throttle)) {
@@ -2312,7 +2311,7 @@ static int journal_read_transaction(struct super_block *sb,
2312 /* flush out the real blocks */ 2311 /* flush out the real blocks */
2313 for (i = 0; i < get_desc_trans_len(desc); i++) { 2312 for (i = 0; i < get_desc_trans_len(desc); i++) {
2314 set_buffer_dirty(real_blocks[i]); 2313 set_buffer_dirty(real_blocks[i]);
2315 ll_rw_block(SWRITE, 1, real_blocks + i); 2314 write_dirty_buffer(real_blocks[i], WRITE);
2316 } 2315 }
2317 for (i = 0; i < get_desc_trans_len(desc); i++) { 2316 for (i = 0; i < get_desc_trans_len(desc); i++) {
2318 wait_on_buffer(real_blocks[i]); 2317 wait_on_buffer(real_blocks[i]);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index d0c43cb99ffc..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
561*/ 561*/
562static int new_inode_init(struct inode *inode, struct inode *dir, int mode) 562static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
563{ 563{
564
565 /* the quota init calls have to know who to charge the quota to, so
566 ** we have to set uid and gid here
567 */
568 inode->i_uid = current_fsuid();
569 inode->i_mode = mode;
570 /* Make inode invalid - just in case we are going to drop it before 564 /* Make inode invalid - just in case we are going to drop it before
571 * the initialization happens */ 565 * the initialization happens */
572 INODE_PKEY(inode)->k_objectid = 0; 566 INODE_PKEY(inode)->k_objectid = 0;
573 567 /* the quota init calls have to know who to charge the quota to, so
574 if (dir->i_mode & S_ISGID) { 568 ** we have to set uid and gid here
575 inode->i_gid = dir->i_gid; 569 */
576 if (S_ISDIR(mode)) 570 inode_init_owner(inode, dir, mode);
577 inode->i_mode |= S_ISGID;
578 } else {
579 inode->i_gid = current_fsgid();
580 }
581 dquot_initialize(inode); 571 dquot_initialize(inode);
582 return 0; 572 return 0;
583} 573}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 59125fb36d42..e15ff612002d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
158#ifdef CONFIG_QUOTA 158#ifdef CONFIG_QUOTA
159 int i; 159 int i;
160 int ms_active_set; 160 int ms_active_set;
161 int quota_enabled[MAXQUOTAS];
161#endif 162#endif
162 163
163 /* compose key to look for "save" links */ 164 /* compose key to look for "save" links */
@@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
179 } 180 }
180 /* Turn on quotas so that they are updated correctly */ 181 /* Turn on quotas so that they are updated correctly */
181 for (i = 0; i < MAXQUOTAS; i++) { 182 for (i = 0; i < MAXQUOTAS; i++) {
183 quota_enabled[i] = 1;
182 if (REISERFS_SB(s)->s_qf_names[i]) { 184 if (REISERFS_SB(s)->s_qf_names[i]) {
183 int ret = reiserfs_quota_on_mount(s, i); 185 int ret;
186
187 if (sb_has_quota_active(s, i)) {
188 quota_enabled[i] = 0;
189 continue;
190 }
191 ret = reiserfs_quota_on_mount(s, i);
184 if (ret < 0) 192 if (ret < 0)
185 reiserfs_warning(s, "reiserfs-2500", 193 reiserfs_warning(s, "reiserfs-2500",
186 "cannot turn on journaled " 194 "cannot turn on journaled "
@@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
304#ifdef CONFIG_QUOTA 312#ifdef CONFIG_QUOTA
305 /* Turn quotas off */ 313 /* Turn quotas off */
306 for (i = 0; i < MAXQUOTAS; i++) { 314 for (i = 0; i < MAXQUOTAS; i++) {
307 if (sb_dqopt(s)->files[i]) 315 if (sb_dqopt(s)->files[i] && quota_enabled[i])
308 vfs_quota_off(s, i, 0); 316 dquot_quota_off(s, i);
309 } 317 }
310 if (ms_active_set) 318 if (ms_active_set)
311 /* Restore the flag back */ 319 /* Restore the flag back */
@@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
466 struct reiserfs_transaction_handle th; 474 struct reiserfs_transaction_handle th;
467 th.t_trans_id = 0; 475 th.t_trans_id = 0;
468 476
477 dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
478
469 reiserfs_write_lock(s); 479 reiserfs_write_lock(s);
470 480
471 if (s->s_dirt) 481 if (s->s_dirt)
@@ -515,6 +525,8 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
515 kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); 525 kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
516 if (!ei) 526 if (!ei)
517 return NULL; 527 return NULL;
528 atomic_set(&ei->openers, 0);
529 mutex_init(&ei->tailpack);
518 return &ei->vfs_inode; 530 return &ei->vfs_inode;
519} 531}
520 532
@@ -579,11 +591,6 @@ out:
579 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 591 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
580} 592}
581 593
582static void reiserfs_clear_inode(struct inode *inode)
583{
584 dquot_drop(inode);
585}
586
587#ifdef CONFIG_QUOTA 594#ifdef CONFIG_QUOTA
588static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, 595static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
589 size_t, loff_t); 596 size_t, loff_t);
@@ -596,8 +603,7 @@ static const struct super_operations reiserfs_sops = {
596 .destroy_inode = reiserfs_destroy_inode, 603 .destroy_inode = reiserfs_destroy_inode,
597 .write_inode = reiserfs_write_inode, 604 .write_inode = reiserfs_write_inode,
598 .dirty_inode = reiserfs_dirty_inode, 605 .dirty_inode = reiserfs_dirty_inode,
599 .clear_inode = reiserfs_clear_inode, 606 .evict_inode = reiserfs_evict_inode,
600 .delete_inode = reiserfs_delete_inode,
601 .put_super = reiserfs_put_super, 607 .put_super = reiserfs_put_super,
602 .write_super = reiserfs_write_super, 608 .write_super = reiserfs_write_super,
603 .sync_fs = reiserfs_sync_fs, 609 .sync_fs = reiserfs_sync_fs,
@@ -620,7 +626,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
620static int reiserfs_release_dquot(struct dquot *); 626static int reiserfs_release_dquot(struct dquot *);
621static int reiserfs_mark_dquot_dirty(struct dquot *); 627static int reiserfs_mark_dquot_dirty(struct dquot *);
622static int reiserfs_write_info(struct super_block *, int); 628static int reiserfs_write_info(struct super_block *, int);
623static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 629static int reiserfs_quota_on(struct super_block *, int, int, char *);
624 630
625static const struct dquot_operations reiserfs_quota_operations = { 631static const struct dquot_operations reiserfs_quota_operations = {
626 .write_dquot = reiserfs_write_dquot, 632 .write_dquot = reiserfs_write_dquot,
@@ -634,12 +640,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
634 640
635static const struct quotactl_ops reiserfs_qctl_operations = { 641static const struct quotactl_ops reiserfs_qctl_operations = {
636 .quota_on = reiserfs_quota_on, 642 .quota_on = reiserfs_quota_on,
637 .quota_off = vfs_quota_off, 643 .quota_off = dquot_quota_off,
638 .quota_sync = vfs_quota_sync, 644 .quota_sync = dquot_quota_sync,
639 .get_info = vfs_get_dqinfo, 645 .get_info = dquot_get_dqinfo,
640 .set_info = vfs_set_dqinfo, 646 .set_info = dquot_set_dqinfo,
641 .get_dqblk = vfs_get_dqblk, 647 .get_dqblk = dquot_get_dqblk,
642 .set_dqblk = vfs_set_dqblk, 648 .set_dqblk = dquot_set_dqblk,
643}; 649};
644#endif 650#endif
645 651
@@ -1242,6 +1248,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1242 if (s->s_flags & MS_RDONLY) 1248 if (s->s_flags & MS_RDONLY)
1243 /* it is read-only already */ 1249 /* it is read-only already */
1244 goto out_ok; 1250 goto out_ok;
1251
1252 err = dquot_suspend(s, -1);
1253 if (err < 0)
1254 goto out_err;
1255
1245 /* try to remount file system with read-only permissions */ 1256 /* try to remount file system with read-only permissions */
1246 if (sb_umount_state(rs) == REISERFS_VALID_FS 1257 if (sb_umount_state(rs) == REISERFS_VALID_FS
1247 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { 1258 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1306,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1295 s->s_dirt = 0; 1306 s->s_dirt = 0;
1296 1307
1297 if (!(*mount_flags & MS_RDONLY)) { 1308 if (!(*mount_flags & MS_RDONLY)) {
1309 dquot_resume(s, -1);
1298 finish_unfinished(s); 1310 finish_unfinished(s);
1299 reiserfs_xattr_init(s, *mount_flags); 1311 reiserfs_xattr_init(s, *mount_flags);
1300 } 1312 }
@@ -2022,15 +2034,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
2022 */ 2034 */
2023static int reiserfs_quota_on_mount(struct super_block *sb, int type) 2035static int reiserfs_quota_on_mount(struct super_block *sb, int type)
2024{ 2036{
2025 return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], 2037 return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
2026 REISERFS_SB(sb)->s_jquota_fmt, type); 2038 REISERFS_SB(sb)->s_jquota_fmt, type);
2027} 2039}
2028 2040
2029/* 2041/*
2030 * Standard function to be called on quota_on 2042 * Standard function to be called on quota_on
2031 */ 2043 */
2032static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, 2044static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2033 char *name, int remount) 2045 char *name)
2034{ 2046{
2035 int err; 2047 int err;
2036 struct path path; 2048 struct path path;
@@ -2039,9 +2051,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2039 2051
2040 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) 2052 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
2041 return -EINVAL; 2053 return -EINVAL;
2042 /* No more checks needed? Path and format_id are bogus anyway... */ 2054
2043 if (remount)
2044 return vfs_quota_on(sb, type, format_id, name, 1);
2045 err = kern_path(name, LOOKUP_FOLLOW, &path); 2055 err = kern_path(name, LOOKUP_FOLLOW, &path);
2046 if (err) 2056 if (err)
2047 return err; 2057 return err;
@@ -2085,7 +2095,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2085 if (err) 2095 if (err)
2086 goto out; 2096 goto out;
2087 } 2097 }
2088 err = vfs_quota_on_path(sb, type, format_id, &path); 2098 err = dquot_quota_on_path(sb, type, format_id, &path);
2089out: 2099out:
2090 path_put(&path); 2100 path_put(&path);
2091 return err; 2101 return err;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e7cc00e636dc..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -723,11 +723,11 @@ out:
723 (handler) = *(handlers)++) 723 (handler) = *(handlers)++)
724 724
725/* This is the implementation for the xattr plugin infrastructure */ 725/* This is the implementation for the xattr plugin infrastructure */
726static inline struct xattr_handler * 726static inline const struct xattr_handler *
727find_xattr_handler_prefix(struct xattr_handler **handlers, 727find_xattr_handler_prefix(const struct xattr_handler **handlers,
728 const char *name) 728 const char *name)
729{ 729{
730 struct xattr_handler *xah; 730 const struct xattr_handler *xah;
731 731
732 if (!handlers) 732 if (!handlers)
733 return NULL; 733 return NULL;
@@ -748,7 +748,7 @@ ssize_t
748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, 748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
749 size_t size) 749 size_t size)
750{ 750{
751 struct xattr_handler *handler; 751 const struct xattr_handler *handler;
752 752
753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
754 754
@@ -767,7 +767,7 @@ int
767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, 767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
768 size_t size, int flags) 768 size_t size, int flags)
769{ 769{
770 struct xattr_handler *handler; 770 const struct xattr_handler *handler;
771 771
772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
773 773
@@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
784 */ 784 */
785int reiserfs_removexattr(struct dentry *dentry, const char *name) 785int reiserfs_removexattr(struct dentry *dentry, const char *name)
786{ 786{
787 struct xattr_handler *handler; 787 const struct xattr_handler *handler;
788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
789 789
790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
807 size_t size; 807 size_t size;
808 if (name[0] != '.' || 808 if (name[0] != '.' ||
809 (namelen != 1 && (name[1] != '.' || namelen != 2))) { 809 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
810 struct xattr_handler *handler; 810 const struct xattr_handler *handler;
811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, 811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
812 name); 812 name);
813 if (!handler) /* Unsupported xattr name */ 813 if (!handler) /* Unsupported xattr name */
@@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
920#endif 920#endif
921 921
922/* Actual operations that are exported to VFS-land */ 922/* Actual operations that are exported to VFS-land */
923struct xattr_handler *reiserfs_xattr_handlers[] = { 923const struct xattr_handler *reiserfs_xattr_handlers[] = {
924#ifdef CONFIG_REISERFS_FS_XATTR 924#ifdef CONFIG_REISERFS_FS_XATTR
925 &reiserfs_xattr_user_handler, 925 &reiserfs_xattr_user_handler,
926 &reiserfs_xattr_trusted_handler, 926 &reiserfs_xattr_trusted_handler,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 9cdb759645a9..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
500 return size; 500 return size;
501} 501}
502 502
503struct xattr_handler reiserfs_posix_acl_access_handler = { 503const struct xattr_handler reiserfs_posix_acl_access_handler = {
504 .prefix = POSIX_ACL_XATTR_ACCESS, 504 .prefix = POSIX_ACL_XATTR_ACCESS,
505 .flags = ACL_TYPE_ACCESS, 505 .flags = ACL_TYPE_ACCESS,
506 .get = posix_acl_get, 506 .get = posix_acl_get,
@@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
520 return size; 520 return size;
521} 521}
522 522
523struct xattr_handler reiserfs_posix_acl_default_handler = { 523const struct xattr_handler reiserfs_posix_acl_default_handler = {
524 .prefix = POSIX_ACL_XATTR_DEFAULT, 524 .prefix = POSIX_ACL_XATTR_DEFAULT,
525 .flags = ACL_TYPE_DEFAULT, 525 .flags = ACL_TYPE_DEFAULT,
526 .get = posix_acl_get, 526 .get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 7271a477c041..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
111 sec->value = NULL; 111 sec->value = NULL;
112} 112}
113 113
114struct xattr_handler reiserfs_xattr_security_handler = { 114const struct xattr_handler reiserfs_xattr_security_handler = {
115 .prefix = XATTR_SECURITY_PREFIX, 115 .prefix = XATTR_SECURITY_PREFIX,
116 .get = security_get, 116 .get = security_get,
117 .set = security_set, 117 .set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
48 return len; 48 return len;
49} 49}
50 50
51struct xattr_handler reiserfs_xattr_trusted_handler = { 51const struct xattr_handler reiserfs_xattr_trusted_handler = {
52 .prefix = XATTR_TRUSTED_PREFIX, 52 .prefix = XATTR_TRUSTED_PREFIX,
53 .get = trusted_get, 53 .get = trusted_get,
54 .set = trusted_set, 54 .set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
44 return len; 44 return len;
45} 45}
46 46
47struct xattr_handler reiserfs_xattr_user_handler = { 47const struct xattr_handler reiserfs_xattr_user_handler = {
48 .prefix = XATTR_USER_PREFIX, 48 .prefix = XATTR_USER_PREFIX,
49 .get = user_get, 49 .get = user_get,
50 .set = user_set, 50 .set = user_set,
diff --git a/fs/signalfd.c b/fs/signalfd.c
index f329849ce3c0..1c5a6add779d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
88 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); 88 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
89 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); 89 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
90 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); 90 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
91 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
91 break; 92 break;
92 case __SI_POLL: 93 case __SI_POLL:
93 err |= __put_user(kinfo->si_band, &uinfo->ssi_band); 94 err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
@@ -111,6 +112,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
111 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); 112 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
112 err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); 113 err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
113 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); 114 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
115 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
114 break; 116 break;
115 default: 117 default:
116 /* 118 /*
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
37 37
38const struct file_operations smb_dir_operations = 38const struct file_operations smb_dir_operations =
39{ 39{
40 .llseek = generic_file_llseek,
40 .read = generic_read_dir, 41 .read = generic_read_dir,
41 .readdir = smb_readdir, 42 .readdir = smb_readdir,
42 .ioctl = smb_ioctl, 43 .unlocked_ioctl = smb_ioctl,
43 .open = smb_dir_open, 44 .open = smb_dir_open,
44}; 45};
45 46
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf06..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -28,8 +28,9 @@
28#include "proto.h" 28#include "proto.h"
29 29
30static int 30static int
31smb_fsync(struct file *file, struct dentry * dentry, int datasync) 31smb_fsync(struct file *file, int datasync)
32{ 32{
33 struct dentry *dentry = file->f_path.dentry;
33 struct smb_sb_info *server = server_from_dentry(dentry); 34 struct smb_sb_info *server = server_from_dentry(dentry);
34 int result; 35 int result;
35 36
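smb_fsync() is adjusted for the ->fsync() prototype change in this merge, which drops the dentry argument; implementations that still need the dentry recover it from the file, exactly as above. The pattern as a minimal sketch (example_fsync is a hypothetical name):

    /* Sketch of a post-change ->fsync(): the dentry is no longer
     * passed in, so derive it from the struct file when needed. */
    static int example_fsync(struct file *file, int datasync)
    {
            struct dentry *dentry = file->f_path.dentry;
            struct inode *inode = dentry->d_inode;

            /* flush private dirty state here; datasync selects whether
             * timestamp-only updates may be skipped */
            return sync_mapping_buffers(inode->i_mapping);
    }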
@@ -437,7 +438,7 @@ const struct file_operations smb_file_operations =
437 .aio_read = smb_file_aio_read, 438 .aio_read = smb_file_aio_read,
438 .write = do_sync_write, 439 .write = do_sync_write,
439 .aio_write = smb_file_aio_write, 440 .aio_write = smb_file_aio_write,
440 .ioctl = smb_ioctl, 441 .unlocked_ioctl = smb_ioctl,
441 .mmap = smb_file_mmap, 442 .mmap = smb_file_mmap,
442 .open = smb_file_open, 443 .open = smb_file_open,
443 .release = smb_file_release, 444 .release = smb_file_release,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index dfa1d67f8fca..450c91941988 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -46,7 +46,7 @@
46 46
47#define SMB_TTL_DEFAULT 1000 47#define SMB_TTL_DEFAULT 1000
48 48
49static void smb_delete_inode(struct inode *); 49static void smb_evict_inode(struct inode *);
50static void smb_put_super(struct super_block *); 50static void smb_put_super(struct super_block *);
51static int smb_statfs(struct dentry *, struct kstatfs *); 51static int smb_statfs(struct dentry *, struct kstatfs *);
52static int smb_show_options(struct seq_file *, struct vfsmount *); 52static int smb_show_options(struct seq_file *, struct vfsmount *);
@@ -102,7 +102,7 @@ static const struct super_operations smb_sops =
102 .alloc_inode = smb_alloc_inode, 102 .alloc_inode = smb_alloc_inode,
103 .destroy_inode = smb_destroy_inode, 103 .destroy_inode = smb_destroy_inode,
104 .drop_inode = generic_delete_inode, 104 .drop_inode = generic_delete_inode,
105 .delete_inode = smb_delete_inode, 105 .evict_inode = smb_evict_inode,
106 .put_super = smb_put_super, 106 .put_super = smb_put_super,
107 .statfs = smb_statfs, 107 .statfs = smb_statfs,
108 .show_options = smb_show_options, 108 .show_options = smb_show_options,
@@ -324,15 +324,15 @@ out:
324 * All blocking cleanup operations need to go here to avoid races. 324 * All blocking cleanup operations need to go here to avoid races.
325 */ 325 */
326static void 326static void
327smb_delete_inode(struct inode *ino) 327smb_evict_inode(struct inode *ino)
328{ 328{
329 DEBUG1("ino=%ld\n", ino->i_ino); 329 DEBUG1("ino=%ld\n", ino->i_ino);
330 truncate_inode_pages(&ino->i_data, 0); 330 truncate_inode_pages(&ino->i_data, 0);
331 end_writeback(ino);
331 lock_kernel(); 332 lock_kernel();
332 if (smb_close(ino)) 333 if (smb_close(ino))
333 PARANOIA("could not close inode %ld\n", ino->i_ino); 334 PARANOIA("could not close inode %ld\n", ino->i_ino);
334 unlock_kernel(); 335 unlock_kernel();
335 clear_inode(ino);
336} 336}
337 337
338static struct option opts[] = { 338static struct option opts[] = {
@@ -714,9 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
714 error = server->ops->truncate(inode, attr->ia_size); 714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error) 715 if (error)
716 goto out; 716 goto out;
717 error = vmtruncate(inode, attr->ia_size); 717 truncate_setsize(inode, attr->ia_size);
718 if (error)
719 goto out;
720 refresh = 1; 718 refresh = 1;
721 } 719 }
722 720
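fs/smbfs/inode.c picks up two related VFS conversions from this merge: ->delete_inode() becomes ->evict_inode(), which now truncates pages and calls end_writeback() itself rather than finishing with clear_inode(), and the setattr-driven truncate switches from the failable vmtruncate() to truncate_setsize(), which cannot fail once the server-side ->truncate has succeeded. The resulting eviction shape, as a minimal sketch (example_evict_inode is a hypothetical name):

    /* Sketch of a post-conversion ->evict_inode() for a simple fs. */
    static void example_evict_inode(struct inode *inode)
    {
            truncate_inode_pages(&inode->i_data, 0);
            end_writeback(inode);   /* replaces the old clear_inode() */
            /* filesystem-private teardown (closing handles, etc.) here */
    }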
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
16#include <linux/net.h> 17#include <linux/net.h>
17 18
18#include <linux/smb_fs.h> 19#include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
22 23
23#include "proto.h" 24#include "proto.h"
24 25
25int 26long
26smb_ioctl(struct inode *inode, struct file *filp, 27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
27 unsigned int cmd, unsigned long arg)
28{ 28{
29 struct smb_sb_info *server = server_from_inode(inode); 29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt; 30 struct smb_conn_opt opt;
31 int result = -EINVAL; 31 int result = -EINVAL;
32 32
33 lock_kernel();
33 switch (cmd) { 34 switch (cmd) {
34 uid16_t uid16; 35 uid16_t uid16;
35 uid_t uid32; 36 uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
62 default: 63 default:
63 break; 64 break;
64 } 65 }
66 unlock_kernel();
65 67
66 return result; 68 return result;
67} 69}
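This is the standard BKL-pushdown conversion: the ->ioctl() method, which was called with the big kernel lock held, becomes ->unlocked_ioctl(), the inode argument is derived from the file, and lock_kernel()/unlock_kernel() are taken explicitly inside the handler until the code is audited for finer-grained locking. The shape of the conversion as a sketch (example_unlocked_ioctl is a hypothetical name):

    static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
                                       unsigned long arg)
    {
            long result = -EINVAL;

            lock_kernel();  /* keep the coverage the old ->ioctl() had */
            /* the inode is no longer passed in; when needed:
             *     struct inode *inode = filp->f_path.dentry->d_inode;
             */
            switch (cmd) {
            default:
                    break;
            }
            unlock_kernel();
            return result;
    }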
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations; 67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations; 68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */ 69/* ioctl.c */
70extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); 70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */ 71/* smbiod.c */
72extern void smbiod_wake_up(void); 72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server); 73extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21#include <asm/system.h> 20#include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2e..8f1dfaecc8f0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
193 break; 193 break;
194 } 194 }
195 195
196 if (pipe->nrbufs < PIPE_BUFFERS) { 196 if (pipe->nrbufs < pipe->buffers) {
197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
198 struct pipe_buffer *buf = pipe->bufs + newbuf; 198 struct pipe_buffer *buf = pipe->bufs + newbuf;
199 199
200 buf->page = spd->pages[page_nr]; 200 buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
214 214
215 if (!--spd->nr_pages) 215 if (!--spd->nr_pages)
216 break; 216 break;
217 if (pipe->nrbufs < PIPE_BUFFERS) 217 if (pipe->nrbufs < pipe->buffers)
218 continue; 218 continue;
219 219
220 break; 220 break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
265 page_cache_release(spd->pages[i]); 265 page_cache_release(spd->pages[i]);
266} 266}
267 267
268/*
269 * Check if we need to grow the arrays holding pages and partial page
270 * descriptions.
271 */
272int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
273{
274 if (pipe->buffers <= PIPE_DEF_BUFFERS)
275 return 0;
276
277 spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
278 spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
279
280 if (spd->pages && spd->partial)
281 return 0;
282
283 kfree(spd->pages);
284 kfree(spd->partial);
285 return -ENOMEM;
286}
287
288void splice_shrink_spd(struct pipe_inode_info *pipe,
289 struct splice_pipe_desc *spd)
290{
291 if (pipe->buffers <= PIPE_DEF_BUFFERS)
292 return;
293
294 kfree(spd->pages);
295 kfree(spd->partial);
296}
297
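The two helpers above exist because pipe capacity is no longer the compile-time PIPE_BUFFERS: splice_pipe_desc users keep PIPE_DEF_BUFFERS-sized arrays on the stack and swap in kmalloc'd ones only when the pipe was resized larger. Every converted caller in this file follows the same bracket; a condensed sketch (example_splice_fill is a hypothetical name, and the .ops/.spd_release fields a real caller must set are omitted):

    static ssize_t example_splice_fill(struct pipe_inode_info *pipe)
    {
            struct page *pages[PIPE_DEF_BUFFERS];
            struct partial_page partial[PIPE_DEF_BUFFERS];
            struct splice_pipe_desc spd = {
                    .pages   = pages,    /* replaced by kmalloc'd arrays if grown */
                    .partial = partial,
            };
            ssize_t ret;

            if (splice_grow_spd(pipe, &spd))
                    return -ENOMEM;

            /* fill spd.pages[]/spd.partial[], never beyond pipe->buffers */
            ret = splice_to_pipe(pipe, &spd);

            splice_shrink_spd(pipe, &spd);  /* frees only if it grew */
            return ret;
    }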
268static int 298static int
269__generic_file_splice_read(struct file *in, loff_t *ppos, 299__generic_file_splice_read(struct file *in, loff_t *ppos,
270 struct pipe_inode_info *pipe, size_t len, 300 struct pipe_inode_info *pipe, size_t len,
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
272{ 302{
273 struct address_space *mapping = in->f_mapping; 303 struct address_space *mapping = in->f_mapping;
274 unsigned int loff, nr_pages, req_pages; 304 unsigned int loff, nr_pages, req_pages;
275 struct page *pages[PIPE_BUFFERS]; 305 struct page *pages[PIPE_DEF_BUFFERS];
276 struct partial_page partial[PIPE_BUFFERS]; 306 struct partial_page partial[PIPE_DEF_BUFFERS];
277 struct page *page; 307 struct page *page;
278 pgoff_t index, end_index; 308 pgoff_t index, end_index;
279 loff_t isize; 309 loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
286 .spd_release = spd_release_page, 316 .spd_release = spd_release_page,
287 }; 317 };
288 318
319 if (splice_grow_spd(pipe, &spd))
320 return -ENOMEM;
321
289 index = *ppos >> PAGE_CACHE_SHIFT; 322 index = *ppos >> PAGE_CACHE_SHIFT;
290 loff = *ppos & ~PAGE_CACHE_MASK; 323 loff = *ppos & ~PAGE_CACHE_MASK;
291 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 324 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
292 nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); 325 nr_pages = min(req_pages, pipe->buffers);
293 326
294 /* 327 /*
295 * Lookup the (hopefully) full range of pages we need. 328 * Lookup the (hopefully) full range of pages we need.
296 */ 329 */
297 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); 330 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
298 index += spd.nr_pages; 331 index += spd.nr_pages;
299 332
300 /* 333 /*
@@ -321,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
321 break; 354 break;
322 355
323 error = add_to_page_cache_lru(page, mapping, index, 356 error = add_to_page_cache_lru(page, mapping, index,
324 mapping_gfp_mask(mapping)); 357 GFP_KERNEL);
325 if (unlikely(error)) { 358 if (unlikely(error)) {
326 page_cache_release(page); 359 page_cache_release(page);
327 if (error == -EEXIST) 360 if (error == -EEXIST)
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
335 unlock_page(page); 368 unlock_page(page);
336 } 369 }
337 370
338 pages[spd.nr_pages++] = page; 371 spd.pages[spd.nr_pages++] = page;
339 index++; 372 index++;
340 } 373 }
341 374
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
356 * this_len is the max we'll use from this page 389 * this_len is the max we'll use from this page
357 */ 390 */
358 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 391 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
359 page = pages[page_nr]; 392 page = spd.pages[page_nr];
360 393
361 if (PageReadahead(page)) 394 if (PageReadahead(page))
362 page_cache_async_readahead(mapping, &in->f_ra, in, 395 page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -366,17 +399,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
366 * If the page isn't uptodate, we may need to start io on it 399 * If the page isn't uptodate, we may need to start io on it
367 */ 400 */
368 if (!PageUptodate(page)) { 401 if (!PageUptodate(page)) {
369 /* 402 lock_page(page);
370 * If in nonblock mode then dont block on waiting
371 * for an in-flight io page
372 */
373 if (flags & SPLICE_F_NONBLOCK) {
374 if (!trylock_page(page)) {
375 error = -EAGAIN;
376 break;
377 }
378 } else
379 lock_page(page);
380 403
381 /* 404 /*
382 * Page was truncated, or invalidated by the 405 * Page was truncated, or invalidated by the
@@ -393,8 +416,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
393 error = -ENOMEM; 416 error = -ENOMEM;
394 break; 417 break;
395 } 418 }
396 page_cache_release(pages[page_nr]); 419 page_cache_release(spd.pages[page_nr]);
397 pages[page_nr] = page; 420 spd.pages[page_nr] = page;
398 } 421 }
399 /* 422 /*
400 * page was already under io and is now done, great 423 * page was already under io and is now done, great
@@ -451,8 +474,8 @@ fill_it:
451 len = this_len; 474 len = this_len;
452 } 475 }
453 476
454 partial[page_nr].offset = loff; 477 spd.partial[page_nr].offset = loff;
455 partial[page_nr].len = this_len; 478 spd.partial[page_nr].len = this_len;
456 len -= this_len; 479 len -= this_len;
457 loff = 0; 480 loff = 0;
458 spd.nr_pages++; 481 spd.nr_pages++;
@@ -464,12 +487,13 @@ fill_it:
464 * we got, 'nr_pages' is how many pages are in the map. 487 * we got, 'nr_pages' is how many pages are in the map.
465 */ 488 */
466 while (page_nr < nr_pages) 489 while (page_nr < nr_pages)
467 page_cache_release(pages[page_nr++]); 490 page_cache_release(spd.pages[page_nr++]);
468 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 491 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
469 492
470 if (spd.nr_pages) 493 if (spd.nr_pages)
471 return splice_to_pipe(pipe, &spd); 494 error = splice_to_pipe(pipe, &spd);
472 495
496 splice_shrink_spd(pipe, &spd);
473 return error; 497 return error;
474} 498}
475 499
@@ -560,10 +584,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
560 unsigned int nr_pages; 584 unsigned int nr_pages;
561 unsigned int nr_freed; 585 unsigned int nr_freed;
562 size_t offset; 586 size_t offset;
563 struct page *pages[PIPE_BUFFERS]; 587 struct page *pages[PIPE_DEF_BUFFERS];
564 struct partial_page partial[PIPE_BUFFERS]; 588 struct partial_page partial[PIPE_DEF_BUFFERS];
565 struct iovec vec[PIPE_BUFFERS]; 589 struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
566 pgoff_t index;
567 ssize_t res; 590 ssize_t res;
568 size_t this_len; 591 size_t this_len;
569 int error; 592 int error;
@@ -576,11 +599,21 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
576 .spd_release = spd_release_page, 599 .spd_release = spd_release_page,
577 }; 600 };
578 601
579 index = *ppos >> PAGE_CACHE_SHIFT; 602 if (splice_grow_spd(pipe, &spd))
603 return -ENOMEM;
604
605 res = -ENOMEM;
606 vec = __vec;
607 if (pipe->buffers > PIPE_DEF_BUFFERS) {
608 vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
609 if (!vec)
610 goto shrink_ret;
611 }
612
580 offset = *ppos & ~PAGE_CACHE_MASK; 613 offset = *ppos & ~PAGE_CACHE_MASK;
581 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 614 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
582 615
583 for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { 616 for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
584 struct page *page; 617 struct page *page;
585 618
586 page = alloc_page(GFP_USER); 619 page = alloc_page(GFP_USER);
@@ -591,7 +624,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
591 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); 624 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
592 vec[i].iov_base = (void __user *) page_address(page); 625 vec[i].iov_base = (void __user *) page_address(page);
593 vec[i].iov_len = this_len; 626 vec[i].iov_len = this_len;
594 pages[i] = page; 627 spd.pages[i] = page;
595 spd.nr_pages++; 628 spd.nr_pages++;
596 len -= this_len; 629 len -= this_len;
597 offset = 0; 630 offset = 0;
@@ -610,11 +643,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
610 nr_freed = 0; 643 nr_freed = 0;
611 for (i = 0; i < spd.nr_pages; i++) { 644 for (i = 0; i < spd.nr_pages; i++) {
612 this_len = min_t(size_t, vec[i].iov_len, res); 645 this_len = min_t(size_t, vec[i].iov_len, res);
613 partial[i].offset = 0; 646 spd.partial[i].offset = 0;
614 partial[i].len = this_len; 647 spd.partial[i].len = this_len;
615 if (!this_len) { 648 if (!this_len) {
616 __free_page(pages[i]); 649 __free_page(spd.pages[i]);
617 pages[i] = NULL; 650 spd.pages[i] = NULL;
618 nr_freed++; 651 nr_freed++;
619 } 652 }
620 res -= this_len; 653 res -= this_len;
@@ -625,13 +658,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
625 if (res > 0) 658 if (res > 0)
626 *ppos += res; 659 *ppos += res;
627 660
661shrink_ret:
662 if (vec != __vec)
663 kfree(vec);
664 splice_shrink_spd(pipe, &spd);
628 return res; 665 return res;
629 666
630err: 667err:
631 for (i = 0; i < spd.nr_pages; i++) 668 for (i = 0; i < spd.nr_pages; i++)
632 __free_page(pages[i]); 669 __free_page(spd.pages[i]);
633 670
634 return error; 671 res = error;
672 goto shrink_ret;
635} 673}
636EXPORT_SYMBOL(default_file_splice_read); 674EXPORT_SYMBOL(default_file_splice_read);
637 675
@@ -784,7 +822,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
784 if (!buf->len) { 822 if (!buf->len) {
785 buf->ops = NULL; 823 buf->ops = NULL;
786 ops->release(pipe, buf); 824 ops->release(pipe, buf);
787 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 825 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
788 pipe->nrbufs--; 826 pipe->nrbufs--;
789 if (pipe->inode) 827 if (pipe->inode)
790 sd->need_wakeup = true; 828 sd->need_wakeup = true;
@@ -1211,7 +1249,7 @@ out_release:
1211 * If we did an incomplete transfer we must release 1249 * If we did an incomplete transfer we must release
1212 * the pipe buffers in question: 1250 * the pipe buffers in question:
1213 */ 1251 */
1214 for (i = 0; i < PIPE_BUFFERS; i++) { 1252 for (i = 0; i < pipe->buffers; i++) {
1215 struct pipe_buffer *buf = pipe->bufs + i; 1253 struct pipe_buffer *buf = pipe->bufs + i;
1216 1254
1217 if (buf->ops) { 1255 if (buf->ops) {
@@ -1232,7 +1270,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
1232{ 1270{
1233 struct file *file = sd->u.file; 1271 struct file *file = sd->u.file;
1234 1272
1235 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); 1273 return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
1274 sd->flags);
1236} 1275}
1237 1276
1238/** 1277/**
@@ -1321,8 +1360,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1321 if (off_in) 1360 if (off_in)
1322 return -ESPIPE; 1361 return -ESPIPE;
1323 if (off_out) { 1362 if (off_out) {
1324 if (!out->f_op || !out->f_op->llseek || 1363 if (!(out->f_mode & FMODE_PWRITE))
1325 out->f_op->llseek == no_llseek)
1326 return -EINVAL; 1364 return -EINVAL;
1327 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1365 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1328 return -EFAULT; 1366 return -EFAULT;
@@ -1342,8 +1380,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1342 if (off_out) 1380 if (off_out)
1343 return -ESPIPE; 1381 return -ESPIPE;
1344 if (off_in) { 1382 if (off_in) {
1345 if (!in->f_op || !in->f_op->llseek || 1383 if (!(in->f_mode & FMODE_PREAD))
1346 in->f_op->llseek == no_llseek)
1347 return -EINVAL; 1384 return -EINVAL;
1348 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1385 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1349 return -EFAULT; 1386 return -EFAULT;
@@ -1371,7 +1408,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1371 */ 1408 */
1372static int get_iovec_page_array(const struct iovec __user *iov, 1409static int get_iovec_page_array(const struct iovec __user *iov,
1373 unsigned int nr_vecs, struct page **pages, 1410 unsigned int nr_vecs, struct page **pages,
1374 struct partial_page *partial, int aligned) 1411 struct partial_page *partial, int aligned,
1412 unsigned int pipe_buffers)
1375{ 1413{
1376 int buffers = 0, error = 0; 1414 int buffers = 0, error = 0;
1377 1415
@@ -1414,8 +1452,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1414 break; 1452 break;
1415 1453
1416 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1454 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1417 if (npages > PIPE_BUFFERS - buffers) 1455 if (npages > pipe_buffers - buffers)
1418 npages = PIPE_BUFFERS - buffers; 1456 npages = pipe_buffers - buffers;
1419 1457
1420 error = get_user_pages_fast((unsigned long)base, npages, 1458 error = get_user_pages_fast((unsigned long)base, npages,
1421 0, &pages[buffers]); 1459 0, &pages[buffers]);
@@ -1450,7 +1488,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1450 * or if we mapped the max number of pages that we have 1488 * or if we mapped the max number of pages that we have
1451 * room for. 1489 * room for.
1452 */ 1490 */
1453 if (error < npages || buffers == PIPE_BUFFERS) 1491 if (error < npages || buffers == pipe_buffers)
1454 break; 1492 break;
1455 1493
1456 nr_vecs--; 1494 nr_vecs--;
@@ -1593,8 +1631,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1593 unsigned long nr_segs, unsigned int flags) 1631 unsigned long nr_segs, unsigned int flags)
1594{ 1632{
1595 struct pipe_inode_info *pipe; 1633 struct pipe_inode_info *pipe;
1596 struct page *pages[PIPE_BUFFERS]; 1634 struct page *pages[PIPE_DEF_BUFFERS];
1597 struct partial_page partial[PIPE_BUFFERS]; 1635 struct partial_page partial[PIPE_DEF_BUFFERS];
1598 struct splice_pipe_desc spd = { 1636 struct splice_pipe_desc spd = {
1599 .pages = pages, 1637 .pages = pages,
1600 .partial = partial, 1638 .partial = partial,
@@ -1602,17 +1640,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1602 .ops = &user_page_pipe_buf_ops, 1640 .ops = &user_page_pipe_buf_ops,
1603 .spd_release = spd_release_page, 1641 .spd_release = spd_release_page,
1604 }; 1642 };
1643 long ret;
1605 1644
1606 pipe = pipe_info(file->f_path.dentry->d_inode); 1645 pipe = pipe_info(file->f_path.dentry->d_inode);
1607 if (!pipe) 1646 if (!pipe)
1608 return -EBADF; 1647 return -EBADF;
1609 1648
1610 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1649 if (splice_grow_spd(pipe, &spd))
1611 flags & SPLICE_F_GIFT); 1650 return -ENOMEM;
1651
1652 spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1653 spd.partial, flags & SPLICE_F_GIFT,
1654 pipe->buffers);
1612 if (spd.nr_pages <= 0) 1655 if (spd.nr_pages <= 0)
1613 return spd.nr_pages; 1656 ret = spd.nr_pages;
1657 else
1658 ret = splice_to_pipe(pipe, &spd);
1614 1659
1615 return splice_to_pipe(pipe, &spd); 1660 splice_shrink_spd(pipe, &spd);
1661 return ret;
1616} 1662}
1617 1663
1618/* 1664/*
@@ -1738,13 +1784,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1738 * Check ->nrbufs without the inode lock first. This function 1784 * Check ->nrbufs without the inode lock first. This function
1739 * is speculative anyways, so missing one is ok. 1785 * is speculative anyways, so missing one is ok.
1740 */ 1786 */
1741 if (pipe->nrbufs < PIPE_BUFFERS) 1787 if (pipe->nrbufs < pipe->buffers)
1742 return 0; 1788 return 0;
1743 1789
1744 ret = 0; 1790 ret = 0;
1745 pipe_lock(pipe); 1791 pipe_lock(pipe);
1746 1792
1747 while (pipe->nrbufs >= PIPE_BUFFERS) { 1793 while (pipe->nrbufs >= pipe->buffers) {
1748 if (!pipe->readers) { 1794 if (!pipe->readers) {
1749 send_sig(SIGPIPE, current, 0); 1795 send_sig(SIGPIPE, current, 0);
1750 ret = -EPIPE; 1796 ret = -EPIPE;
@@ -1810,7 +1856,7 @@ retry:
1810 * Cannot make any progress, because either the input 1856 * Cannot make any progress, because either the input
1811 * pipe is empty or the output pipe is full. 1857 * pipe is empty or the output pipe is full.
1812 */ 1858 */
1813 if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { 1859 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1814 /* Already processed some buffers, break */ 1860 /* Already processed some buffers, break */
1815 if (ret) 1861 if (ret)
1816 break; 1862 break;
@@ -1831,7 +1877,7 @@ retry:
1831 } 1877 }
1832 1878
1833 ibuf = ipipe->bufs + ipipe->curbuf; 1879 ibuf = ipipe->bufs + ipipe->curbuf;
1834 nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; 1880 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1835 obuf = opipe->bufs + nbuf; 1881 obuf = opipe->bufs + nbuf;
1836 1882
1837 if (len >= ibuf->len) { 1883 if (len >= ibuf->len) {
@@ -1841,7 +1887,7 @@ retry:
1841 *obuf = *ibuf; 1887 *obuf = *ibuf;
1842 ibuf->ops = NULL; 1888 ibuf->ops = NULL;
1843 opipe->nrbufs++; 1889 opipe->nrbufs++;
1844 ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; 1890 ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1845 ipipe->nrbufs--; 1891 ipipe->nrbufs--;
1846 input_wakeup = true; 1892 input_wakeup = true;
1847 } else { 1893 } else {
@@ -1914,11 +1960,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1914 * If we have iterated all input buffers or ran out of 1960 * If we have iterated all input buffers or ran out of
1915 * output room, break. 1961 * output room, break.
1916 */ 1962 */
1917 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1963 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1918 break; 1964 break;
1919 1965
1920 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1966 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1921 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1967 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1922 1968
1923 /* 1969 /*
1924 * Get a reference to this pipe buffer, 1970 * Get a reference to this pipe buffer,
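Every PIPE_BUFFERS use in this file becomes the per-pipe pipe->buffers, and the one remaining % PIPE_BUFFERS in the pipe-to-pipe ring arithmetic is converted to a mask. That is only correct while the buffer count stays a power of two, which the resizable-pipe code is assumed to guarantee by rounding requested sizes accordingly. The invariant as a sketch:

    /* Ring-slot arithmetic assumed by the hunks above; valid only for
     * power-of-two pipe->buffers. */
    static unsigned int next_slot(unsigned int curbuf, unsigned int nrbufs,
                                  unsigned int buffers)
    {
            return (curbuf + nrbufs) & (buffers - 1);  /* same as % buffers */
    }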
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..e5f63da64d04 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,13 +5,13 @@ config SQUASHFS
5 help 5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed 6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only 7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib compression to compress both 8 filesystem for Linux. It uses zlib/lzo compression to compress both
9 files, inodes and directories. Inodes in the system are very small 9 files, inodes and directories. Inodes in the system are very small
10 and all blocks are packed to minimise data overhead. Block sizes 10 and all blocks are packed to minimise data overhead. Block sizes
11 greater than 4K are supported up to a maximum of 1 Mbytes (default 11 greater than 4K are supported up to a maximum of 1 Mbytes (default
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files 12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
13 (larger than 4GB), full uid/gid information, hard links and 13 (larger than 4GB), full uid/gid information, hard links and
14 timestamps. 14 timestamps.
15 15
16 Squashfs is intended for general read-only filesystem use, for 16 Squashfs is intended for general read-only filesystem use, for
17 archival use (i.e. in cases where a .tar.gz file may be used), and in 17 archival use (i.e. in cases where a .tar.gz file may be used), and in
@@ -26,9 +26,35 @@ config SQUASHFS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config SQUASHFS_EMBEDDED 29config SQUASHFS_XATTR
30 bool "Squashfs XATTR support"
31 depends on SQUASHFS
32 default n
33 help
34 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by
36 the kernel or by users (see the attr(5) manual page).
37
38 If unsure, say N.
39
40config SQUASHFS_LZO
41 bool "Include support for LZO compressed file systems"
42 depends on SQUASHFS
43 default n
44 select LZO_DECOMPRESS
45 help
46 Saying Y here includes support for reading Squashfs file systems
47 compressed with LZO compression. LZO compression is mainly
48 aimed at embedded systems with slower CPUs where the overheads
49 of zlib are too high.
50
51 LZO is not the standard compression used in Squashfs and so most
52 file systems will be readable without selecting this option.
30 53
31 bool "Additional option for memory-constrained systems" 54 If unsure, say N.
55
56config SQUASHFS_EMBEDDED
57 bool "Additional option for memory-constrained systems"
32 depends on SQUASHFS 58 depends on SQUASHFS
33 default n 59 default n
34 help 60 help
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..7672bac8d328 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 157478da6ac9..24af9ce9722f 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -40,9 +40,11 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
41}; 41};
42 42
43#ifndef CONFIG_SQUASHFS_LZO
43static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { 44static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
45}; 46};
47#endif
46 48
47static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 49static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
48 NULL, NULL, NULL, 0, "unknown", 0 50 NULL, NULL, NULL, 0, "unknown", 0
@@ -51,7 +53,11 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
51static const struct squashfs_decompressor *decompressor[] = { 53static const struct squashfs_decompressor *decompressor[] = {
52 &squashfs_zlib_comp_ops, 54 &squashfs_zlib_comp_ops,
53 &squashfs_lzma_unsupported_comp_ops, 55 &squashfs_lzma_unsupported_comp_ops,
56#ifdef CONFIG_SQUASHFS_LZO
57 &squashfs_lzo_comp_ops,
58#else
54 &squashfs_lzo_unsupported_comp_ops, 59 &squashfs_lzo_unsupported_comp_ops,
60#endif
55 &squashfs_unknown_comp_ops 61 &squashfs_unknown_comp_ops
56}; 62};
57 63
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
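The new tail of squashfs_read_inode() resolves the on-disk xattr id once, caching the xattr list location, count and size in squashfs_inode_info so the xattr handlers never touch the id table again. It also charges the xattr list to i_blocks, with ((xattr_size - 1) >> 9) + 1 rounding the size up to whole 512-byte units. Worked values for that rounding:

    /* Rounding check for the i_blocks charge above:
     *   xattr_size = 1   -> ((1 - 1) >> 9) + 1   = 1 sector
     *   xattr_size = 512 -> ((512 - 1) >> 9) + 1 = 1 sector
     *   xattr_size = 513 -> ((513 - 1) >> 9) + 1 = 2 sectors
     */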
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
new file mode 100644
index 000000000000..5d87789bf1c1
--- /dev/null
+++ b/fs/squashfs/lzo_wrapper.c
@@ -0,0 +1,136 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010 LG Electronics
5 * Chan Jeong <chan.jeong@lge.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * lzo_wrapper.c
22 */
23
24#include <linux/mutex.h>
25#include <linux/buffer_head.h>
26#include <linux/slab.h>
27#include <linux/vmalloc.h>
28#include <linux/lzo.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_lzo {
37 void *input;
38 void *output;
39};
40
41static void *lzo_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48 stream->input = vmalloc(block_size);
49 if (stream->input == NULL)
50 goto failed;
51 stream->output = vmalloc(block_size);
52 if (stream->output == NULL)
53 goto failed2;
54
55 return stream;
56
57failed2:
58 vfree(stream->input);
59failed:
60 ERROR("Failed to allocate lzo workspace\n");
61 kfree(stream);
62 return NULL;
63}
64
65
66static void lzo_free(void *strm)
67{
68 struct squashfs_lzo *stream = strm;
69
70 if (stream) {
71 vfree(stream->input);
72 vfree(stream->output);
73 }
74 kfree(stream);
75}
76
77
78static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
79 struct buffer_head **bh, int b, int offset, int length, int srclength,
80 int pages)
81{
82 struct squashfs_lzo *stream = msblk->stream;
83 void *buff = stream->input;
84 int avail, i, bytes = length, res;
85 size_t out_len = srclength;
86
87 mutex_lock(&msblk->read_data_mutex);
88
89 for (i = 0; i < b; i++) {
90 wait_on_buffer(bh[i]);
91 if (!buffer_uptodate(bh[i]))
92 goto block_release;
93
94 avail = min(bytes, msblk->devblksize - offset);
95 memcpy(buff, bh[i]->b_data + offset, avail);
96 buff += avail;
97 bytes -= avail;
98 offset = 0;
99 put_bh(bh[i]);
100 }
101
102 res = lzo1x_decompress_safe(stream->input, (size_t)length,
103 stream->output, &out_len);
104 if (res != LZO_E_OK)
105 goto failed;
106
107 res = bytes = (int)out_len;
108 for (i = 0, buff = stream->output; bytes && i < pages; i++) {
109 avail = min_t(int, bytes, PAGE_CACHE_SIZE);
110 memcpy(buffer[i], buff, avail);
111 buff += avail;
112 bytes -= avail;
113 }
114
115 mutex_unlock(&msblk->read_data_mutex);
116 return res;
117
118block_release:
119 for (; i < b; i++)
120 put_bh(bh[i]);
121
122failed:
123 mutex_unlock(&msblk->read_data_mutex);
124
125 ERROR("lzo decompression failed, data probably corrupt\n");
126 return -EIO;
127}
128
129const struct squashfs_decompressor squashfs_lzo_comp_ops = {
130 .init = lzo_init,
131 .free = lzo_free,
132 .decompress = lzo_uncompress,
133 .id = LZO_COMPRESSION,
134 .name = "lzo",
135 .supported = 1
136};
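lzo_wrapper.c is a complete instance of the squashfs_decompressor contract: lzo_init() sizes its scratch buffers for the larger of a data block and a metadata block (both kinds pass through the same stream), and lzo_uncompress() gathers the buffer_head contents, decompresses under read_data_mutex, then scatters the result over the page array. Selection happens at mount time by matching the superblock's compression field against the .id member; the lookup lives in decompressor.c, which is not shown in this hunk, but given the decompressor[] table above it is essentially a sketch like:

    /* Sketch of the mount-time lookup over the decompressor[] table;
     * the unknown entry with id 0 terminates the scan. */
    static const struct squashfs_decompressor *example_lookup(int id)
    {
            int i;

            for (i = 0; decompressor[i]->id; i++)
                    if (id == decompressor[i]->id)
                            break;

            return decompressor[i];
    }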
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..5d45569d5f72 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,21 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
107
108/* lzo_wrapper.c */
109extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..c5137fc9ab11 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -248,7 +274,7 @@ struct squashfs_base_inode {
248 __le16 uid; 274 __le16 uid;
249 __le16 guid; 275 __le16 guid;
250 __le32 mtime; 276 __le32 mtime;
251 __le32 inode_number; 277 __le32 inode_number;
252}; 278};
253 279
254struct squashfs_ipc_inode { 280struct squashfs_ipc_inode {
@@ -257,19 +283,42 @@ struct squashfs_ipc_inode {
257 __le16 uid; 283 __le16 uid;
258 __le16 guid; 284 __le16 guid;
259 __le32 mtime; 285 __le32 mtime;
260 __le32 inode_number; 286 __le32 inode_number;
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
267 __le16 uid; 304 __le16 uid;
268 __le16 guid; 305 __le16 guid;
269 __le32 mtime; 306 __le32 mtime;
270 __le32 inode_number; 307 __le32 inode_number;
308 __le32 nlink;
309 __le32 rdev;
310};
311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
271 __le32 nlink; 319 __le32 nlink;
272 __le32 rdev; 320 __le32 rdev;
321 __le32 xattr;
273}; 322};
274 323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
@@ -278,7 +327,7 @@ struct squashfs_symlink_inode {
278 __le16 uid; 327 __le16 uid;
279 __le16 guid; 328 __le16 guid;
280 __le32 mtime; 329 __le32 mtime;
281 __le32 inode_number; 330 __le32 inode_number;
282 __le32 nlink; 331 __le32 nlink;
283 __le32 symlink_size; 332 __le32 symlink_size;
284 char symlink[0]; 333 char symlink[0];
@@ -290,7 +339,7 @@ struct squashfs_reg_inode {
290 __le16 uid; 339 __le16 uid;
291 __le16 guid; 340 __le16 guid;
292 __le32 mtime; 341 __le32 mtime;
293 __le32 inode_number; 342 __le32 inode_number;
294 __le32 start_block; 343 __le32 start_block;
295 __le32 fragment; 344 __le32 fragment;
296 __le32 offset; 345 __le32 offset;
@@ -304,7 +353,7 @@ struct squashfs_lreg_inode {
304 __le16 uid; 353 __le16 uid;
305 __le16 guid; 354 __le16 guid;
306 __le32 mtime; 355 __le32 mtime;
307 __le32 inode_number; 356 __le32 inode_number;
308 __le64 start_block; 357 __le64 start_block;
309 __le64 file_size; 358 __le64 file_size;
310 __le64 sparse; 359 __le64 sparse;
@@ -321,7 +370,7 @@ struct squashfs_dir_inode {
321 __le16 uid; 370 __le16 uid;
322 __le16 guid; 371 __le16 guid;
323 __le32 mtime; 372 __le32 mtime;
324 __le32 inode_number; 373 __le32 inode_number;
325 __le32 start_block; 374 __le32 start_block;
326 __le32 nlink; 375 __le32 nlink;
327 __le16 file_size; 376 __le16 file_size;
@@ -335,7 +384,7 @@ struct squashfs_ldir_inode {
335 __le16 uid; 384 __le16 uid;
336 __le16 guid; 385 __le16 guid;
337 __le32 mtime; 386 __le32 mtime;
338 __le32 inode_number; 387 __le32 inode_number;
339 __le32 nlink; 388 __le32 nlink;
340 __le32 file_size; 389 __le32 file_size;
341 __le32 start_block; 390 __le32 start_block;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
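The new SQUASHFS_XATTR_BLK/SQUASHFS_XATTR_OFFSET macros show how a 64-bit xattr reference is packed: the upper bits hold the metadata block's byte position relative to the xattr table start, the low 16 bits the offset inside the decompressed block. squashfs_listxattr() in xattr.c below performs exactly this split. A small decoding sketch, assuming only the macros above (example_decode and the TRACE output are illustrative):

    /* Hypothetical decode of a packed xattr reference. */
    static void example_decode(struct squashfs_sb_info *msblk, u64 xattr_ref)
    {
            u64 block  = SQUASHFS_XATTR_BLK(xattr_ref);     /* upper bits  */
            int offset = SQUASHFS_XATTR_OFFSET(xattr_ref);  /* low 16 bits */
            u64 start  = msblk->xattr_table + block;  /* metadata block start */

            TRACE("xattr walk starts at %llx, offset %x\n", start, offset);
    }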
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
301 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
302 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
303 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
304 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
305 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
306 kfree(sblk); 317 kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
355 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
356 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
357 kfree(sbi->inode_lookup_table); 368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
358 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
359 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
360 } 372 }
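Note the error handling in allocate_xattr_table: the only tolerated failure is -ENOTSUPP, which (judging from the err != -ENOTSUPP test above) the squashfs_read_xattr_id_table() stub returns when xattr support is compiled out. The mount then proceeds with xattr_id_table left NULL, and squashfs_listxattr() reports -EOPNOTSUPP, replacing the old behaviour of merely warning that xattrs would be ignored:

    /* Degradation path, as wired above:
     *   squashfs_read_xattr_id_table() -> ERR_PTR(-ENOTSUPP)
     *     => msblk->xattr_id_table stays NULL, mount continues
     *   squashfs_listxattr()           -> -EOPNOTSUPP (table == NULL)
     */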
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
38 39
39#include "squashfs_fs.h" 40#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
42#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
43 45
44static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
45{ 47{
@@ -114,3 +116,12 @@ error_out:
114const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
115 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
116}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
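
Note (not part of the diff): a small userspace sketch of what the new squashfs_symlink_inode_ops enables. readlink(2) goes through .readlink = generic_readlink, and the l-prefixed xattr calls operate on the link itself rather than its target, so llistxattr() exercises the new .listxattr hook. Assumes the argument names a symlink on a mounted squashfs image:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char target[4096], list[1024];
	ssize_t n, len;
	char *name;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <symlink>\n", argv[0]);
		return 1;
	}

	/* readlink(2) is backed by .readlink = generic_readlink */
	n = readlink(argv[1], target, sizeof(target) - 1);
	if (n >= 0) {
		target[n] = '\0';
		printf("-> %s\n", target);
	}

	/* llistxattr() does not follow the link, so it hits the
	 * symlink inode's own .listxattr */
	len = llistxattr(argv[1], list, sizeof(list));
	for (name = list; len > 0 && name < list + len;
	     name += strlen(name) + 1)
		printf("xattr: %s\n", name);
	return 0;
}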
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..652b8541f9c6
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
 91 /* no handler or insufficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 xattr = le64_to_cpu(xattr);
172 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr);
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
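
Note (not part of the diff): once an image containing extended attributes is mounted, listxattr(2) and getxattr(2) are served by squashfs_listxattr() and the handlers above. A minimal userspace sketch that dumps every attribute of a file, assuming the values are text and fit in the fixed buffers:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char list[1024], value[1024];
	ssize_t len, vlen;
	char *name;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file-on-squashfs>\n", argv[0]);
		return 1;
	}

	/* name enumeration is served by squashfs_listxattr() */
	len = listxattr(argv[1], list, sizeof(list));
	if (len < 0) {
		perror("listxattr");
		return 1;
	}

	/* the buffer holds consecutive NUL-terminated names */
	for (name = list; name < list + len; name += strlen(name) + 1) {
		vlen = getxattr(argv[1], name, value, sizeof(value) - 1);
		if (vlen < 0) {
			perror(name);
			continue;
		}
		value[vlen] = '\0';
		printf("%s = %s\n", name, value);
	}
	return 0;
}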
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..49fe0d719fbf
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTR
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
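
Note (not part of the diff): the stub half of this header keeps callers free of #ifdef CONFIG_SQUASHFS_XATTR. With the option off, squashfs_read_xattr_id_table() returns ERR_PTR(-ENOTSUPP), which squashfs_fill_super() deliberately tolerates, and the NULL #defines disable the VFS hooks. A standalone sketch of the same pattern, using hypothetical names (userspace errno.h has no ENOTSUPP, so EOPNOTSUPP stands in):

#include <stdio.h>
#include <errno.h>

#define CONFIG_FEATURE 0	/* flip to 1 to "build in" the feature */

#if CONFIG_FEATURE
static int feature_read_table(void) { return 0; }
#else
static inline int feature_read_table(void) { return -EOPNOTSUPP; }
#endif

int main(void)
{
	int err = feature_read_table();

	/* the caller tolerates "not supported", mirroring the
	 * err != -ENOTSUPP check in squashfs_fill_super() */
	if (err && err != -EOPNOTSUPP) {
		fprintf(stderr, "fatal error %d\n", err);
		return 1;
	}
	printf("mount continues, xattrs %s\n",
	       err ? "ignored" : "enabled");
	return 0;
}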
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id look up table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
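
Note (not part of the diff): the index-to-location arithmetic hidden in the SQUASHFS_XATTR_BLOCK* macros is easy to see in isolation. A sketch under the assumption (per squashfs_fs.h) that metadata blocks are 8192 bytes and struct squashfs_xattr_id is 16 bytes, so each cached index entry covers 512 ids:

#include <stdio.h>
#include <stdint.h>

/* assumed on-disk constants, see squashfs_fs.h */
#define METADATA_SIZE	8192
#define XATTR_ID_SIZE	16
#define IDS_PER_BLOCK	(METADATA_SIZE / XATTR_ID_SIZE)

int main(void)
{
	uint32_t index = 1234;	/* hypothetical xattr id from an inode */

	/* mirrors SQUASHFS_XATTR_BLOCK()/SQUASHFS_XATTR_BLOCK_OFFSET() */
	uint32_t block  = index / IDS_PER_BLOCK;
	uint32_t offset = (index % IDS_PER_BLOCK) * XATTR_ID_SIZE;

	printf("id %u -> table block %u, byte offset %u\n",
	       index, block, offset);
	return 0;
}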
diff --git a/fs/stat.c b/fs/stat.c
index c4ecd52c5737..12e90e213900 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -68,7 +68,8 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
68} 68}
69EXPORT_SYMBOL(vfs_fstat); 69EXPORT_SYMBOL(vfs_fstat);
70 70
71int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag) 71int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
72 int flag)
72{ 73{
73 struct path path; 74 struct path path;
74 int error = -EINVAL; 75 int error = -EINVAL;
@@ -91,13 +92,13 @@ out:
91} 92}
92EXPORT_SYMBOL(vfs_fstatat); 93EXPORT_SYMBOL(vfs_fstatat);
93 94
94int vfs_stat(char __user *name, struct kstat *stat) 95int vfs_stat(const char __user *name, struct kstat *stat)
95{ 96{
96 return vfs_fstatat(AT_FDCWD, name, stat, 0); 97 return vfs_fstatat(AT_FDCWD, name, stat, 0);
97} 98}
98EXPORT_SYMBOL(vfs_stat); 99EXPORT_SYMBOL(vfs_stat);
99 100
100int vfs_lstat(char __user *name, struct kstat *stat) 101int vfs_lstat(const char __user *name, struct kstat *stat)
101{ 102{
102 return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); 103 return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
103} 104}
@@ -147,7 +148,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
147 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 148 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
148} 149}
149 150
150SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) 151SYSCALL_DEFINE2(stat, const char __user *, filename,
152 struct __old_kernel_stat __user *, statbuf)
151{ 153{
152 struct kstat stat; 154 struct kstat stat;
153 int error; 155 int error;
@@ -159,7 +161,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *
159 return cp_old_stat(&stat, statbuf); 161 return cp_old_stat(&stat, statbuf);
160} 162}
161 163
162SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) 164SYSCALL_DEFINE2(lstat, const char __user *, filename,
165 struct __old_kernel_stat __user *, statbuf)
163{ 166{
164 struct kstat stat; 167 struct kstat stat;
165 int error; 168 int error;
@@ -234,7 +237,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
234 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 237 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
235} 238}
236 239
237SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf) 240SYSCALL_DEFINE2(newstat, const char __user *, filename,
241 struct stat __user *, statbuf)
238{ 242{
239 struct kstat stat; 243 struct kstat stat;
240 int error = vfs_stat(filename, &stat); 244 int error = vfs_stat(filename, &stat);
@@ -244,7 +248,8 @@ SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
244 return cp_new_stat(&stat, statbuf); 248 return cp_new_stat(&stat, statbuf);
245} 249}
246 250
247SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf) 251SYSCALL_DEFINE2(newlstat, const char __user *, filename,
252 struct stat __user *, statbuf)
248{ 253{
249 struct kstat stat; 254 struct kstat stat;
250 int error; 255 int error;
@@ -257,7 +262,7 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf
257} 262}
258 263
259#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) 264#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
260SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename, 265SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
261 struct stat __user *, statbuf, int, flag) 266 struct stat __user *, statbuf, int, flag)
262{ 267{
263 struct kstat stat; 268 struct kstat stat;
@@ -355,7 +360,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
355 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 360 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
356} 361}
357 362
358SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf) 363SYSCALL_DEFINE2(stat64, const char __user *, filename,
364 struct stat64 __user *, statbuf)
359{ 365{
360 struct kstat stat; 366 struct kstat stat;
361 int error = vfs_stat(filename, &stat); 367 int error = vfs_stat(filename, &stat);
@@ -366,7 +372,8 @@ SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf
366 return error; 372 return error;
367} 373}
368 374
369SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf) 375SYSCALL_DEFINE2(lstat64, const char __user *, filename,
376 struct stat64 __user *, statbuf)
370{ 377{
371 struct kstat stat; 378 struct kstat stat;
372 int error = vfs_lstat(filename, &stat); 379 int error = vfs_lstat(filename, &stat);
@@ -388,7 +395,7 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
388 return error; 395 return error;
389} 396}
390 397
391SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, 398SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
392 struct stat64 __user *, statbuf, int, flag) 399 struct stat64 __user *, statbuf, int, flag)
393{ 400{
394 struct kstat stat; 401 struct kstat stat;
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..30ea8c8a996b
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,243 @@
1#include <linux/syscalls.h>
2#include <linux/module.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/mount.h>
6#include <linux/namei.h>
7#include <linux/statfs.h>
8#include <linux/security.h>
9#include <linux/uaccess.h>
10
11static int flags_by_mnt(int mnt_flags)
12{
13 int flags = 0;
14
15 if (mnt_flags & MNT_READONLY)
16 flags |= ST_RDONLY;
17 if (mnt_flags & MNT_NOSUID)
18 flags |= ST_NOSUID;
19 if (mnt_flags & MNT_NODEV)
20 flags |= ST_NODEV;
21 if (mnt_flags & MNT_NOEXEC)
22 flags |= ST_NOEXEC;
23 if (mnt_flags & MNT_NOATIME)
24 flags |= ST_NOATIME;
25 if (mnt_flags & MNT_NODIRATIME)
26 flags |= ST_NODIRATIME;
27 if (mnt_flags & MNT_RELATIME)
28 flags |= ST_RELATIME;
29 return flags;
30}
31
32static int flags_by_sb(int s_flags)
33{
34 int flags = 0;
35 if (s_flags & MS_SYNCHRONOUS)
36 flags |= ST_SYNCHRONOUS;
37 if (s_flags & MS_MANDLOCK)
38 flags |= ST_MANDLOCK;
39 return flags;
40}
41
42static int calculate_f_flags(struct vfsmount *mnt)
43{
44 return ST_VALID | flags_by_mnt(mnt->mnt_flags) |
45 flags_by_sb(mnt->mnt_sb->s_flags);
46}
47
48int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
49{
50 int retval;
51
52 if (!dentry->d_sb->s_op->statfs)
53 return -ENOSYS;
54
55 memset(buf, 0, sizeof(*buf));
56 retval = security_sb_statfs(dentry);
57 if (retval)
58 return retval;
59 retval = dentry->d_sb->s_op->statfs(dentry, buf);
60 if (retval == 0 && buf->f_frsize == 0)
61 buf->f_frsize = buf->f_bsize;
62 return retval;
63}
64
65int vfs_statfs(struct path *path, struct kstatfs *buf)
66{
67 int error;
68
69 error = statfs_by_dentry(path->dentry, buf);
70 if (!error)
71 buf->f_flags = calculate_f_flags(path->mnt);
72 return error;
73}
74EXPORT_SYMBOL(vfs_statfs);
75
76static int do_statfs_native(struct path *path, struct statfs *buf)
77{
78 struct kstatfs st;
79 int retval;
80
81 retval = vfs_statfs(path, &st);
82 if (retval)
83 return retval;
84
85 if (sizeof(*buf) == sizeof(st))
86 memcpy(buf, &st, sizeof(st));
87 else {
88 if (sizeof buf->f_blocks == 4) {
89 if ((st.f_blocks | st.f_bfree | st.f_bavail |
90 st.f_bsize | st.f_frsize) &
91 0xffffffff00000000ULL)
92 return -EOVERFLOW;
93 /*
94 * f_files and f_ffree may be -1; it's okay to stuff
95 * that into 32 bits
96 */
97 if (st.f_files != -1 &&
98 (st.f_files & 0xffffffff00000000ULL))
99 return -EOVERFLOW;
100 if (st.f_ffree != -1 &&
101 (st.f_ffree & 0xffffffff00000000ULL))
102 return -EOVERFLOW;
103 }
104
105 buf->f_type = st.f_type;
106 buf->f_bsize = st.f_bsize;
107 buf->f_blocks = st.f_blocks;
108 buf->f_bfree = st.f_bfree;
109 buf->f_bavail = st.f_bavail;
110 buf->f_files = st.f_files;
111 buf->f_ffree = st.f_ffree;
112 buf->f_fsid = st.f_fsid;
113 buf->f_namelen = st.f_namelen;
114 buf->f_frsize = st.f_frsize;
115 buf->f_flags = st.f_flags;
116 memset(buf->f_spare, 0, sizeof(buf->f_spare));
117 }
118 return 0;
119}
120
121static int do_statfs64(struct path *path, struct statfs64 *buf)
122{
123 struct kstatfs st;
124 int retval;
125
126 retval = vfs_statfs(path, &st);
127 if (retval)
128 return retval;
129
130 if (sizeof(*buf) == sizeof(st))
131 memcpy(buf, &st, sizeof(st));
132 else {
133 buf->f_type = st.f_type;
134 buf->f_bsize = st.f_bsize;
135 buf->f_blocks = st.f_blocks;
136 buf->f_bfree = st.f_bfree;
137 buf->f_bavail = st.f_bavail;
138 buf->f_files = st.f_files;
139 buf->f_ffree = st.f_ffree;
140 buf->f_fsid = st.f_fsid;
141 buf->f_namelen = st.f_namelen;
142 buf->f_frsize = st.f_frsize;
143 buf->f_flags = st.f_flags;
144 memset(buf->f_spare, 0, sizeof(buf->f_spare));
145 }
146 return 0;
147}
148
149SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
150{
151 struct path path;
152 int error;
153
154 error = user_path(pathname, &path);
155 if (!error) {
156 struct statfs tmp;
157 error = do_statfs_native(&path, &tmp);
158 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
159 error = -EFAULT;
160 path_put(&path);
161 }
162 return error;
163}
164
165SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
166{
167 struct path path;
168 long error;
169
170 if (sz != sizeof(*buf))
171 return -EINVAL;
172 error = user_path(pathname, &path);
173 if (!error) {
174 struct statfs64 tmp;
175 error = do_statfs64(&path, &tmp);
176 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
177 error = -EFAULT;
178 path_put(&path);
179 }
180 return error;
181}
182
183SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
184{
185 struct file *file;
186 struct statfs tmp;
187 int error;
188
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = do_statfs_native(&file->f_path, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error;
199}
200
201SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
202{
203 struct file *file;
204 struct statfs64 tmp;
205 int error;
206
207 if (sz != sizeof(*buf))
208 return -EINVAL;
209
210 error = -EBADF;
211 file = fget(fd);
212 if (!file)
213 goto out;
214 error = do_statfs64(&file->f_path, &tmp);
215 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
216 error = -EFAULT;
217 fput(file);
218out:
219 return error;
220}
221
222SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
223{
224 struct super_block *s;
225 struct ustat tmp;
226 struct kstatfs sbuf;
227 int err;
228
229 s = user_get_super(new_decode_dev(dev));
230 if (!s)
231 return -EINVAL;
232
233 err = statfs_by_dentry(s->s_root, &sbuf);
234 drop_super(s);
235 if (err)
236 return err;
237
238 memset(&tmp,0,sizeof(struct ustat));
239 tmp.f_tfree = sbuf.f_bfree;
240 tmp.f_tinode = sbuf.f_ffree;
241
242 return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
243}
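
Note (not part of the diff): with calculate_f_flags() filling f_flags from the mount and superblock flags, userspace can read mount options straight from the statfs result instead of parsing /proc/mounts; on kernels with this change glibc's statvfs(3) can take its flag bits from here. A small sketch using the ST_* constants from sys/statvfs.h:

#include <stdio.h>
#include <sys/statvfs.h>

int main(int argc, char **argv)
{
	struct statvfs st;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path>\n", argv[0]);
		return 1;
	}
	if (statvfs(argv[1], &st) != 0) {
		perror("statvfs");
		return 1;
	}
	printf("block size %lu, blocks %llu, free %llu\n",
	       st.f_bsize, (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree);
	printf("read-only: %s, nosuid: %s\n",
	       (st.f_flag & ST_RDONLY) ? "yes" : "no",
	       (st.f_flag & ST_NOSUID) ? "yes" : "no");
	return 0;
}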
diff --git a/fs/super.c b/fs/super.c
index 1527e6a0ee35..8819e3a7ff20 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,23 +22,14 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/smp_lock.h>
27#include <linux/acct.h> 25#include <linux/acct.h>
28#include <linux/blkdev.h> 26#include <linux/blkdev.h>
29#include <linux/quotaops.h>
30#include <linux/namei.h>
31#include <linux/mount.h> 27#include <linux/mount.h>
32#include <linux/security.h> 28#include <linux/security.h>
33#include <linux/syscalls.h>
34#include <linux/vfs.h>
35#include <linux/writeback.h> /* for the emergency remount stuff */ 29#include <linux/writeback.h> /* for the emergency remount stuff */
36#include <linux/idr.h> 30#include <linux/idr.h>
37#include <linux/kobject.h>
38#include <linux/mutex.h> 31#include <linux/mutex.h>
39#include <linux/file.h>
40#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
41#include <asm/uaccess.h>
42#include "internal.h" 33#include "internal.h"
43 34
44 35
@@ -63,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
63 s = NULL; 54 s = NULL;
64 goto out; 55 goto out;
65 } 56 }
57#ifdef CONFIG_SMP
58 s->s_files = alloc_percpu(struct list_head);
59 if (!s->s_files) {
60 security_sb_free(s);
61 kfree(s);
62 s = NULL;
63 goto out;
64 } else {
65 int i;
66
67 for_each_possible_cpu(i)
68 INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
69 }
70#else
66 INIT_LIST_HEAD(&s->s_files); 71 INIT_LIST_HEAD(&s->s_files);
72#endif
67 INIT_LIST_HEAD(&s->s_instances); 73 INIT_LIST_HEAD(&s->s_instances);
68 INIT_HLIST_HEAD(&s->s_anon); 74 INIT_HLIST_HEAD(&s->s_anon);
69 INIT_LIST_HEAD(&s->s_inodes); 75 INIT_LIST_HEAD(&s->s_inodes);
@@ -93,16 +99,15 @@ static struct super_block *alloc_super(struct file_system_type *type)
93 * subclass. 99 * subclass.
94 */ 100 */
95 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); 101 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
96 s->s_count = S_BIAS; 102 s->s_count = 1;
97 atomic_set(&s->s_active, 1); 103 atomic_set(&s->s_active, 1);
98 mutex_init(&s->s_vfs_rename_mutex); 104 mutex_init(&s->s_vfs_rename_mutex);
105 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
99 mutex_init(&s->s_dquot.dqio_mutex); 106 mutex_init(&s->s_dquot.dqio_mutex);
100 mutex_init(&s->s_dquot.dqonoff_mutex); 107 mutex_init(&s->s_dquot.dqonoff_mutex);
101 init_rwsem(&s->s_dquot.dqptr_sem); 108 init_rwsem(&s->s_dquot.dqptr_sem);
102 init_waitqueue_head(&s->s_wait_unfrozen); 109 init_waitqueue_head(&s->s_wait_unfrozen);
103 s->s_maxbytes = MAX_NON_LFS; 110 s->s_maxbytes = MAX_NON_LFS;
104 s->dq_op = sb_dquot_ops;
105 s->s_qcop = sb_quotactl_ops;
106 s->s_op = &default_op; 111 s->s_op = &default_op;
107 s->s_time_gran = 1000000000; 112 s->s_time_gran = 1000000000;
108 } 113 }
@@ -118,6 +123,9 @@ out:
118 */ 123 */
119static inline void destroy_super(struct super_block *s) 124static inline void destroy_super(struct super_block *s)
120{ 125{
126#ifdef CONFIG_SMP
127 free_percpu(s->s_files);
128#endif
121 security_sb_free(s); 129 security_sb_free(s);
122 kfree(s->s_subtype); 130 kfree(s->s_subtype);
123 kfree(s->s_options); 131 kfree(s->s_options);
@@ -127,39 +135,14 @@ static inline void destroy_super(struct super_block *s)
127/* Superblock refcounting */ 135/* Superblock refcounting */
128 136
129/* 137/*
130 * Drop a superblock's refcount. Returns non-zero if the superblock was 138 * Drop a superblock's refcount. The caller must hold sb_lock.
131 * destroyed. The caller must hold sb_lock.
132 */ 139 */
133static int __put_super(struct super_block *sb) 140void __put_super(struct super_block *sb)
134{ 141{
135 int ret = 0;
136
137 if (!--sb->s_count) { 142 if (!--sb->s_count) {
143 list_del_init(&sb->s_list);
138 destroy_super(sb); 144 destroy_super(sb);
139 ret = 1;
140 } 145 }
141 return ret;
142}
143
144/*
145 * Drop a superblock's refcount.
146 * Returns non-zero if the superblock is about to be destroyed and
147 * at least is already removed from super_blocks list, so if we are
148 * making a loop through super blocks then we need to restart.
149 * The caller must hold sb_lock.
150 */
151int __put_super_and_need_restart(struct super_block *sb)
152{
153 /* check for race with generic_shutdown_super() */
154 if (list_empty(&sb->s_list)) {
155 /* super block is removed, need to restart... */
156 __put_super(sb);
157 return 1;
158 }
159 /* can't be the last, since s_list is still in use */
160 sb->s_count--;
161 BUG_ON(sb->s_count == 0);
162 return 0;
163} 146}
164 147
165/** 148/**
@@ -178,57 +161,47 @@ void put_super(struct super_block *sb)
178 161
179 162
180/** 163/**
181 * deactivate_super - drop an active reference to superblock 164 * deactivate_locked_super - drop an active reference to superblock
182 * @s: superblock to deactivate 165 * @s: superblock to deactivate
183 * 166 *
184 * Drops an active reference to superblock, acquiring a temporary one if 167 * Drops an active reference to superblock, converting it into a temporary
185 * there are no active references left. In that case we lock superblock, 168 * one if there are no other active references left. In that case we
186 * tell fs driver to shut it down and drop the temporary reference we 169 * tell fs driver to shut it down and drop the temporary reference we
187 * had just acquired. 170 * had just acquired.
171 *
172 * Caller holds exclusive lock on superblock; that lock is released.
188 */ 173 */
189void deactivate_super(struct super_block *s) 174void deactivate_locked_super(struct super_block *s)
190{ 175{
191 struct file_system_type *fs = s->s_type; 176 struct file_system_type *fs = s->s_type;
192 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 177 if (atomic_dec_and_test(&s->s_active)) {
193 s->s_count -= S_BIAS-1;
194 spin_unlock(&sb_lock);
195 vfs_dq_off(s, 0);
196 down_write(&s->s_umount);
197 fs->kill_sb(s); 178 fs->kill_sb(s);
198 put_filesystem(fs); 179 put_filesystem(fs);
199 put_super(s); 180 put_super(s);
181 } else {
182 up_write(&s->s_umount);
200 } 183 }
201} 184}
202 185
203EXPORT_SYMBOL(deactivate_super); 186EXPORT_SYMBOL(deactivate_locked_super);
204 187
205/** 188/**
206 * deactivate_locked_super - drop an active reference to superblock 189 * deactivate_super - drop an active reference to superblock
207 * @s: superblock to deactivate 190 * @s: superblock to deactivate
208 * 191 *
209 * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that 192 * Variant of deactivate_locked_super(), except that superblock is *not*
210 * it does not unlock it until it's all over. As the result, it's safe to 193 * locked by caller. If we are going to drop the final active reference,
211 * use to dispose of new superblock on ->get_sb() failure exits - nobody 194 * lock will be acquired prior to that.
212 * will see the sucker until it's all over. Equivalent using up_write +
213 * deactivate_super is safe for that purpose only if superblock is either
214 * safe to use or has NULL ->s_root when we unlock.
215 */ 195 */
216void deactivate_locked_super(struct super_block *s) 196void deactivate_super(struct super_block *s)
217{ 197{
218 struct file_system_type *fs = s->s_type; 198 if (!atomic_add_unless(&s->s_active, -1, 1)) {
219 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 199 down_write(&s->s_umount);
220 s->s_count -= S_BIAS-1; 200 deactivate_locked_super(s);
221 spin_unlock(&sb_lock);
222 vfs_dq_off(s, 0);
223 fs->kill_sb(s);
224 put_filesystem(fs);
225 put_super(s);
226 } else {
227 up_write(&s->s_umount);
228 } 201 }
229} 202}
230 203
231EXPORT_SYMBOL(deactivate_locked_super); 204EXPORT_SYMBOL(deactivate_super);
232 205
233/** 206/**
234 * grab_super - acquire an active reference 207 * grab_super - acquire an active reference
@@ -243,22 +216,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
243 */ 216 */
244static int grab_super(struct super_block *s) __releases(sb_lock) 217static int grab_super(struct super_block *s) __releases(sb_lock)
245{ 218{
219 if (atomic_inc_not_zero(&s->s_active)) {
220 spin_unlock(&sb_lock);
221 return 1;
222 }
223 /* it's going away */
246 s->s_count++; 224 s->s_count++;
247 spin_unlock(&sb_lock); 225 spin_unlock(&sb_lock);
226 /* wait for it to die */
248 down_write(&s->s_umount); 227 down_write(&s->s_umount);
249 if (s->s_root) {
250 spin_lock(&sb_lock);
251 if (s->s_count > S_BIAS) {
252 atomic_inc(&s->s_active);
253 s->s_count--;
254 spin_unlock(&sb_lock);
255 return 1;
256 }
257 spin_unlock(&sb_lock);
258 }
259 up_write(&s->s_umount); 228 up_write(&s->s_umount);
260 put_super(s); 229 put_super(s);
261 yield();
262 return 0; 230 return 0;
263} 231}
264 232
@@ -321,8 +289,7 @@ void generic_shutdown_super(struct super_block *sb)
321 } 289 }
322 spin_lock(&sb_lock); 290 spin_lock(&sb_lock);
323 /* should be initialized for __put_super_and_need_restart() */ 291 /* should be initialized for __put_super_and_need_restart() */
324 list_del_init(&sb->s_list); 292 list_del_init(&sb->s_instances);
325 list_del(&sb->s_instances);
326 spin_unlock(&sb_lock); 293 spin_unlock(&sb_lock);
327 up_write(&sb->s_umount); 294 up_write(&sb->s_umount);
328} 295}
@@ -356,6 +323,12 @@ retry:
356 if (s) { 323 if (s) {
357 up_write(&s->s_umount); 324 up_write(&s->s_umount);
358 destroy_super(s); 325 destroy_super(s);
326 s = NULL;
327 }
328 down_write(&old->s_umount);
329 if (unlikely(!(old->s_flags & MS_BORN))) {
330 deactivate_locked_super(old);
331 goto retry;
359 } 332 }
360 return old; 333 return old;
361 } 334 }
@@ -408,11 +381,12 @@ EXPORT_SYMBOL(drop_super);
408 */ 381 */
409void sync_supers(void) 382void sync_supers(void)
410{ 383{
411 struct super_block *sb; 384 struct super_block *sb, *p = NULL;
412 385
413 spin_lock(&sb_lock); 386 spin_lock(&sb_lock);
414restart:
415 list_for_each_entry(sb, &super_blocks, s_list) { 387 list_for_each_entry(sb, &super_blocks, s_list) {
388 if (list_empty(&sb->s_instances))
389 continue;
416 if (sb->s_op->write_super && sb->s_dirt) { 390 if (sb->s_op->write_super && sb->s_dirt) {
417 sb->s_count++; 391 sb->s_count++;
418 spin_unlock(&sb_lock); 392 spin_unlock(&sb_lock);
@@ -423,10 +397,47 @@ restart:
423 up_read(&sb->s_umount); 397 up_read(&sb->s_umount);
424 398
425 spin_lock(&sb_lock); 399 spin_lock(&sb_lock);
426 if (__put_super_and_need_restart(sb)) 400 if (p)
427 goto restart; 401 __put_super(p);
402 p = sb;
428 } 403 }
429 } 404 }
405 if (p)
406 __put_super(p);
407 spin_unlock(&sb_lock);
408}
409
410/**
411 * iterate_supers - call function for all active superblocks
412 * @f: function to call
413 * @arg: argument to pass to it
414 *
415 * Scans the superblock list and calls given function, passing it
416 * locked superblock and given argument.
417 */
418void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
419{
420 struct super_block *sb, *p = NULL;
421
422 spin_lock(&sb_lock);
423 list_for_each_entry(sb, &super_blocks, s_list) {
424 if (list_empty(&sb->s_instances))
425 continue;
426 sb->s_count++;
427 spin_unlock(&sb_lock);
428
429 down_read(&sb->s_umount);
430 if (sb->s_root)
431 f(sb, arg);
432 up_read(&sb->s_umount);
433
434 spin_lock(&sb_lock);
435 if (p)
436 __put_super(p);
437 p = sb;
438 }
439 if (p)
440 __put_super(p);
430 spin_unlock(&sb_lock); 441 spin_unlock(&sb_lock);
431} 442}
432 443
@@ -438,7 +449,7 @@ restart:
438 * mounted on the device given. %NULL is returned if no match is found. 449 * mounted on the device given. %NULL is returned if no match is found.
439 */ 450 */
440 451
441struct super_block * get_super(struct block_device *bdev) 452struct super_block *get_super(struct block_device *bdev)
442{ 453{
443 struct super_block *sb; 454 struct super_block *sb;
444 455
@@ -448,17 +459,20 @@ struct super_block * get_super(struct block_device *bdev)
448 spin_lock(&sb_lock); 459 spin_lock(&sb_lock);
449rescan: 460rescan:
450 list_for_each_entry(sb, &super_blocks, s_list) { 461 list_for_each_entry(sb, &super_blocks, s_list) {
462 if (list_empty(&sb->s_instances))
463 continue;
451 if (sb->s_bdev == bdev) { 464 if (sb->s_bdev == bdev) {
452 sb->s_count++; 465 sb->s_count++;
453 spin_unlock(&sb_lock); 466 spin_unlock(&sb_lock);
454 down_read(&sb->s_umount); 467 down_read(&sb->s_umount);
468 /* still alive? */
455 if (sb->s_root) 469 if (sb->s_root)
456 return sb; 470 return sb;
457 up_read(&sb->s_umount); 471 up_read(&sb->s_umount);
458 /* restart only when sb is no longer on the list */ 472 /* nope, got unmounted */
459 spin_lock(&sb_lock); 473 spin_lock(&sb_lock);
460 if (__put_super_and_need_restart(sb)) 474 __put_super(sb);
461 goto rescan; 475 goto rescan;
462 } 476 }
463 } 477 }
464 spin_unlock(&sb_lock); 478 spin_unlock(&sb_lock);
@@ -473,7 +487,7 @@ EXPORT_SYMBOL(get_super);
473 * 487 *
474 * Scans the superblock list and finds the superblock of the file system 488 * Scans the superblock list and finds the superblock of the file system
475 * mounted on the device given. Returns the superblock with an active 489 * mounted on the device given. Returns the superblock with an active
476 * reference and s_umount held exclusively or %NULL if none was found. 490 * reference or %NULL if none was found.
477 */ 491 */
478struct super_block *get_active_super(struct block_device *bdev) 492struct super_block *get_active_super(struct block_device *bdev)
479{ 493{
@@ -482,81 +496,49 @@ struct super_block *get_active_super(struct block_device *bdev)
482 if (!bdev) 496 if (!bdev)
483 return NULL; 497 return NULL;
484 498
499restart:
485 spin_lock(&sb_lock); 500 spin_lock(&sb_lock);
486 list_for_each_entry(sb, &super_blocks, s_list) { 501 list_for_each_entry(sb, &super_blocks, s_list) {
487 if (sb->s_bdev != bdev) 502 if (list_empty(&sb->s_instances))
488 continue; 503 continue;
489 504 if (sb->s_bdev == bdev) {
490 sb->s_count++; 505 if (grab_super(sb)) /* drops sb_lock */
491 spin_unlock(&sb_lock);
492 down_write(&sb->s_umount);
493 if (sb->s_root) {
494 spin_lock(&sb_lock);
495 if (sb->s_count > S_BIAS) {
496 atomic_inc(&sb->s_active);
497 sb->s_count--;
498 spin_unlock(&sb_lock);
499 return sb; 506 return sb;
500 } 507 else
501 spin_unlock(&sb_lock); 508 goto restart;
502 } 509 }
503 up_write(&sb->s_umount);
504 put_super(sb);
505 yield();
506 spin_lock(&sb_lock);
507 } 510 }
508 spin_unlock(&sb_lock); 511 spin_unlock(&sb_lock);
509 return NULL; 512 return NULL;
510} 513}
511 514
512struct super_block * user_get_super(dev_t dev) 515struct super_block *user_get_super(dev_t dev)
513{ 516{
514 struct super_block *sb; 517 struct super_block *sb;
515 518
516 spin_lock(&sb_lock); 519 spin_lock(&sb_lock);
517rescan: 520rescan:
518 list_for_each_entry(sb, &super_blocks, s_list) { 521 list_for_each_entry(sb, &super_blocks, s_list) {
522 if (list_empty(&sb->s_instances))
523 continue;
519 if (sb->s_dev == dev) { 524 if (sb->s_dev == dev) {
520 sb->s_count++; 525 sb->s_count++;
521 spin_unlock(&sb_lock); 526 spin_unlock(&sb_lock);
522 down_read(&sb->s_umount); 527 down_read(&sb->s_umount);
528 /* still alive? */
523 if (sb->s_root) 529 if (sb->s_root)
524 return sb; 530 return sb;
525 up_read(&sb->s_umount); 531 up_read(&sb->s_umount);
526 /* restart only when sb is no longer on the list */ 532 /* nope, got unmounted */
527 spin_lock(&sb_lock); 533 spin_lock(&sb_lock);
528 if (__put_super_and_need_restart(sb)) 534 __put_super(sb);
529 goto rescan; 535 goto rescan;
530 } 536 }
531 } 537 }
532 spin_unlock(&sb_lock); 538 spin_unlock(&sb_lock);
533 return NULL; 539 return NULL;
534} 540}
535 541
536SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
537{
538 struct super_block *s;
539 struct ustat tmp;
540 struct kstatfs sbuf;
541 int err = -EINVAL;
542
543 s = user_get_super(new_decode_dev(dev));
544 if (s == NULL)
545 goto out;
546 err = vfs_statfs(s->s_root, &sbuf);
547 drop_super(s);
548 if (err)
549 goto out;
550
551 memset(&tmp,0,sizeof(struct ustat));
552 tmp.f_tfree = sbuf.f_bfree;
553 tmp.f_tinode = sbuf.f_ffree;
554
555 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
556out:
557 return err;
558}
559
560/** 542/**
561 * do_remount_sb - asks filesystem to change mount options. 543 * do_remount_sb - asks filesystem to change mount options.
562 * @sb: superblock in question 544 * @sb: superblock in question
@@ -569,7 +551,7 @@ out:
569int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 551int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
570{ 552{
571 int retval; 553 int retval;
572 int remount_rw, remount_ro; 554 int remount_ro;
573 555
574 if (sb->s_frozen != SB_UNFROZEN) 556 if (sb->s_frozen != SB_UNFROZEN)
575 return -EBUSY; 557 return -EBUSY;
@@ -585,7 +567,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
585 sync_filesystem(sb); 567 sync_filesystem(sb);
586 568
587 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 569 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
588 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
589 570
590 /* If we are remounting RDONLY and current sb is read/write, 571 /* If we are remounting RDONLY and current sb is read/write,
591 make sure there are no rw files opened */ 572 make sure there are no rw files opened */
@@ -594,9 +575,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
594 mark_files_ro(sb); 575 mark_files_ro(sb);
595 else if (!fs_may_remount_ro(sb)) 576 else if (!fs_may_remount_ro(sb))
596 return -EBUSY; 577 return -EBUSY;
597 retval = vfs_dq_off(sb, 1);
598 if (retval < 0 && retval != -ENOSYS)
599 return -EBUSY;
600 } 578 }
601 579
602 if (sb->s_op->remount_fs) { 580 if (sb->s_op->remount_fs) {
@@ -605,8 +583,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
605 return retval; 583 return retval;
606 } 584 }
607 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 585 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
608 if (remount_rw) 586
609 vfs_dq_quota_on_remount(sb);
610 /* 587 /*
611 * Some filesystems modify their metadata via some other path than the 588 * Some filesystems modify their metadata via some other path than the
612 * bdev buffer cache (eg. use a private mapping, or directories in 589 * bdev buffer cache (eg. use a private mapping, or directories in
@@ -622,25 +599,29 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
622 599
623static void do_emergency_remount(struct work_struct *work) 600static void do_emergency_remount(struct work_struct *work)
624{ 601{
625 struct super_block *sb; 602 struct super_block *sb, *p = NULL;
626 603
627 spin_lock(&sb_lock); 604 spin_lock(&sb_lock);
628 list_for_each_entry(sb, &super_blocks, s_list) { 605 list_for_each_entry(sb, &super_blocks, s_list) {
606 if (list_empty(&sb->s_instances))
607 continue;
629 sb->s_count++; 608 sb->s_count++;
630 spin_unlock(&sb_lock); 609 spin_unlock(&sb_lock);
631 down_write(&sb->s_umount); 610 down_write(&sb->s_umount);
632 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { 611 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
633 /* 612 /*
634 * ->remount_fs needs lock_kernel().
635 *
636 * What lock protects sb->s_flags?? 613 * What lock protects sb->s_flags??
637 */ 614 */
638 do_remount_sb(sb, MS_RDONLY, NULL, 1); 615 do_remount_sb(sb, MS_RDONLY, NULL, 1);
639 } 616 }
640 up_write(&sb->s_umount); 617 up_write(&sb->s_umount);
641 put_super(sb);
642 spin_lock(&sb_lock); 618 spin_lock(&sb_lock);
619 if (p)
620 __put_super(p);
621 p = sb;
643 } 622 }
623 if (p)
624 __put_super(p);
644 spin_unlock(&sb_lock); 625 spin_unlock(&sb_lock);
645 kfree(work); 626 kfree(work);
646 printk("Emergency Remount complete\n"); 627 printk("Emergency Remount complete\n");
@@ -821,7 +802,16 @@ int get_sb_bdev(struct file_system_type *fs_type,
821 goto error_bdev; 802 goto error_bdev;
822 } 803 }
823 804
805 /*
806 * s_umount nests inside bd_mutex during
807 * __invalidate_device(). close_bdev_exclusive()
808 * acquires bd_mutex and can't be called under
809 * s_umount. Drop s_umount temporarily. This is safe
810 * as we're holding an active reference.
811 */
812 up_write(&s->s_umount);
824 close_bdev_exclusive(bdev, mode); 813 close_bdev_exclusive(bdev, mode);
814 down_write(&s->s_umount);
825 } else { 815 } else {
826 char b[BDEVNAME_SIZE]; 816 char b[BDEVNAME_SIZE];
827 817
@@ -957,6 +947,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
957 goto out_free_secdata; 947 goto out_free_secdata;
958 BUG_ON(!mnt->mnt_sb); 948 BUG_ON(!mnt->mnt_sb);
959 WARN_ON(!mnt->mnt_sb->s_bdi); 949 WARN_ON(!mnt->mnt_sb->s_bdi);
950 mnt->mnt_sb->s_flags |= MS_BORN;
960 951
961 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 952 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
962 if (error) 953 if (error)
@@ -990,6 +981,96 @@ out:
990 981
991EXPORT_SYMBOL_GPL(vfs_kern_mount); 982EXPORT_SYMBOL_GPL(vfs_kern_mount);
992 983
984/**
985 * freeze_super - lock the filesystem and force it into a consistent state
986 * @sb: the super to lock
987 *
988 * Syncs the super to make sure the filesystem is consistent and calls the fs's
989 * freeze_fs. Subsequent calls to this without first thawing the fs will return
990 * -EBUSY.
991 */
992int freeze_super(struct super_block *sb)
993{
994 int ret;
995
996 atomic_inc(&sb->s_active);
997 down_write(&sb->s_umount);
998 if (sb->s_frozen) {
999 deactivate_locked_super(sb);
1000 return -EBUSY;
1001 }
1002
1003 if (sb->s_flags & MS_RDONLY) {
1004 sb->s_frozen = SB_FREEZE_TRANS;
1005 smp_wmb();
1006 up_write(&sb->s_umount);
1007 return 0;
1008 }
1009
1010 sb->s_frozen = SB_FREEZE_WRITE;
1011 smp_wmb();
1012
1013 sync_filesystem(sb);
1014
1015 sb->s_frozen = SB_FREEZE_TRANS;
1016 smp_wmb();
1017
1018 sync_blockdev(sb->s_bdev);
1019 if (sb->s_op->freeze_fs) {
1020 ret = sb->s_op->freeze_fs(sb);
1021 if (ret) {
1022 printk(KERN_ERR
1023 "VFS:Filesystem freeze failed\n");
1024 sb->s_frozen = SB_UNFROZEN;
1025 deactivate_locked_super(sb);
1026 return ret;
1027 }
1028 }
1029 up_write(&sb->s_umount);
1030 return 0;
1031}
1032EXPORT_SYMBOL(freeze_super);
1033
1034/**
1035 * thaw_super -- unlock filesystem
1036 * @sb: the super to thaw
1037 *
1038 * Unlocks the filesystem and marks it writeable again after freeze_super().
1039 */
1040int thaw_super(struct super_block *sb)
1041{
1042 int error;
1043
1044 down_write(&sb->s_umount);
1045 if (sb->s_frozen == SB_UNFROZEN) {
1046 up_write(&sb->s_umount);
1047 return -EINVAL;
1048 }
1049
1050 if (sb->s_flags & MS_RDONLY)
1051 goto out;
1052
1053 if (sb->s_op->unfreeze_fs) {
1054 error = sb->s_op->unfreeze_fs(sb);
1055 if (error) {
1056 printk(KERN_ERR
1057 "VFS:Filesystem thaw failed\n");
1058 sb->s_frozen = SB_FREEZE_TRANS;
1059 up_write(&sb->s_umount);
1060 return error;
1061 }
1062 }
1063
1064out:
1065 sb->s_frozen = SB_UNFROZEN;
1066 smp_wmb();
1067 wake_up(&sb->s_wait_unfrozen);
1068 deactivate_locked_super(sb);
1069
1070 return 0;
1071}
1072EXPORT_SYMBOL(thaw_super);
1073
993static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 1074static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
994{ 1075{
995 int err; 1076 int err;
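
Note (not part of the diff): freeze_super() and thaw_super() are reachable from userspace through the FIFREEZE/FITHAW ioctls, and a second FIFREEZE without an intervening FITHAW fails with EBUSY, matching the s_frozen check above. A minimal sketch (needs CAP_SYS_ADMIN and a filesystem that supports freezing):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mount-point>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* a second FIFREEZE before FITHAW would fail with EBUSY */
	if (ioctl(fd, FIFREEZE, 0) != 0)
		perror("FIFREEZE");
	else if (ioctl(fd, FITHAW, 0) != 0)
		perror("FITHAW");
	close(fd);
	return 0;
}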
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7c..ba76b9623e7e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
77} 77}
78EXPORT_SYMBOL_GPL(sync_filesystem); 78EXPORT_SYMBOL_GPL(sync_filesystem);
79 79
80static void sync_one_sb(struct super_block *sb, void *arg)
81{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
83 __sync_filesystem(sb, *(int *)arg);
84}
80/* 85/*
81 * Sync all the data for all the filesystems (called by sys_sync() and 86 * Sync all the data for all the filesystems (called by sys_sync() and
82 * emergency sync) 87 * emergency sync)
83 *
84 * This operation is careful to avoid the livelock which could easily happen
85 * if two or more filesystems are being continuously dirtied. s_need_sync
86 * is used only here. We set it against all filesystems and then clear it as
87 * we sync them. So redirtied filesystems are skipped.
88 *
89 * But if process A is currently running sync_filesystems and then process B
90 * calls sync_filesystems as well, process B will set all the s_need_sync
91 * flags again, which will cause process A to resync everything. Fix that with
92 * a local mutex.
93 */ 88 */
94static void sync_filesystems(int wait) 89static void sync_filesystems(int wait)
95{ 90{
96 struct super_block *sb; 91 iterate_supers(sync_one_sb, &wait);
97 static DEFINE_MUTEX(mutex);
98
99 mutex_lock(&mutex); /* Could be down_interruptible */
100 spin_lock(&sb_lock);
101 list_for_each_entry(sb, &super_blocks, s_list)
102 sb->s_need_sync = 1;
103
104restart:
105 list_for_each_entry(sb, &super_blocks, s_list) {
106 if (!sb->s_need_sync)
107 continue;
108 sb->s_need_sync = 0;
109 sb->s_count++;
110 spin_unlock(&sb_lock);
111
112 down_read(&sb->s_umount);
113 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
114 __sync_filesystem(sb, wait);
115 up_read(&sb->s_umount);
116
117 /* restart only when sb is no longer on the list */
118 spin_lock(&sb_lock);
119 if (__put_super_and_need_restart(sb))
120 goto restart;
121 }
122 spin_unlock(&sb_lock);
123 mutex_unlock(&mutex);
124} 92}
125 93
126/* 94/*
@@ -160,37 +128,9 @@ void emergency_sync(void)
160 } 128 }
161} 129}
162 130
163/*
164 * Generic function to fsync a file.
165 *
166 * filp may be NULL if called via the msync of a vma.
167 */
168int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
169{
170 struct inode * inode = dentry->d_inode;
171 struct super_block * sb;
172 int ret, err;
173
174 /* sync the inode to buffers */
175 ret = write_inode_now(inode, 0);
176
177 /* sync the superblock to buffers */
178 sb = inode->i_sb;
179 if (sb->s_dirt && sb->s_op->write_super)
180 sb->s_op->write_super(sb);
181
182 /* .. finally sync the buffers to disk */
183 err = sync_blockdev(sb->s_bdev);
184 if (!ret)
185 ret = err;
186 return ret;
187}
188EXPORT_SYMBOL(file_fsync);
189
190/** 131/**
191 * vfs_fsync_range - helper to sync a range of data & metadata to disk 132 * vfs_fsync_range - helper to sync a range of data & metadata to disk
192 * @file: file to sync 133 * @file: file to sync
193 * @dentry: dentry of @file
194 * @start: offset in bytes of the beginning of data range to sync 134 * @start: offset in bytes of the beginning of data range to sync
195 * @end: offset in bytes of the end of data range (inclusive) 135 * @end: offset in bytes of the end of data range (inclusive)
196 * @datasync: perform only datasync 136 * @datasync: perform only datasync
@@ -198,32 +138,13 @@ EXPORT_SYMBOL(file_fsync);
198 * Write back data in range @start..@end and metadata for @file to disk. If 138 * Write back data in range @start..@end and metadata for @file to disk. If
199 * @datasync is set only metadata needed to access modified file data is 139 * @datasync is set only metadata needed to access modified file data is
200 * written. 140 * written.
201 *
202 * In case this function is called from nfsd @file may be %NULL and
203 * only @dentry is set. This can only happen when the filesystem
204 * implements the export_operations API.
205 */ 141 */
206int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, 142int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
207 loff_t end, int datasync)
208{ 143{
209 const struct file_operations *fop; 144 struct address_space *mapping = file->f_mapping;
210 struct address_space *mapping;
211 int err, ret; 145 int err, ret;
212 146
213 /* 147 if (!file->f_op || !file->f_op->fsync) {
214 * Get mapping and operations from the file in case we have
215 * as file, or get the default values for them in case we
216 * don't have a struct file available. Damn nfsd..
217 */
218 if (file) {
219 mapping = file->f_mapping;
220 fop = file->f_op;
221 } else {
222 mapping = dentry->d_inode->i_mapping;
223 fop = dentry->d_inode->i_fop;
224 }
225
226 if (!fop || !fop->fsync) {
227 ret = -EINVAL; 148 ret = -EINVAL;
228 goto out; 149 goto out;
229 } 150 }
@@ -235,7 +156,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
235 * livelocks in fsync_buffers_list(). 156 * livelocks in fsync_buffers_list().
236 */ 157 */
237 mutex_lock(&mapping->host->i_mutex); 158 mutex_lock(&mapping->host->i_mutex);
238 err = fop->fsync(file, dentry, datasync); 159 err = file->f_op->fsync(file, datasync);
239 if (!ret) 160 if (!ret)
240 ret = err; 161 ret = err;
241 mutex_unlock(&mapping->host->i_mutex); 162 mutex_unlock(&mapping->host->i_mutex);
@@ -248,19 +169,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
248/** 169/**
249 * vfs_fsync - perform a fsync or fdatasync on a file 170 * vfs_fsync - perform a fsync or fdatasync on a file
250 * @file: file to sync 171 * @file: file to sync
251 * @dentry: dentry of @file
252 * @datasync: only perform a fdatasync operation 172 * @datasync: only perform a fdatasync operation
253 * 173 *
254 * Write back data and metadata for @file to disk. If @datasync is 174 * Write back data and metadata for @file to disk. If @datasync is
255 * set only metadata needed to access modified file data is written. 175 * set only metadata needed to access modified file data is written.
256 *
257 * In case this function is called from nfsd @file may be %NULL and
258 * only @dentry is set. This can only happen when the filesystem
259 * implements the export_operations API.
260 */ 176 */
261int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 177int vfs_fsync(struct file *file, int datasync)
262{ 178{
263 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); 179 return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
264} 180}
265EXPORT_SYMBOL(vfs_fsync); 181EXPORT_SYMBOL(vfs_fsync);
266 182
@@ -271,7 +187,7 @@ static int do_fsync(unsigned int fd, int datasync)
271 187
272 file = fget(fd); 188 file = fget(fd);
273 if (file) { 189 if (file) {
274 ret = vfs_fsync(file, file->f_path.dentry, datasync); 190 ret = vfs_fsync(file, datasync);
275 fput(file); 191 fput(file);
276 } 192 }
277 return ret; 193 return ret;
@@ -299,8 +215,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
299{ 215{
300 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) 216 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
301 return 0; 217 return 0;
302 return vfs_fsync_range(file, file->f_path.dentry, pos, 218 return vfs_fsync_range(file, pos, pos + count - 1,
303 pos + count - 1,
304 (file->f_flags & __O_SYNC) ? 0 : 1); 219 (file->f_flags & __O_SYNC) ? 0 : 1);
305} 220}
306EXPORT_SYMBOL(generic_write_sync); 221EXPORT_SYMBOL(generic_write_sync);
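
The fs/sync.c hunks above finish dropping the dentry argument from vfs_fsync() and vfs_fsync_range(); since nfsd no longer calls in with a NULL file, everything can be derived from the struct file itself. A minimal caller sketch against the new signatures (hypothetical function, not part of this diff — it simply restates what do_fsync() now does):

	#include <linux/fs.h>
	#include <linux/file.h>

	/* Sketch: fsync a descriptor through the post-change API. */
	static int example_sync_fd(unsigned int fd, int datasync)
	{
		struct file *file = fget(fd);
		int ret = -EBADF;

		if (file) {
			/* no dentry parameter any more */
			ret = vfs_fsync(file, datasync);
			fput(file);
		}
		return ret;
	}
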
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e9d293593e52..4e321f7353fa 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,9 +46,9 @@ struct bin_buffer {
46}; 46};
47 47
48static int 48static int
49fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) 49fill_read(struct file *file, char *buffer, loff_t off, size_t count)
50{ 50{
51 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 51 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
54 int rc; 54 int rc;
@@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
59 59
60 rc = -EIO; 60 rc = -EIO;
61 if (attr->read) 61 if (attr->read)
62 rc = attr->read(kobj, attr, buffer, off, count); 62 rc = attr->read(file, kobj, attr, buffer, off, count);
63 63
64 sysfs_put_active(attr_sd); 64 sysfs_put_active(attr_sd);
65 65
@@ -70,8 +70,7 @@ static ssize_t
70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) 70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
71{ 71{
72 struct bin_buffer *bb = file->private_data; 72 struct bin_buffer *bb = file->private_data;
73 struct dentry *dentry = file->f_path.dentry; 73 int size = file->f_path.dentry->d_inode->i_size;
74 int size = dentry->d_inode->i_size;
75 loff_t offs = *off; 74 loff_t offs = *off;
76 int count = min_t(size_t, bytes, PAGE_SIZE); 75 int count = min_t(size_t, bytes, PAGE_SIZE);
77 char *temp; 76 char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
92 91
93 mutex_lock(&bb->mutex); 92 mutex_lock(&bb->mutex);
94 93
95 count = fill_read(dentry, bb->buffer, offs, count); 94 count = fill_read(file, bb->buffer, offs, count);
96 if (count < 0) { 95 if (count < 0) {
97 mutex_unlock(&bb->mutex); 96 mutex_unlock(&bb->mutex);
98 goto out_free; 97 goto out_free;
@@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
117} 116}
118 117
119static int 118static int
120flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) 119flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
121{ 120{
122 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 121 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
123 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 122 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
124 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 123 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
125 int rc; 124 int rc;
@@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
130 129
131 rc = -EIO; 130 rc = -EIO;
132 if (attr->write) 131 if (attr->write)
133 rc = attr->write(kobj, attr, buffer, offset, count); 132 rc = attr->write(file, kobj, attr, buffer, offset, count);
134 133
135 sysfs_put_active(attr_sd); 134 sysfs_put_active(attr_sd);
136 135
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
141 size_t bytes, loff_t *off) 140 size_t bytes, loff_t *off)
142{ 141{
143 struct bin_buffer *bb = file->private_data; 142 struct bin_buffer *bb = file->private_data;
144 struct dentry *dentry = file->f_path.dentry; 143 int size = file->f_path.dentry->d_inode->i_size;
145 int size = dentry->d_inode->i_size;
146 loff_t offs = *off; 144 loff_t offs = *off;
147 int count = min_t(size_t, bytes, PAGE_SIZE); 145 int count = min_t(size_t, bytes, PAGE_SIZE);
148 char *temp; 146 char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
165 163
166 memcpy(bb->buffer, temp, count); 164 memcpy(bb->buffer, temp, count);
167 165
168 count = flush_write(dentry, bb->buffer, offs, count); 166 count = flush_write(file, bb->buffer, offs, count);
169 mutex_unlock(&bb->mutex); 167 mutex_unlock(&bb->mutex);
170 168
171 if (count > 0) 169 if (count > 0)
@@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
363 if (!attr->mmap) 361 if (!attr->mmap)
364 goto out_put; 362 goto out_put;
365 363
366 rc = attr->mmap(kobj, attr, vma); 364 rc = attr->mmap(file, kobj, attr, vma);
367 if (rc) 365 if (rc)
368 goto out_put; 366 goto out_put;
369 367
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
501void sysfs_remove_bin_file(struct kobject *kobj, 499void sysfs_remove_bin_file(struct kobject *kobj,
502 const struct bin_attribute *attr) 500 const struct bin_attribute *attr)
503{ 501{
504 sysfs_hash_and_remove(kobj->sd, attr->attr.name); 502 sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
505} 503}
506 504
507EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 505EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
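
The fs/sysfs/bin.c hunks above thread the opening struct file through to the bin_attribute callbacks, whose prototypes gained a leading struct file * parameter. A provider written against the new prototypes would look roughly like this (all names hypothetical):

	#include <linux/sysfs.h>
	#include <linux/stat.h>
	#include <linux/string.h>

	static ssize_t example_bin_read(struct file *filp, struct kobject *kobj,
					struct bin_attribute *attr,
					char *buf, loff_t off, size_t count)
	{
		/* filp is now available, e.g. for per-open state */
		memset(buf, 0, count);
		return count;
	}

	static struct bin_attribute example_bin_attr = {
		.attr	= { .name = "example", .mode = S_IRUGO },
		.size	= 4096,
		.read	= example_bin_read,
	};

Registration itself is unchanged: sysfs_create_bin_file(kobj, &example_bin_attr).
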
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 590717861c7a..7e54bac8c4b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
380{ 380{
381 struct sysfs_inode_attrs *ps_iattr; 381 struct sysfs_inode_attrs *ps_iattr;
382 382
383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) 383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
384 return -EEXIST; 384 return -EEXIST;
385 385
386 sd->s_parent = sysfs_get(acxt->parent_sd); 386 sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
533 * Pointer to sysfs_dirent if found, NULL if not. 533 * Pointer to sysfs_dirent if found, NULL if not.
534 */ 534 */
535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
536 const void *ns,
536 const unsigned char *name) 537 const unsigned char *name)
537{ 538{
538 struct sysfs_dirent *sd; 539 struct sysfs_dirent *sd;
539 540
540 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) 541 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
542 if (ns && sd->s_ns && (sd->s_ns != ns))
543 continue;
541 if (!strcmp(sd->s_name, name)) 544 if (!strcmp(sd->s_name, name))
542 return sd; 545 return sd;
546 }
543 return NULL; 547 return NULL;
544} 548}
545 549
@@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
558 * Pointer to sysfs_dirent if found, NULL if not. 562 * Pointer to sysfs_dirent if found, NULL if not.
559 */ 563 */
560struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 564struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
565 const void *ns,
561 const unsigned char *name) 566 const unsigned char *name)
562{ 567{
563 struct sysfs_dirent *sd; 568 struct sysfs_dirent *sd;
564 569
565 mutex_lock(&sysfs_mutex); 570 mutex_lock(&sysfs_mutex);
566 sd = sysfs_find_dirent(parent_sd, name); 571 sd = sysfs_find_dirent(parent_sd, ns, name);
567 sysfs_get(sd); 572 sysfs_get(sd);
568 mutex_unlock(&sysfs_mutex); 573 mutex_unlock(&sysfs_mutex);
569 574
@@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
572EXPORT_SYMBOL_GPL(sysfs_get_dirent); 577EXPORT_SYMBOL_GPL(sysfs_get_dirent);
573 578
574static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, 579static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
575 const char *name, struct sysfs_dirent **p_sd) 580 enum kobj_ns_type type, const void *ns, const char *name,
581 struct sysfs_dirent **p_sd)
576{ 582{
577 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 583 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
578 struct sysfs_addrm_cxt acxt; 584 struct sysfs_addrm_cxt acxt;
@@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
583 sd = sysfs_new_dirent(name, mode, SYSFS_DIR); 589 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
584 if (!sd) 590 if (!sd)
585 return -ENOMEM; 591 return -ENOMEM;
592
593 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
594 sd->s_ns = ns;
586 sd->s_dir.kobj = kobj; 595 sd->s_dir.kobj = kobj;
587 596
588 /* link in */ 597 /* link in */
@@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
601int sysfs_create_subdir(struct kobject *kobj, const char *name, 610int sysfs_create_subdir(struct kobject *kobj, const char *name,
602 struct sysfs_dirent **p_sd) 611 struct sysfs_dirent **p_sd)
603{ 612{
604 return create_dir(kobj, kobj->sd, name, p_sd); 613 return create_dir(kobj, kobj->sd,
614 KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
615}
616
617/**
 618 * sysfs_read_ns_type - return associated ns_type
619 * @kobj: the kobject being queried
620 *
621 * Each kobject can be tagged with exactly one namespace type
622 * (i.e. network or user). Return the ns_type associated with
 623 * this object, if any.
624 */
625static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
626{
627 const struct kobj_ns_type_operations *ops;
628 enum kobj_ns_type type;
629
630 ops = kobj_child_ns_ops(kobj);
631 if (!ops)
632 return KOBJ_NS_TYPE_NONE;
633
634 type = ops->type;
635 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
636 BUG_ON(type >= KOBJ_NS_TYPES);
637 BUG_ON(!kobj_ns_type_registered(type));
638
639 return type;
605} 640}
606 641
607/** 642/**
@@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
610 */ 645 */
611int sysfs_create_dir(struct kobject * kobj) 646int sysfs_create_dir(struct kobject * kobj)
612{ 647{
648 enum kobj_ns_type type;
613 struct sysfs_dirent *parent_sd, *sd; 649 struct sysfs_dirent *parent_sd, *sd;
650 const void *ns = NULL;
614 int error = 0; 651 int error = 0;
615 652
616 BUG_ON(!kobj); 653 BUG_ON(!kobj);
@@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
620 else 657 else
621 parent_sd = &sysfs_root; 658 parent_sd = &sysfs_root;
622 659
623 error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); 660 if (sysfs_ns_type(parent_sd))
661 ns = kobj->ktype->namespace(kobj);
662 type = sysfs_read_ns_type(kobj);
663
664 error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
624 if (!error) 665 if (!error)
625 kobj->sd = sd; 666 kobj->sd = sd;
626 return error; 667 return error;
@@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
630 struct nameidata *nd) 671 struct nameidata *nd)
631{ 672{
632 struct dentry *ret = NULL; 673 struct dentry *ret = NULL;
633 struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata; 674 struct dentry *parent = dentry->d_parent;
675 struct sysfs_dirent *parent_sd = parent->d_fsdata;
634 struct sysfs_dirent *sd; 676 struct sysfs_dirent *sd;
635 struct inode *inode; 677 struct inode *inode;
678 enum kobj_ns_type type;
679 const void *ns;
636 680
637 mutex_lock(&sysfs_mutex); 681 mutex_lock(&sysfs_mutex);
638 682
639 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); 683 type = sysfs_ns_type(parent_sd);
684 ns = sysfs_info(dir->i_sb)->ns[type];
685
686 sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
640 687
641 /* no such entry */ 688 /* no such entry */
642 if (!sd) { 689 if (!sd) {
@@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
735} 782}
736 783
737int sysfs_rename(struct sysfs_dirent *sd, 784int sysfs_rename(struct sysfs_dirent *sd,
738 struct sysfs_dirent *new_parent_sd, const char *new_name) 785 struct sysfs_dirent *new_parent_sd, const void *new_ns,
786 const char *new_name)
739{ 787{
740 const char *dup_name = NULL; 788 const char *dup_name = NULL;
741 int error; 789 int error;
@@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
743 mutex_lock(&sysfs_mutex); 791 mutex_lock(&sysfs_mutex);
744 792
745 error = 0; 793 error = 0;
746 if ((sd->s_parent == new_parent_sd) && 794 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
747 (strcmp(sd->s_name, new_name) == 0)) 795 (strcmp(sd->s_name, new_name) == 0))
748 goto out; /* nothing to rename */ 796 goto out; /* nothing to rename */
749 797
750 error = -EEXIST; 798 error = -EEXIST;
751 if (sysfs_find_dirent(new_parent_sd, new_name)) 799 if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
752 goto out; 800 goto out;
753 801
754 /* rename sysfs_dirent */ 802 /* rename sysfs_dirent */
@@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
770 sd->s_parent = new_parent_sd; 818 sd->s_parent = new_parent_sd;
771 sysfs_link_sibling(sd); 819 sysfs_link_sibling(sd);
772 } 820 }
821 sd->s_ns = new_ns;
773 822
774 error = 0; 823 error = 0;
775 out: 824 out:
@@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
780 829
781int sysfs_rename_dir(struct kobject *kobj, const char *new_name) 830int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
782{ 831{
783 return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); 832 struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
833 const void *new_ns = NULL;
834
835 if (sysfs_ns_type(parent_sd))
836 new_ns = kobj->ktype->namespace(kobj);
837
838 return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
784} 839}
785 840
786int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) 841int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
787{ 842{
788 struct sysfs_dirent *sd = kobj->sd; 843 struct sysfs_dirent *sd = kobj->sd;
789 struct sysfs_dirent *new_parent_sd; 844 struct sysfs_dirent *new_parent_sd;
845 const void *new_ns = NULL;
790 846
791 BUG_ON(!sd->s_parent); 847 BUG_ON(!sd->s_parent);
848 if (sysfs_ns_type(sd->s_parent))
849 new_ns = kobj->ktype->namespace(kobj);
792 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 850 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
793 new_parent_kobj->sd : &sysfs_root; 851 new_parent_kobj->sd : &sysfs_root;
794 852
795 return sysfs_rename(sd, new_parent_sd, sd->s_name); 853 return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
796} 854}
797 855
798/* Relationship between s_mode and the DT_xxx types */ 856/* Relationship between s_mode and the DT_xxx types */
@@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
807 return 0; 865 return 0;
808} 866}
809 867
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd, 868static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
811 ino_t ino, struct sysfs_dirent *pos) 869 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
812{ 870{
813 if (pos) { 871 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && 872 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd && 873 pos->s_parent == parent_sd &&
816 ino == pos->s_ino; 874 ino == pos->s_ino;
817 sysfs_put(pos); 875 sysfs_put(pos);
818 if (valid) 876 if (!valid)
819 return pos; 877 pos = NULL;
820 } 878 }
821 pos = NULL; 879 if (!pos && (ino > 1) && (ino < INT_MAX)) {
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children; 880 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino)) 881 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling; 882 pos = pos->s_sibling;
826 } 883 }
884 while (pos && pos->s_ns && pos->s_ns != ns)
885 pos = pos->s_sibling;
827 return pos; 886 return pos;
828} 887}
829 888
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd, 889static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
831 ino_t ino, struct sysfs_dirent *pos) 890 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
832{ 891{
833 pos = sysfs_dir_pos(parent_sd, ino, pos); 892 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
834 if (pos) 893 if (pos)
835 pos = pos->s_sibling; 894 pos = pos->s_sibling;
895 while (pos && pos->s_ns && pos->s_ns != ns)
896 pos = pos->s_sibling;
836 return pos; 897 return pos;
837} 898}
838 899
@@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
841 struct dentry *dentry = filp->f_path.dentry; 902 struct dentry *dentry = filp->f_path.dentry;
842 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 903 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
843 struct sysfs_dirent *pos = filp->private_data; 904 struct sysfs_dirent *pos = filp->private_data;
905 enum kobj_ns_type type;
906 const void *ns;
844 ino_t ino; 907 ino_t ino;
845 908
909 type = sysfs_ns_type(parent_sd);
910 ns = sysfs_info(dentry->d_sb)->ns[type];
911
846 if (filp->f_pos == 0) { 912 if (filp->f_pos == 0) {
847 ino = parent_sd->s_ino; 913 ino = parent_sd->s_ino;
848 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) 914 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
857 filp->f_pos++; 923 filp->f_pos++;
858 } 924 }
859 mutex_lock(&sysfs_mutex); 925 mutex_lock(&sysfs_mutex);
860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); 926 for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
861 pos; 927 pos;
862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { 928 pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
863 const char * name; 929 const char * name;
864 unsigned int type; 930 unsigned int type;
865 int len, ret; 931 int len, ret;
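
Lookup and readdir in fs/sysfs/dir.c are now keyed on a namespace tag pulled from the superblock, and the tag for a new directory comes from the kobject's ktype. A minimal .namespace hook, sketched for a network-namespace-style tag (example_obj_to_ns is a hypothetical helper, not a real kernel API):

	#include <linux/kobject.h>

	static const void *example_namespace(struct kobject *kobj)
	{
		/* return the tag this object belongs to, e.g. a struct net * */
		return example_obj_to_ns(kobj);		/* hypothetical */
	}

	static struct kobj_type example_ktype = {
		.namespace = example_namespace,
	};

As the sysfs_create_dir() hunk shows, the hook is only consulted when the parent directory is itself tagged (sysfs_ns_type(parent_sd) != 0).
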
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e222b2582746..da3fefe91a8f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
340 char *p; 340 char *p;
341 341
342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); 342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
343 if (p) 343 if (!IS_ERR(p))
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
478 mutex_lock(&sysfs_mutex); 478 mutex_lock(&sysfs_mutex);
479 479
480 if (sd && dir) 480 if (sd && dir)
481 sd = sysfs_find_dirent(sd, dir); 481 /* Only directories are tagged, so no need to pass
482 * a tag explicitly.
483 */
484 sd = sysfs_find_dirent(sd, NULL, dir);
482 if (sd && attr) 485 if (sd && attr)
483 sd = sysfs_find_dirent(sd, attr); 486 sd = sysfs_find_dirent(sd, NULL, attr);
484 if (sd) 487 if (sd)
485 sysfs_notify_dirent(sd); 488 sysfs_notify_dirent(sd);
486 489
@@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
569 int error; 572 int error;
570 573
571 if (group) 574 if (group)
572 dir_sd = sysfs_get_dirent(kobj->sd, group); 575 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
573 else 576 else
574 dir_sd = sysfs_get(kobj->sd); 577 dir_sd = sysfs_get(kobj->sd);
575 578
@@ -590,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
590 * @mode: file permissions. 593 * @mode: file permissions.
591 * 594 *
592 */ 595 */
593int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) 596int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
597 mode_t mode)
594{ 598{
595 struct sysfs_dirent *sd; 599 struct sysfs_dirent *sd;
596 struct iattr newattrs; 600 struct iattr newattrs;
@@ -599,7 +603,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
599 mutex_lock(&sysfs_mutex); 603 mutex_lock(&sysfs_mutex);
600 604
601 rc = -ENOENT; 605 rc = -ENOENT;
602 sd = sysfs_find_dirent(kobj->sd, attr->name); 606 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
603 if (!sd) 607 if (!sd)
604 goto out; 608 goto out;
605 609
@@ -624,7 +628,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
624 628
625void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 629void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
626{ 630{
627 sysfs_hash_and_remove(kobj->sd, attr->name); 631 sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
628} 632}
629 633
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 634void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
@@ -646,11 +650,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
646 struct sysfs_dirent *dir_sd; 650 struct sysfs_dirent *dir_sd;
647 651
648 if (group) 652 if (group)
649 dir_sd = sysfs_get_dirent(kobj->sd, group); 653 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
650 else 654 else
651 dir_sd = sysfs_get(kobj->sd); 655 dir_sd = sysfs_get(kobj->sd);
652 if (dir_sd) { 656 if (dir_sd) {
653 sysfs_hash_and_remove(dir_sd, attr->name); 657 sysfs_hash_and_remove(dir_sd, NULL, attr->name);
654 sysfs_put(dir_sd); 658 sysfs_put(dir_sd);
655 } 659 }
656} 660}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
23 int i; 23 int i;
24 24
25 for (i = 0, attr = grp->attrs; *attr; i++, attr++) 25 for (i = 0, attr = grp->attrs; *attr; i++, attr++)
26 sysfs_hash_and_remove(dir_sd, (*attr)->name); 26 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
27} 27}
28 28
29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
39 * visibility. Do this by first removing then 39 * visibility. Do this by first removing then
40 * re-adding (if required) the file */ 40 * re-adding (if required) the file */
41 if (update) 41 if (update)
42 sysfs_hash_and_remove(dir_sd, (*attr)->name); 42 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
43 if (grp->is_visible) { 43 if (grp->is_visible) {
44 mode = grp->is_visible(kobj, *attr, i); 44 mode = grp->is_visible(kobj, *attr, i);
45 if (!mode) 45 if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
132 struct sysfs_dirent *sd; 132 struct sysfs_dirent *sd;
133 133
134 if (grp->name) { 134 if (grp->name) {
135 sd = sysfs_get_dirent(dir_sd, grp->name); 135 sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
136 if (!sd) { 136 if (!sd) {
137 WARN(!sd, KERN_WARNING "sysfs group %p not found for " 137 WARN(!sd, KERN_WARNING "sysfs group %p not found for "
138 "kobject '%s'\n", grp, kobject_name(kobj)); 138 "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index a4a0a9419711..cffb1fd8ba33 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,13 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
117 if (error) 117 if (error)
118 goto out; 118 goto out;
119 119
120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 error = sysfs_sd_setattr(sd, iattr);
121
122 error = inode_setattr(inode, iattr);
123 if (error) 121 if (error)
124 goto out; 122 goto out;
125 123
126 error = sysfs_sd_setattr(sd, iattr); 124 /* this ignores size changes */
125 setattr_copy(inode, iattr);
126
127out: 127out:
128 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
129 return error; 129 return error;
@@ -312,19 +312,19 @@ struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
312 * The sysfs_dirent serves as both an inode and a directory entry for sysfs. 312 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
313 * To prevent the sysfs inode numbers from being freed prematurely we take a 313 * To prevent the sysfs inode numbers from being freed prematurely we take a
314 * reference to sysfs_dirent from the sysfs inode. A 314 * reference to sysfs_dirent from the sysfs inode. A
315 * super_operations.delete_inode() implementation is needed to drop that 315 * super_operations.evict_inode() implementation is needed to drop that
316 * reference upon inode destruction. 316 * reference upon inode destruction.
317 */ 317 */
318void sysfs_delete_inode(struct inode *inode) 318void sysfs_evict_inode(struct inode *inode)
319{ 319{
320 struct sysfs_dirent *sd = inode->i_private; 320 struct sysfs_dirent *sd = inode->i_private;
321 321
322 truncate_inode_pages(&inode->i_data, 0); 322 truncate_inode_pages(&inode->i_data, 0);
323 clear_inode(inode); 323 end_writeback(inode);
324 sysfs_put(sd); 324 sysfs_put(sd);
325} 325}
326 326
327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
328{ 328{
329 struct sysfs_addrm_cxt acxt; 329 struct sysfs_addrm_cxt acxt;
330 struct sysfs_dirent *sd; 330 struct sysfs_dirent *sd;
@@ -334,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
334 334
335 sysfs_addrm_start(&acxt, dir_sd); 335 sysfs_addrm_start(&acxt, dir_sd);
336 336
337 sd = sysfs_find_dirent(dir_sd, name); 337 sd = sysfs_find_dirent(dir_sd, ns, name);
338 if (sd && (sd->s_ns != ns))
339 sd = NULL;
338 if (sd) 340 if (sd)
339 sysfs_remove_one(&acxt, sd); 341 sysfs_remove_one(&acxt, sd);
340 342
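
The conversion from .delete_inode to .evict_inode above follows the common 2.6.36 pattern: truncate the page cache, call end_writeback() where clear_inode() used to be, then drop fs-private references. Schematically (generic sketch, not sysfs-specific):

	#include <linux/fs.h>
	#include <linux/mm.h>

	/* Sketch of the evict_inode contract as used throughout this series. */
	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);	/* drop page cache */
		end_writeback(inode);				/* replaces clear_inode() */
		/* ...release fs-private data hanging off the inode here... */
	}
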
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 776137828dca..f2af22574c50 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -29,13 +29,13 @@ struct kmem_cache *sysfs_dir_cachep;
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
30 .statfs = simple_statfs, 30 .statfs = simple_statfs,
31 .drop_inode = generic_delete_inode, 31 .drop_inode = generic_delete_inode,
32 .delete_inode = sysfs_delete_inode, 32 .evict_inode = sysfs_evict_inode,
33}; 33};
34 34
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
36 .s_name = "", 36 .s_name = "",
37 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
38 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
40 .s_ino = 1, 40 .s_ino = 1,
41}; 41};
@@ -72,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
72 return 0; 72 return 0;
73} 73}
74 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
75static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
76 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
77{ 100{
78 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 /* Remove the superblock from fs_supers/s_instances
142 * so we can't find it, before freeing sysfs_super_info.
143 */
144 kill_anon_super(sb);
145 kfree(info);
79} 146}
80 147
81static struct file_system_type sysfs_fs_type = { 148static struct file_system_type sysfs_fs_type = {
82 .name = "sysfs", 149 .name = "sysfs",
83 .get_sb = sysfs_get_sb, 150 .get_sb = sysfs_get_sb,
84 .kill_sb = kill_anon_super, 151 .kill_sb = sysfs_kill_sb,
85}; 152};
86 153
154void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
155{
156 struct super_block *sb;
157
158 mutex_lock(&sysfs_mutex);
159 spin_lock(&sb_lock);
160 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
161 struct sysfs_super_info *info = sysfs_info(sb);
162 /*
163 * If we see a superblock on the fs_supers/s_instances
 164 * list, the unmount has not completed and sb->s_fs_info
165 * points to a valid struct sysfs_super_info.
166 */
167 /* Ignore superblocks with the wrong ns */
168 if (info->ns[type] != ns)
169 continue;
170 info->ns[type] = NULL;
171 }
172 spin_unlock(&sb_lock);
173 mutex_unlock(&sysfs_mutex);
174}
175
87int __init sysfs_init(void) 176int __init sysfs_init(void)
88{ 177{
89 int err = -ENOMEM; 178 int err = -ENOMEM;
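
sysfs_get_sb() above uses the classic sget() test/set pair so that mounts from different namespace sets get distinct superblocks. The same pattern in isolation, simplified to a single opaque tag where sysfs compares a whole array of them (sketch only):

	#include <linux/fs.h>

	/* Reuse an existing sb only when it carries the same tag. */
	static int tag_test_super(struct super_block *sb, void *data)
	{
		return sb->s_fs_info == data;
	}

	static int tag_set_super(struct super_block *sb, void *data)
	{
		int error = set_anon_super(sb, data);
		if (!error)
			sb->s_fs_info = data;	/* sb now owns the tag */
		return error;
	}

A caller then does sb = sget(fs_type, tag_test_super, tag_set_super, tag) and fills the superblock only when sb->s_root is still NULL, exactly as the hunk does.
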
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index b93ec51fa7ac..a7ac78f8e67a 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
28 struct sysfs_dirent *target_sd = NULL; 28 struct sysfs_dirent *target_sd = NULL;
29 struct sysfs_dirent *sd = NULL; 29 struct sysfs_dirent *sd = NULL;
30 struct sysfs_addrm_cxt acxt; 30 struct sysfs_addrm_cxt acxt;
31 enum kobj_ns_type ns_type;
31 int error; 32 int error;
32 33
33 BUG_ON(!name); 34 BUG_ON(!name);
@@ -58,14 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
58 if (!sd) 59 if (!sd)
59 goto out_put; 60 goto out_put;
60 61
62 ns_type = sysfs_ns_type(parent_sd);
63 if (ns_type)
64 sd->s_ns = target->ktype->namespace(target);
61 sd->s_symlink.target_sd = target_sd; 65 sd->s_symlink.target_sd = target_sd;
62 target_sd = NULL; /* reference is now owned by the symlink */ 66 target_sd = NULL; /* reference is now owned by the symlink */
63 67
64 sysfs_addrm_start(&acxt, parent_sd); 68 sysfs_addrm_start(&acxt, parent_sd);
65 if (warn) 69 /* Symlinks must be between directories with the same ns_type */
66 error = sysfs_add_one(&acxt, sd); 70 if (!ns_type ||
67 else 71 (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
68 error = __sysfs_add_one(&acxt, sd); 72 if (warn)
73 error = sysfs_add_one(&acxt, sd);
74 else
75 error = __sysfs_add_one(&acxt, sd);
76 } else {
77 error = -EINVAL;
78 WARN(1, KERN_WARNING
79 "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
80 parent_sd->s_name,
81 sd->s_name,
82 sd->s_symlink.target_sd->s_parent->s_name,
83 sd->s_symlink.target_sd->s_name);
84 }
69 sysfs_addrm_finish(&acxt); 85 sysfs_addrm_finish(&acxt);
70 86
71 if (error) 87 if (error)
@@ -107,6 +123,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
107} 123}
108 124
109/** 125/**
126 * sysfs_delete_link - remove symlink in object's directory.
127 * @kobj: object we're acting for.
128 * @targ: object we're pointing to.
129 * @name: name of the symlink to remove.
130 *
 131 * Unlike sysfs_remove_link, sysfs_delete_link has enough information
132 * to successfully delete symlinks in tagged directories.
133 */
134void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
135 const char *name)
136{
137 const void *ns = NULL;
138 spin_lock(&sysfs_assoc_lock);
139 if (targ->sd && sysfs_ns_type(kobj->sd))
140 ns = targ->sd->s_ns;
141 spin_unlock(&sysfs_assoc_lock);
142 sysfs_hash_and_remove(kobj->sd, ns, name);
143}
144
145/**
110 * sysfs_remove_link - remove symlink in object's directory. 146 * sysfs_remove_link - remove symlink in object's directory.
111 * @kobj: object we're acting for. 147 * @kobj: object we're acting for.
112 * @name: name of the symlink to remove. 148 * @name: name of the symlink to remove.
@@ -121,7 +157,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
121 else 157 else
122 parent_sd = kobj->sd; 158 parent_sd = kobj->sd;
123 159
124 sysfs_hash_and_remove(parent_sd, name); 160 sysfs_hash_and_remove(parent_sd, NULL, name);
125} 161}
126 162
127/** 163/**
@@ -137,6 +173,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
137 const char *old, const char *new) 173 const char *old, const char *new)
138{ 174{
139 struct sysfs_dirent *parent_sd, *sd = NULL; 175 struct sysfs_dirent *parent_sd, *sd = NULL;
176 const void *old_ns = NULL, *new_ns = NULL;
140 int result; 177 int result;
141 178
142 if (!kobj) 179 if (!kobj)
@@ -144,8 +181,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
144 else 181 else
145 parent_sd = kobj->sd; 182 parent_sd = kobj->sd;
146 183
184 if (targ->sd)
185 old_ns = targ->sd->s_ns;
186
147 result = -ENOENT; 187 result = -ENOENT;
148 sd = sysfs_get_dirent(parent_sd, old); 188 sd = sysfs_get_dirent(parent_sd, old_ns, old);
149 if (!sd) 189 if (!sd)
150 goto out; 190 goto out;
151 191
@@ -155,7 +195,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
155 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 195 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
156 goto out; 196 goto out;
157 197
158 result = sysfs_rename(sd, parent_sd, new); 198 if (sysfs_ns_type(parent_sd))
199 new_ns = targ->ktype->namespace(targ);
200
201 result = sysfs_rename(sd, parent_sd, new_ns, new);
159 202
160out: 203out:
161 sysfs_put(sd); 204 sysfs_put(sd);
@@ -261,3 +304,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
261 304
262EXPORT_SYMBOL_GPL(sysfs_create_link); 305EXPORT_SYMBOL_GPL(sysfs_create_link);
263EXPORT_SYMBOL_GPL(sysfs_remove_link); 306EXPORT_SYMBOL_GPL(sysfs_remove_link);
307EXPORT_SYMBOL_GPL(sysfs_rename_link);
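
sysfs_delete_link() above exists because a plain name lookup in a tagged directory can miss, or hit the wrong, entry once namespaces are in play; reading the tag off the target's sysfs_dirent disambiguates. A caller sketch (names hypothetical):

	/* Sketch: tear down a cross-reference created with sysfs_create_link(). */
	static void example_unlink_peer(struct kobject *kobj, struct kobject *peer)
	{
		/* resolves the entry via peer->sd->s_ns, not just the name */
		sysfs_delete_link(kobj, peer, "peer");
	}
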
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 30f5a44fb5d3..d9be60a2e956 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns; /* namespace tag */
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -81,14 +82,27 @@ struct sysfs_dirent {
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) 83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
83 84
84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85/* identify any namespace tag on sysfs_dirents */
85#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_MASK 0xff00
87#define SYSFS_NS_TYPE_SHIFT 8
88
89#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
90#define SYSFS_FLAG_REMOVED 0x020000
86 91
87static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 92static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
88{ 93{
89 return sd->s_flags & SYSFS_TYPE_MASK; 94 return sd->s_flags & SYSFS_TYPE_MASK;
90} 95}
91 96
97/*
98 * Return any namespace tags on this dirent.
99 * enum kobj_ns_type is defined in linux/kobject.h
100 */
101static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
102{
103 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
104}
105
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 106#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \ 107#define sysfs_dirent_init_lockdep(sd) \
94do { \ 108do { \
@@ -114,6 +128,16 @@ struct sysfs_addrm_cxt {
114/* 128/*
115 * mount.c 129 * mount.c
116 */ 130 */
131
132/*
133 * Each sb is associated with a set of namespace tags (i.e.
134 * the network namespace of the task which mounted this sysfs
135 * instance).
136 */
137struct sysfs_super_info {
138 const void *ns[KOBJ_NS_TYPES];
139};
140#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
117extern struct sysfs_dirent sysfs_root; 141extern struct sysfs_dirent sysfs_root;
118extern struct kmem_cache *sysfs_dir_cachep; 142extern struct kmem_cache *sysfs_dir_cachep;
119 143
@@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
137void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 161void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
138 162
139struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 163struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
164 const void *ns,
140 const unsigned char *name); 165 const unsigned char *name);
141struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 166struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
167 const void *ns,
142 const unsigned char *name); 168 const unsigned char *name);
143struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 169struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
144 170
@@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
149void sysfs_remove_subdir(struct sysfs_dirent *sd); 175void sysfs_remove_subdir(struct sysfs_dirent *sd);
150 176
151int sysfs_rename(struct sysfs_dirent *sd, 177int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name); 178 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
153 179
154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 180static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
155{ 181{
@@ -172,14 +198,14 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
172 * inode.c 198 * inode.c
173 */ 199 */
174struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); 200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
175void sysfs_delete_inode(struct inode *inode); 201void sysfs_evict_inode(struct inode *inode);
176int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
177int sysfs_permission(struct inode *inode, int mask); 203int sysfs_permission(struct inode *inode, int mask);
178int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
181 size_t size, int flags); 207 size_t size, int flags);
182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 208int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
183int sysfs_inode_init(void); 209int sysfs_inode_init(void);
184 210
185/* 211/*
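
The sysfs.h changes pack the namespace type into bits 8-15 of s_flags, which is why SYSFS_FLAG_REMOVED had to move from 0x0200 up to 0x020000. The round trip is plain bit arithmetic; a standalone sketch with the constants copied from the hunk (userspace C, just to show the math — 0x0001 is assumed here to be SYSFS_DIR):

	#include <stdio.h>

	#define SYSFS_NS_TYPE_MASK	0xff00
	#define SYSFS_NS_TYPE_SHIFT	8
	#define SYSFS_FLAG_REMOVED	0x020000

	int main(void)
	{
		unsigned int type = 2;	/* some enum kobj_ns_type value */
		unsigned int s_flags = 0x0001			/* SYSFS_DIR */
				     | (type << SYSFS_NS_TYPE_SHIFT)
				     | SYSFS_FLAG_REMOVED;

		/* recover the type exactly as sysfs_ns_type() does */
		printf("%u\n",
		       (s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT);
		return 0;				/* prints 2 */
	}
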
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 1dabed286b4c..a77c42157620 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = simple_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
@@ -218,8 +218,7 @@ got_it:
218 pos = page_offset(page) + 218 pos = page_offset(page) +
219 (char*)de - (char*)page_address(page); 219 (char*)de - (char*)page_address(page);
220 lock_page(page); 220 lock_page(page);
221 err = __sysv_write_begin(NULL, page->mapping, pos, SYSV_DIRSIZE, 221 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
222 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
223 if (err) 222 if (err)
224 goto out_unlock; 223 goto out_unlock;
225 memcpy (de->name, name, namelen); 224 memcpy (de->name, name, namelen);
@@ -239,15 +238,13 @@ out_unlock:
239 238
240int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) 239int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
241{ 240{
242 struct address_space *mapping = page->mapping; 241 struct inode *inode = page->mapping->host;
243 struct inode *inode = (struct inode*)mapping->host;
244 char *kaddr = (char*)page_address(page); 242 char *kaddr = (char*)page_address(page);
245 loff_t pos = page_offset(page) + (char *)de - kaddr; 243 loff_t pos = page_offset(page) + (char *)de - kaddr;
246 int err; 244 int err;
247 245
248 lock_page(page); 246 lock_page(page);
249 err = __sysv_write_begin(NULL, mapping, pos, SYSV_DIRSIZE, 247 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
250 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
251 BUG_ON(err); 248 BUG_ON(err);
252 de->inode = 0; 249 de->inode = 0;
253 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); 250 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
@@ -259,16 +256,14 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
259 256
260int sysv_make_empty(struct inode *inode, struct inode *dir) 257int sysv_make_empty(struct inode *inode, struct inode *dir)
261{ 258{
262 struct address_space *mapping = inode->i_mapping; 259 struct page *page = grab_cache_page(inode->i_mapping, 0);
263 struct page *page = grab_cache_page(mapping, 0);
264 struct sysv_dir_entry * de; 260 struct sysv_dir_entry * de;
265 char *base; 261 char *base;
266 int err; 262 int err;
267 263
268 if (!page) 264 if (!page)
269 return -ENOMEM; 265 return -ENOMEM;
270 err = __sysv_write_begin(NULL, mapping, 0, 2 * SYSV_DIRSIZE, 266 err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE);
271 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
272 if (err) { 267 if (err) {
273 unlock_page(page); 268 unlock_page(page);
274 goto fail; 269 goto fail;
@@ -341,15 +336,13 @@ not_empty:
341void sysv_set_link(struct sysv_dir_entry *de, struct page *page, 336void sysv_set_link(struct sysv_dir_entry *de, struct page *page,
342 struct inode *inode) 337 struct inode *inode)
343{ 338{
344 struct address_space *mapping = page->mapping; 339 struct inode *dir = page->mapping->host;
345 struct inode *dir = mapping->host;
346 loff_t pos = page_offset(page) + 340 loff_t pos = page_offset(page) +
347 (char *)de-(char*)page_address(page); 341 (char *)de-(char*)page_address(page);
348 int err; 342 int err;
349 343
350 lock_page(page); 344 lock_page(page);
351 err = __sysv_write_begin(NULL, mapping, pos, SYSV_DIRSIZE, 345 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
352 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
353 BUG_ON(err); 346 BUG_ON(err);
354 de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); 347 de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
355 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); 348 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..0a65939508e9 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,11 +26,33 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
33static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
34{
35 struct inode *inode = dentry->d_inode;
36 int error;
37
38 error = inode_change_ok(inode, attr);
39 if (error)
40 return error;
41
42 if ((attr->ia_valid & ATTR_SIZE) &&
43 attr->ia_size != i_size_read(inode)) {
44 error = vmtruncate(inode, attr->ia_size);
45 if (error)
46 return error;
47 }
48
49 setattr_copy(inode, attr);
50 mark_inode_dirty(inode);
51 return 0;
52}
53
33const struct inode_operations sysv_file_inode_operations = { 54const struct inode_operations sysv_file_inode_operations = {
34 .truncate = sysv_truncate, 55 .truncate = sysv_truncate,
56 .setattr = sysv_setattr,
35 .getattr = sysv_getattr, 57 .getattr = sysv_getattr,
36}; 58};
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..0c96c98bd1db 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -25,6 +25,7 @@
25#include <linux/stat.h> 25#include <linux/stat.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/writeback.h>
28#include "sysv.h" 29#include "sysv.h"
29 30
30/* We don't trust the value of 31/* We don't trust the value of
@@ -112,7 +113,6 @@ void sysv_free_inode(struct inode * inode)
112 return; 113 return;
113 } 114 }
114 raw_inode = sysv_raw_inode(sb, ino, &bh); 115 raw_inode = sysv_raw_inode(sb, ino, &bh);
115 clear_inode(inode);
116 if (!raw_inode) { 116 if (!raw_inode) {
117 printk("sysv_free_inode: unable to read inode block on device " 117 printk("sysv_free_inode: unable to read inode block on device "
118 "%s\n", inode->i_sb->s_id); 118 "%s\n", inode->i_sb->s_id);
@@ -139,6 +139,9 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
139 struct inode *inode; 139 struct inode *inode;
140 sysv_ino_t ino; 140 sysv_ino_t ino;
141 unsigned count; 141 unsigned count;
142 struct writeback_control wbc = {
143 .sync_mode = WB_SYNC_NONE
144 };
142 145
143 inode = new_inode(sb); 146 inode = new_inode(sb);
144 if (!inode) 147 if (!inode)
@@ -159,15 +162,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); 162 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); 163 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
161 dirty_sb(sb); 164 dirty_sb(sb);
162 165 inode_init_owner(inode, dir, mode);
163 if (dir->i_mode & S_ISGID) {
164 inode->i_gid = dir->i_gid;
165 if (S_ISDIR(mode))
166 mode |= S_ISGID;
167 } else
168 inode->i_gid = current_fsgid();
169
170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 166 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 167 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 168 inode->i_blocks = 0;
@@ -176,8 +171,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
176 insert_inode_hash(inode); 171 insert_inode_hash(inode);
177 mark_inode_dirty(inode); 172 mark_inode_dirty(inode);
178 173
179 inode->i_mode = mode; /* for sysv_write_inode() */ 174 sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
180 sysv_write_inode(inode, 0); /* ensure inode not allocated again */
181 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 175 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
182 /* That's it. */ 176 /* That's it. */
183 unlock_super(sb); 177 unlock_super(sb);
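
inode_init_owner() is the 2.6.35 helper that replaces the open-coded uid/gid/setgid-directory logic removed above; its behavior is roughly the following (a sketch of the real helper in fs/inode.c, not a new API):

	#include <linux/fs.h>
	#include <linux/cred.h>

	/* Rough equivalent of inode_init_owner(). */
	static void example_init_owner(struct inode *inode,
				       const struct inode *dir, mode_t mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && (dir->i_mode & S_ISGID)) {
			inode->i_gid = dir->i_gid;
			if (S_ISDIR(mode))
				mode |= S_ISGID;	/* directories inherit setgid */
		} else {
			inode->i_gid = current_fsgid();
		}
		inode->i_mode = mode;
	}
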
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..de44d067b9e6 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
43 * then attach current time stamp. 43 * then attach current time stamp.
44 * But if the filesystem was marked clean, keep it clean. 44 * But if the filesystem was marked clean, keep it clean.
45 */ 45 */
46 sb->s_dirt = 0;
46 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); 47 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
47 if (sbi->s_type == FSTYPE_SYSV4) { 48 if (sbi->s_type == FSTYPE_SYSV4) {
48 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) 49 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
@@ -70,8 +71,8 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
70 lock_super(sb); 71 lock_super(sb);
71 if (sbi->s_forced_ro) 72 if (sbi->s_forced_ro)
72 *flags |= MS_RDONLY; 73 *flags |= MS_RDONLY;
73 if (!(*flags & MS_RDONLY)) 74 if (*flags & MS_RDONLY)
74 sb->s_dirt = 1; 75 sysv_write_super(sb);
75 unlock_super(sb); 76 unlock_super(sb);
76 return 0; 77 return 0;
77} 78}
@@ -307,12 +308,17 @@ int sysv_sync_inode(struct inode *inode)
307 return __sysv_write_inode(inode, 1); 308 return __sysv_write_inode(inode, 1);
308} 309}
309 310
310static void sysv_delete_inode(struct inode *inode) 311static void sysv_evict_inode(struct inode *inode)
311{ 312{
312 truncate_inode_pages(&inode->i_data, 0); 313 truncate_inode_pages(&inode->i_data, 0);
313 inode->i_size = 0; 314 if (!inode->i_nlink) {
314 sysv_truncate(inode); 315 inode->i_size = 0;
315 sysv_free_inode(inode); 316 sysv_truncate(inode);
317 }
318 invalidate_inode_buffers(inode);
319 end_writeback(inode);
320 if (!inode->i_nlink)
321 sysv_free_inode(inode);
316} 322}
317 323
318static struct kmem_cache *sysv_inode_cachep; 324static struct kmem_cache *sysv_inode_cachep;
@@ -343,7 +349,7 @@ const struct super_operations sysv_sops = {
343 .alloc_inode = sysv_alloc_inode, 349 .alloc_inode = sysv_alloc_inode,
344 .destroy_inode = sysv_destroy_inode, 350 .destroy_inode = sysv_destroy_inode,
345 .write_inode = sysv_write_inode, 351 .write_inode = sysv_write_inode,
346 .delete_inode = sysv_delete_inode, 352 .evict_inode = sysv_evict_inode,
347 .put_super = sysv_put_super, 353 .put_super = sysv_put_super,
348 .write_super = sysv_write_super, 354 .write_super = sysv_write_super,
349 .sync_fs = sysv_sync_fs, 355 .sync_fs = sysv_sync_fs,
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index f042eec464c2..9ca66276315e 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -459,20 +459,25 @@ static int sysv_readpage(struct file *file, struct page *page)
459 return block_read_full_page(page,get_block); 459 return block_read_full_page(page,get_block);
460} 460}
461 461
462int __sysv_write_begin(struct file *file, struct address_space *mapping, 462int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
463 loff_t pos, unsigned len, unsigned flags,
464 struct page **pagep, void **fsdata)
465{ 463{
466 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 464 return __block_write_begin(page, pos, len, get_block);
467 get_block);
468} 465}
469 466
470static int sysv_write_begin(struct file *file, struct address_space *mapping, 467static int sysv_write_begin(struct file *file, struct address_space *mapping,
471 loff_t pos, unsigned len, unsigned flags, 468 loff_t pos, unsigned len, unsigned flags,
472 struct page **pagep, void **fsdata) 469 struct page **pagep, void **fsdata)
473{ 470{
474 *pagep = NULL; 471 int ret;
475 return __sysv_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 472
473 ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
474 if (unlikely(ret)) {
475 loff_t isize = mapping->host->i_size;
476 if (pos + len > isize)
477 vmtruncate(mapping->host, isize);
478 }
479
480 return ret;
476} 481}
477 482
478static sector_t sysv_bmap(struct address_space *mapping, sector_t block) 483static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
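
The write_begin conversion above adopts the new 2.6.36 convention: block_write_begin() no longer trims blocks instantiated past EOF when it fails, so the filesystem must do that itself. The pattern the diff uses, in isolation (generic sketch with a caller-supplied get_block):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	/* Sketch: post-2.6.36 write_begin error handling. */
	static int example_write_begin(struct file *file,
				       struct address_space *mapping,
				       loff_t pos, unsigned len, unsigned flags,
				       struct page **pagep, void **fsdata,
				       get_block_t *get_block)
	{
		int ret = block_write_begin(mapping, pos, len, flags,
					    pagep, get_block);
		if (unlikely(ret) && pos + len > mapping->host->i_size)
			vmtruncate(mapping->host, mapping->host->i_size);
		return ret;
	}
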
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 5a903da54551..a0b0cda6927e 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -347,7 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
347 sb->s_flags |= MS_RDONLY; 347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate) 348 if (sbi->s_truncate)
349 sb->s_root->d_op = &sysv_dentry_operations; 349 sb->s_root->d_op = &sysv_dentry_operations;
350 sb->s_dirt = 1;
351 return 1; 350 return 1;
352} 351}
353 352
@@ -435,12 +434,46 @@ Ebadsize:
435 goto failed; 434 goto failed;
436} 435}
437 436
438static int v7_fill_super(struct super_block *sb, void *data, int silent) 437static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
439{ 438{
440 struct sysv_sb_info *sbi;
441 struct buffer_head *bh, *bh2 = NULL;
442 struct v7_super_block *v7sb; 439 struct v7_super_block *v7sb;
443 struct sysv_inode *v7i; 440 struct sysv_inode *v7i;
441 struct buffer_head *bh2;
442 struct sysv_sb_info *sbi;
443
444 sbi = sb->s_fs_info;
445
446 /* plausibility check on superblock */
447 v7sb = (struct v7_super_block *) bh->b_data;
448 if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
449 fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
450 fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
451 return 0;
452
453 /* plausibility check on root inode: it is a directory,
454 with a nonzero size that is a multiple of 16 */
455 bh2 = sb_bread(sb, 2);
456 if (bh2 == NULL)
457 return 0;
458
459 v7i = (struct sysv_inode *)(bh2->b_data + 64);
460 if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
461 (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
462 (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
463 (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
464 sizeof(struct sysv_dir_entry))) {
465 brelse(bh2);
466 return 0;
467 }
468
469 brelse(bh2);
470 return 1;
471}
472
473static int v7_fill_super(struct super_block *sb, void *data, int silent)
474{
475 struct sysv_sb_info *sbi;
476 struct buffer_head *bh;
444 477
445 if (440 != sizeof (struct v7_super_block)) 478 if (440 != sizeof (struct v7_super_block))
446 panic("V7 FS: bad super-block size"); 479 panic("V7 FS: bad super-block size");
@@ -454,7 +487,6 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
454 sbi->s_sb = sb; 487 sbi->s_sb = sb;
455 sbi->s_block_base = 0; 488 sbi->s_block_base = 0;
456 sbi->s_type = FSTYPE_V7; 489 sbi->s_type = FSTYPE_V7;
457 sbi->s_bytesex = BYTESEX_PDP;
458 sb->s_fs_info = sbi; 490 sb->s_fs_info = sbi;
459 491
460 sb_set_blocksize(sb, 512); 492 sb_set_blocksize(sb, 512);
@@ -466,32 +498,27 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
466 goto failed; 498 goto failed;
467 } 499 }
468 500
469 /* plausibility check on superblock */ 501 /* Try PDP-11 UNIX */
470 v7sb = (struct v7_super_block *) bh->b_data; 502 sbi->s_bytesex = BYTESEX_PDP;
471 if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || 503 if (v7_sanity_check(sb, bh))
472 fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || 504 goto detected;
473 fs32_to_cpu(sbi, v7sb->s_time) == 0)
474 goto failed;
475 505
476 /* plausibility check on root inode: it is a directory, 506 /* Try PC/IX, v7/x86 */
477 with a nonzero size that is a multiple of 16 */ 507 sbi->s_bytesex = BYTESEX_LE;
478 if ((bh2 = sb_bread(sb, 2)) == NULL) 508 if (v7_sanity_check(sb, bh))
479 goto failed; 509 goto detected;
480 v7i = (struct sysv_inode *)(bh2->b_data + 64);
481 if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
482 (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
483 (fs32_to_cpu(sbi, v7i->i_size) & 017) != 0)
484 goto failed;
485 brelse(bh2);
486 bh2 = NULL;
487 510
511 goto failed;
512
513detected:
488 sbi->s_bh1 = bh; 514 sbi->s_bh1 = bh;
489 sbi->s_bh2 = bh; 515 sbi->s_bh2 = bh;
490 if (complete_read_super(sb, silent, 1)) 516 if (complete_read_super(sb, silent, 1))
491 return 0; 517 return 0;
492 518
493failed: 519failed:
494 brelse(bh2); 520 printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
521 sb->s_id);
495 brelse(bh); 522 brelse(bh);
496 kfree(sbi); 523 kfree(sbi);
497 return -EINVAL; 524 return -EINVAL;
@@ -560,4 +587,5 @@ static void __exit exit_sysv_fs(void)
560 587
561module_init(init_sysv_fs) 588module_init(init_sysv_fs)
562module_exit(exit_sysv_fs) 589module_exit(exit_sysv_fs)
590MODULE_ALIAS("v7");
563MODULE_LICENSE("GPL"); 591MODULE_LICENSE("GPL");
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 94cb9b4d76c2..bb55cdb394bf 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -136,9 +136,7 @@ extern unsigned long sysv_count_free_blocks(struct super_block *);
 
 /* itree.c */
 extern void sysv_truncate(struct inode *);
-extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata);
+extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 /* inode.c */
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 98158de91d24..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -110,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
 	struct timerfd_ctx *ctx = file->private_data;
 	ssize_t res;
 	u64 ticks = 0;
-	DECLARE_WAITQUEUE(wait, current);
 
 	if (count < sizeof(ticks))
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
-	res = -EAGAIN;
-	if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) {
-		__add_wait_queue(&ctx->wqh, &wait);
-		for (res = 0;;) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (ctx->ticks) {
-				res = 0;
-				break;
-			}
-			if (signal_pending(current)) {
-				res = -ERESTARTSYS;
-				break;
-			}
-			spin_unlock_irq(&ctx->wqh.lock);
-			schedule();
-			spin_lock_irq(&ctx->wqh.lock);
-		}
-		__remove_wait_queue(&ctx->wqh, &wait);
-		__set_current_state(TASK_RUNNING);
-	}
+	if (file->f_flags & O_NONBLOCK)
+		res = -EAGAIN;
+	else
+		res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
 	if (ctx->ticks) {
 		ticks = ctx->ticks;
 		if (ctx->expired && ctx->tintv.tv64) {
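The timerfd hunk swaps a hand-rolled wait loop for wait_event_interruptible_locked_irq(), which is entered with the waitqueue spinlock held (irqs off), drops the lock while sleeping, and returns with it re-taken; -ERESTARTSYS reports interruption by a signal. A condensed sketch of the calling convention, using the names from the hunk:

    spin_lock_irq(&ctx->wqh.lock);
    if (file->f_flags & O_NONBLOCK)
        res = -EAGAIN;
    else
        /* sleeps with ctx->wqh.lock released; re-tests ctx->ticks on wakeup */
        res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
    /* ctx->ticks can now be consumed under the same lock */
    spin_unlock_irq(&ctx->wqh.lock);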
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 076ca50e9933..c8ff0d1ae5d3 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -62,7 +62,9 @@
  */
 static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
+	down_read(&c->vfs_sb->s_umount);
 	writeback_inodes_sb(c->vfs_sb);
+	up_read(&c->vfs_sb->s_umount);
 }
 
 /**
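As the budget.c hunk implies, writeback_inodes_sb() now expects its caller to hold the superblock's s_umount rwsem for read, so UBIFS brackets the call explicitly:

    down_read(&sb->s_umount);    /* sb is the struct super_block in question */
    writeback_inodes_sb(sb);
    up_read(&sb->s_umount);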
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a1..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 	 */
 	inode->i_flags |= (S_NOCMTIME);
 
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 			 ubifs_current_time(inode);
 	inode->i_mapping->nrpages = 0;
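inode_init_owner() centralizes exactly the ownership boilerplate deleted above. Judging from the removed lines, its effect is roughly the following (a sketch of the semantics, not the kernel's literal implementation):

    /* approximate behaviour of inode_init_owner(inode, dir, mode) */
    inode->i_uid = current_fsuid();
    if (dir && (dir->i_mode & S_ISGID)) {
        inode->i_gid = dir->i_gid;    /* inherit group from setgid parent */
        if (S_ISDIR(mode))
            mode |= S_ISGID;          /* setgid propagates to subdirectories */
    } else {
        inode->i_gid = current_fsgid();
    }
    inode->i_mode = mode;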
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf72b807..03ae894c45de 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,12 +967,16 @@ static int do_writepage(struct page *page, int len)
  * the page locked, and it locks @ui_mutex. However, write-back does take inode
  * @i_mutex, which means other VFS operations may be run on this inode at the
  * same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'vmtruncate()', which first changes @inode->i_size, then
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
  * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
+ * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
  * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
  * means that @inode->i_size is changed while @ui_mutex is unlocked.
  *
+ * XXX(truncate): with the new truncate sequence this is not true anymore,
+ * and the calls to truncate_setsize can be move around freely. They should
+ * be moved to the very end of the truncate sequence.
+ *
  * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
  * inode size. How do we do this if @inode->i_size may became smaller while we
  * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1125,9 +1129,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 		budgeted = 0;
 	}
 
-	err = vmtruncate(inode, new_size);
-	if (err)
-		goto out_budg;
+	truncate_setsize(inode, new_size);
 
 	if (offset) {
 		pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
@@ -1214,16 +1216,14 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 
 	if (attr->ia_valid & ATTR_SIZE) {
 		dbg_gen("size %lld -> %lld", inode->i_size, new_size);
-		err = vmtruncate(inode, new_size);
-		if (err)
-			goto out;
+		truncate_setsize(inode, new_size);
 	}
 
 	mutex_lock(&ui->ui_mutex);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* Truncation changes inode [mc]time */
 		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
-		/* 'vmtruncate()' changed @i_size, update @ui_size */
+		/* 'truncate_setsize()' changed @i_size, update @ui_size */
 		ui->ui_size = inode->i_size;
 	}
 
@@ -1245,10 +1245,6 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	if (IS_SYNC(inode))
 		err = inode->i_sb->s_op->write_inode(inode, NULL);
 	return err;
-
-out:
-	ubifs_release_budget(c, &req);
-	return err;
 }
 
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -1304,9 +1300,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return NULL;
 }
 
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ubifs_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	int err;
 
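Two interface changes run through this file: the fallible vmtruncate() becomes truncate_setsize(), which returns void (it updates i_size and then drops the now-stale page-cache pages), and ->fsync loses its dentry parameter, recovering the inode from the file's mapping. Sketch of the new shapes (example_fsync is a made-up name):

    truncate_setsize(inode, new_size);    /* no error to check any more */

    int example_fsync(struct file *file, int datasync)
    {
        struct inode *inode = file->f_mapping->host;
        /* ... write the inode out as before ... */
        return 0;
    }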
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 77d5cf4a7547..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -64,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
 	if (!c->ro_media) {
 		c->ro_media = 1;
 		c->no_chk_data_crc = 0;
+		c->vfs_sb->s_flags |= MS_RDONLY;
 		ubifs_warn("switched to read-only mode, error %d", err);
 		dbg_dump_stack();
 	}
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ad7f67b827ea..0084a33c4c69 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1457,13 +1457,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = ubifs_get_pnode(c, nnode, iip);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	iip = (i & (UBIFS_LPT_FANOUT - 1));
 	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
 	       pnode->lprops[iip].free, pnode->lprops[iip].dirty,
@@ -1586,7 +1586,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
 	nnode = c->nroot;
 	nnode = dirty_cow_nnode(c, nnode);
 	if (IS_ERR(nnode))
-		return ERR_PTR(PTR_ERR(nnode));
+		return ERR_CAST(nnode);
 	i = lnum - c->main_first;
 	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
 	for (h = 1; h < c->lpt_hght; h++) {
@@ -1594,19 +1594,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 		nnode = dirty_cow_nnode(c, nnode);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = ubifs_get_pnode(c, nnode, iip);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	pnode = dirty_cow_pnode(c, pnode);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	iip = (i & (UBIFS_LPT_FANOUT - 1));
 	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
 	       pnode->lprops[iip].free, pnode->lprops[iip].dirty,
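ERR_PTR(PTR_ERR(p)) decodes an error pointer to a long only to re-encode it; ERR_CAST() states the intent directly. Paraphrasing its definition in linux/err.h:

    /* reinterpret an ERR_PTR of one pointee type as another */
    static inline void *ERR_CAST(const void *ptr)
    {
        return (void *) ptr;
    }

    /* so:  return ERR_PTR(PTR_ERR(nnode));  becomes  return ERR_CAST(nnode); */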
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 13cb7a4237bf..d12535b7fc78 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -646,7 +646,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	return ubifs_get_pnode(c, nnode, iip);
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 109c6ea03bb5..daae9e1f5382 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -24,7 +24,7 @@
  * This file implements functions needed to recover from unclean un-mounts.
  * When UBIFS is mounted, it checks a flag on the master node to determine if
  * an un-mount was completed successfully. If not, the process of mounting
- * incorparates additional checking and fixing of on-flash data structures.
+ * incorporates additional checking and fixing of on-flash data structures.
  * UBIFS always cleans away all remnants of an unclean un-mount, so that
  * errors do not accumulate. However UBIFS defers recovery if it is mounted
  * read-only, and the flash is not modified in that case.
@@ -1063,8 +1063,21 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 	}
 	err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
 	if (err) {
-		if (err == -ENOSPC)
-			dbg_err("could not find a dirty LEB");
+		/*
+		 * There are no dirty or empty LEBs subject to here being
+		 * enough for the index. Try to use
+		 * 'ubifs_find_free_leb_for_idx()', which will return any empty
+		 * LEBs (ignoring index requirements). If the index then
+		 * doesn't have enough LEBs the recovery commit will fail -
+		 * which is the same result anyway i.e. recovery fails. So
+		 * there is no problem ignoring index requirements and just
+		 * grabbing a free LEB since we have already established there
+		 * is not a dirty LEB we could have used instead.
+		 */
+		if (err == -ENOSPC) {
+			dbg_rcvry("could not find a dirty LEB");
+			goto find_free;
+		}
 		return err;
 	}
 	ubifs_assert(!(lp.flags & LPROPS_INDEX));
@@ -1139,8 +1152,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 find_free:
 	/*
 	 * There is no GC head LEB or the free space in the GC head LEB is too
-	 * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
-	 * GC is not run.
+	 * small, or there are not dirty LEBs. Allocate gc_lnum by calling
+	 * 'ubifs_find_free_leb_for_idx()' so GC is not run.
 	 */
 	lnum = ubifs_find_free_leb_for_idx(c);
 	if (lnum < 0) {
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefca..0b201114a5ad 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
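The shrinker callback grows a struct shrinker * first parameter in this release. A shrinker embedded in a larger object can then recover its context with container_of(); a hedged sketch (my_cache and my_shrink are illustrative names, not from the patch):

    struct my_cache {
        struct shrinker shrinker;    /* registered via register_shrinker() */
        /* ... cache state ... */
    };

    static int my_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
    {
        struct my_cache *c = container_of(shrink, struct my_cache, shrinker);

        if (nr_to_scan == 0)
            return 0;    /* would normally report the freeable object count */
        /* ... try to free up to nr_to_scan objects from c ... */
        return 0;
    }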
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4d2f2157dd3f..cd5900b85d38 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -327,7 +327,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return err;
 }
 
-static void ubifs_delete_inode(struct inode *inode)
+static void ubifs_evict_inode(struct inode *inode)
 {
 	int err;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -343,9 +343,12 @@ static void ubifs_delete_inode(struct inode *inode)
 
 	dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
 	ubifs_assert(!atomic_read(&inode->i_count));
-	ubifs_assert(inode->i_nlink == 0);
 
 	truncate_inode_pages(&inode->i_data, 0);
+
+	if (inode->i_nlink)
+		goto done;
+
 	if (is_bad_inode(inode))
 		goto out;
 
@@ -367,7 +370,8 @@ out:
 		c->nospace = c->nospace_rp = 0;
 		smp_wmb();
 	}
-	clear_inode(inode);
+done:
+	end_writeback(inode);
 }
 
 static void ubifs_dirty_inode(struct inode *inode)
@@ -1307,6 +1311,8 @@ static int mount_ubifs(struct ubifs_info *c)
 		if (err)
 			goto out_orphans;
 		err = ubifs_rcvry_gc_commit(c);
+		if (err)
+			goto out_orphans;
 	} else {
 		err = take_gc_lnum(c);
 		if (err)
@@ -1318,7 +1324,7 @@ static int mount_ubifs(struct ubifs_info *c)
 		 */
 		err = ubifs_leb_unmap(c, c->gc_lnum);
 		if (err)
-			return err;
+			goto out_orphans;
 	}
 
 	err = dbg_check_lprops(c);
@@ -1824,7 +1830,7 @@ const struct super_operations ubifs_super_operations = {
 	.destroy_inode = ubifs_destroy_inode,
 	.put_super     = ubifs_put_super,
 	.write_inode   = ubifs_write_inode,
-	.delete_inode  = ubifs_delete_inode,
+	.evict_inode   = ubifs_evict_inode,
 	.statfs        = ubifs_statfs,
 	.dirty_inode   = ubifs_dirty_inode,
 	.remount_fs    = ubifs_remount_fs,
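->delete_inode and ->clear_inode merge into a single ->evict_inode, called for every inode that leaves memory, linked or not; the nlink test moves inside, and end_writeback() takes over clear_inode()'s role. The skeleton, as the hunks above instantiate it (example_evict_inode is a placeholder name):

    static void example_evict_inode(struct inode *inode)
    {
        truncate_inode_pages(&inode->i_data, 0);
        if (inode->i_nlink == 0 && !is_bad_inode(inode)) {
            /* old ->delete_inode work: truncate and free on-disk data */
        }
        end_writeback(inode);    /* replaces clear_inode() */
        /* old ->clear_inode work: release in-memory state */
    }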
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bd2542dad014..0c9876b396dd 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
  * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
  * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
  * make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
  * with 'ubifs_writepage()' (see file.c). All the other inode fields are
  * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
  * could consider to rework locking and base it on "shadow" fields.
@@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
 int ubifs_calc_dark(const struct ubifs_info *c, int spc);
 
 /* file.c */
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int ubifs_fsync(struct file *file, int datasync);
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
 
 /* dir.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 9a9378b4eb5a..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
 
 #include "udfdecl.h"
 
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
 
@@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 			udf_debug("byte=%2x\n",
 				  ((char *)bh->b_data)[(bit + i) >> 3]);
 		} else {
-			if (inode)
-				dquot_free_block(inode, 1);
 			udf_add_free_space(sb, sbi->s_partition, 1);
 		}
 	}
@@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 	bit = block % (sb->s_blocksize << 3);
 
 	while (bit < (sb->s_blocksize << 3) && block_count > 0) {
-		if (!udf_test_bit(bit, bh->b_data))
+		if (!udf_clear_bit(bit, bh->b_data))
 			goto out;
-		else if (dquot_prealloc_block(inode, 1))
-			goto out;
-		else if (!udf_clear_bit(bit, bh->b_data)) {
-			udf_debug("bit already cleared for block %d\n", bit);
-			dquot_free_block(inode, 1);
-			goto out;
-		}
 		block_count--;
 		alloc_count++;
 		bit++;
@@ -338,20 +328,6 @@ search_back:
 	}
 
 got_block:
-
-	/*
-	 * Check quota for allocation of this block.
-	 */
-	if (inode) {
-		int ret = dquot_alloc_block(inode, 1);
-
-		if (ret) {
-			mutex_unlock(&sbi->s_alloc_mutex);
-			*err = ret;
-			return 0;
-		}
-	}
-
 	newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
 		(sizeof(struct spaceBitmapDesc) << 3);
 
@@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
 	}
 
 	iinfo = UDF_I(table);
-	/* We do this up front - There are some error conditions that
-	   could occure, but.. oh well */
-	if (inode)
-		dquot_free_block(inode, count);
 	udf_add_free_space(sb, sbi->s_partition, count);
 
 	start = bloc->logicalBlockNum + offset;
@@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 	epos.offset -= adsize;
 
 	alloc_count = (elen >> sb->s_blocksize_bits);
-	if (inode && dquot_prealloc_block(inode,
-		alloc_count > block_count ? block_count : alloc_count))
-		alloc_count = 0;
-	else if (alloc_count > block_count) {
+	if (alloc_count > block_count) {
 		alloc_count = block_count;
 		eloc.logicalBlockNum += alloc_count;
 		elen -= (alloc_count << sb->s_blocksize_bits);
@@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
 	newblock = goal_eloc.logicalBlockNum;
 	goal_eloc.logicalBlockNum++;
 	goal_elen -= sb->s_blocksize;
-	if (inode) {
-		*err = dquot_alloc_block(inode, 1);
-		if (*err) {
-			brelse(goal_epos.bh);
-			mutex_unlock(&sbi->s_alloc_mutex);
-			return 0;
-		}
-	}
 
 	if (goal_elen)
 		udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f0f2a436251e..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 214 .fsync = generic_file_fsync,
214}; 215};
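The directory file_operations gain an explicit .llseek and trade the BKL-mediated .ioctl for .unlocked_ioctl, whose handler must do its own locking (udf_ioctl() takes lock_kernel() itself; see the fs/udf/file.c hunks below). The prototype shift, for reference:

    /* old: invoked under the Big Kernel Lock */
    int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);

    /* new: no implicit locking, richer return type */
    long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);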
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 4b6a46ccbf46..66b9e7e7e4c5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,6 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/aio.h>
 
@@ -144,50 +143,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return retval;
 }
 
-int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-	      unsigned long arg)
+long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	long old_block, new_block;
 	int result = -EINVAL;
 
+	lock_kernel();
+
 	if (file_permission(filp, MAY_READ) != 0) {
-		udf_debug("no permission to access inode %lu\n",
-			  inode->i_ino);
-		return -EPERM;
+		udf_debug("no permission to access inode %lu\n", inode->i_ino);
+		result = -EPERM;
+		goto out;
 	}
 
 	if (!arg) {
 		udf_debug("invalid argument to udf_ioctl\n");
-		return -EINVAL;
+		result = -EINVAL;
+		goto out;
 	}
 
 	switch (cmd) {
 	case UDF_GETVOLIDENT:
 		if (copy_to_user((char __user *)arg,
 				 UDF_SB(inode->i_sb)->s_volume_ident, 32))
-			return -EFAULT;
+			result = -EFAULT;
 		else
-			return 0;
+			result = 0;
+		goto out;
 	case UDF_RELOCATE_BLOCKS:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		if (get_user(old_block, (long __user *)arg))
-			return -EFAULT;
+		if (!capable(CAP_SYS_ADMIN)) {
+			result = -EACCES;
+			goto out;
+		}
+		if (get_user(old_block, (long __user *)arg)) {
+			result = -EFAULT;
+			goto out;
+		}
 		result = udf_relocate_blocks(inode->i_sb,
 						old_block, &new_block);
 		if (result == 0)
 			result = put_user(new_block, (long __user *)arg);
-		return result;
+		goto out;
 	case UDF_GETEASIZE:
 		result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
-		break;
+		goto out;
 	case UDF_GETEABLOCK:
 		result = copy_to_user((char __user *)arg,
 				      UDF_I(inode)->i_ext.i_data,
 				      UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
-		break;
+		goto out;
 	}
 
+out:
+	unlock_kernel();
 	return result;
 }
 
@@ -207,40 +216,39 @@ static int udf_release_file(struct inode *inode, struct file *filp)
 const struct file_operations udf_file_operations = {
 	.read			= do_sync_read,
 	.aio_read		= generic_file_aio_read,
-	.ioctl			= udf_ioctl,
-	.open			= dquot_file_open,
+	.unlocked_ioctl		= udf_ioctl,
+	.open			= generic_file_open,
 	.mmap			= generic_file_mmap,
 	.write			= do_sync_write,
 	.aio_write		= udf_file_aio_write,
 	.release		= udf_release_file,
-	.fsync			= simple_fsync,
+	.fsync			= generic_file_fsync,
 	.splice_read		= generic_file_splice_read,
 	.llseek			= generic_file_llseek,
 };
 
-int udf_setattr(struct dentry *dentry, struct iattr *iattr)
+static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	error = inode_change_ok(inode, iattr);
+	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
 
-	if (iattr->ia_valid & ATTR_SIZE)
-		dquot_initialize(inode);
-
-	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
-	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		error = dquot_transfer(inode, iattr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
 		if (error)
 			return error;
 	}
 
-	return inode_setattr(inode, iattr);
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations udf_file_inode_operations = {
-	.truncate = udf_truncate,
 	.setattr = udf_setattr,
+	.truncate = udf_truncate,
 };
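With inode_setattr() on the way out, a ->setattr implementation now does the size change itself and applies the remaining attributes with setattr_copy() plus mark_inode_dirty(). The pattern, lifted from the udf_setattr() above (example_setattr is a placeholder name):

    static int example_setattr(struct dentry *dentry, struct iattr *attr)
    {
        struct inode *inode = dentry->d_inode;
        int error;

        error = inode_change_ok(inode, attr);    /* permission/validity checks */
        if (error)
            return error;

        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
            error = vmtruncate(inode, attr->ia_size);
            if (error)
                return error;
        }

        setattr_copy(inode, attr);    /* uid/gid/mode/timestamps into inode */
        mark_inode_dirty(inode);
        return 0;
    }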
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3e..75d9304d0dc3 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
 
 #include "udfdecl.h"
 #include <linux/fs.h>
-#include <linux/quotaops.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -32,15 +31,6 @@ void udf_free_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 
-	/*
-	 * Note: we must free any quota before locking the superblock,
-	 * as writing the quota to disk may need the lock as well.
-	 */
-	dquot_free_inode(inode);
-	dquot_drop(inode);
-
-	clear_inode(inode);
-
 	mutex_lock(&sbi->s_alloc_mutex);
 	if (sbi->s_lvid_bh) {
 		struct logicalVolIntegrityDescImpUse *lvidiu =
@@ -61,7 +51,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	struct super_block *sb = dir->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct inode *inode;
-	int block, ret;
+	int block;
 	uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
 	struct udf_inode_info *iinfo;
 	struct udf_inode_info *dinfo = UDF_I(dir);
@@ -124,15 +114,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else {
-		inode->i_gid = current_fsgid();
-	}
+
+	inode_init_owner(inode, dir, mode);
 
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
@@ -153,17 +136,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 
-	dquot_initialize(inode);
-	ret = dquot_alloc_inode(inode);
-	if (ret) {
-		dquot_drop(inode);
-		inode->i_flags |= S_NOQUOTA;
-		inode->i_nlink = 0;
-		iput(inode);
-		*err = ret;
-		return NULL;
-	}
-
 	*err = 0;
 	return inode;
 }
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8a3fbd177cab..fc48f37aa2dd 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/crc-itu-t.h>
 
@@ -69,40 +68,23 @@ static void udf_update_extents(struct inode *,
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 
-void udf_delete_inode(struct inode *inode)
+void udf_evict_inode(struct inode *inode)
 {
-	if (!is_bad_inode(inode))
-		dquot_initialize(inode);
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	int want_delete = 0;
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (is_bad_inode(inode))
-		goto no_delete;
-
-	inode->i_size = 0;
-	udf_truncate(inode);
-	lock_kernel();
-
-	udf_update_inode(inode, IS_SYNC(inode));
-	udf_free_inode(inode);
-
-	unlock_kernel();
-	return;
-
-no_delete:
-	clear_inode(inode);
-}
-
-/*
- * If we are going to release inode from memory, we truncate last inode extent
- * to proper length. We could use drop_inode() but it's called under inode_lock
- * and thus we cannot mark inode dirty there. We use clear_inode() but we have
- * to make sure to write inode as it's not written automatically.
- */
-void udf_clear_inode(struct inode *inode)
-{
-	struct udf_inode_info *iinfo = UDF_I(inode);
-
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		want_delete = 1;
+		inode->i_size = 0;
+		udf_truncate(inode);
+		lock_kernel();
+		udf_update_inode(inode, IS_SYNC(inode));
+		unlock_kernel();
+	}
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
 	    inode->i_size != iinfo->i_lenExtents) {
 		printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
@@ -112,10 +94,13 @@ void udf_clear_inode(struct inode *inode)
 		       (unsigned long long)inode->i_size,
 		       (unsigned long long)iinfo->i_lenExtents);
 	}
-
-	dquot_drop(inode);
 	kfree(iinfo->i_ext.i_data);
 	iinfo->i_ext.i_data = NULL;
+	if (want_delete) {
+		lock_kernel();
+		udf_free_inode(inode);
+		unlock_kernel();
+	}
 }
 
 static int udf_writepage(struct page *page, struct writeback_control *wbc)
@@ -132,9 +117,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				udf_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t udf_bmap(struct address_space *mapping, sector_t block)
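block_write_begin() drops its file and fsdata parameters (and no longer wants *pagep pre-cleared); on failure the caller is now responsible for trimming blocks instantiated beyond i_size, which udf does via vmtruncate(). The idiom, as used in the hunk (example_get_block stands in for the filesystem's get_block callback):

    ret = block_write_begin(mapping, pos, len, flags, pagep, example_get_block);
    if (unlikely(ret)) {
        loff_t isize = mapping->host->i_size;
        if (pos + len > isize)
            vmtruncate(mapping->host, isize);    /* drop stray blocks */
    }
    return ret;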
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 75816025f95f..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/quotaops.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
 	int err;
 	struct udf_inode_info *iinfo;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
 	inode = udf_new_inode(dir, mode, &err);
 	if (!inode) {
@@ -579,7 +576,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
 	inode->i_data.a_ops = &udf_aops;
 	inode->i_op = &udf_file_inode_operations;
 	inode->i_fop = &udf_file_operations;
-	inode->i_mode = mode;
 	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -618,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
 	err = -EIO;
 	inode = udf_new_inode(dir, mode, &err);
@@ -627,7 +621,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		goto out;
 
 	iinfo = UDF_I(inode);
-	inode->i_uid = current_fsuid();
 	init_special_inode(inode, mode, rdev);
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
 	if (!fi) {
@@ -666,15 +659,13 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct udf_inode_info *dinfo = UDF_I(dir);
 	struct udf_inode_info *iinfo;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
 	err = -EMLINK;
 	if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
 		goto out;
 
 	err = -EIO;
-	inode = udf_new_inode(dir, S_IFDIR, &err);
+	inode = udf_new_inode(dir, S_IFDIR | mode, &err);
 	if (!inode)
 		goto out;
 
@@ -697,9 +688,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 			FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
 	udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
 	brelse(fibh.sbh);
-	inode->i_mode = S_IFDIR | mode;
-	if (dir->i_mode & S_ISGID)
-		inode->i_mode |= S_ISGID;
 	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -805,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	struct fileIdentDesc *fi, cfi;
 	struct kernel_lb_addr tloc;
 
-	dquot_initialize(dir);
-
 	retval = -ENOENT;
 	lock_kernel();
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -853,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 	struct fileIdentDesc cfi;
 	struct kernel_lb_addr tloc;
 
-	dquot_initialize(dir);
-
 	retval = -ENOENT;
 	lock_kernel();
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -909,10 +893,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	struct buffer_head *bh;
 	struct udf_inode_info *iinfo;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
-	inode = udf_new_inode(dir, S_IFLNK, &err);
+	inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
 	if (!inode)
 		goto out;
 
@@ -923,7 +905,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	}
 
 	iinfo = UDF_I(inode);
-	inode->i_mode = S_IFLNK | S_IRWXUGO;
 	inode->i_data.a_ops = &udf_symlink_aops;
 	inode->i_op = &udf_symlink_inode_operations;
 
@@ -1081,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 	int err;
 	struct buffer_head *bh;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
 	if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
 		unlock_kernel();
@@ -1145,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct kernel_lb_addr tloc;
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
-	dquot_initialize(old_dir);
-	dquot_initialize(new_dir);
-
 	lock_kernel();
 	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
 	if (ofi) {
@@ -1393,7 +1369,6 @@ const struct export_operations udf_export_ops = {
 const struct inode_operations udf_dir_inode_operations = {
 	.lookup				= udf_lookup,
 	.create				= udf_create,
-	.setattr			= udf_setattr,
 	.link				= udf_link,
 	.unlink				= udf_unlink,
 	.symlink			= udf_symlink,
@@ -1406,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
-	.setattr	= udf_setattr,
 };
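A knock-on effect of moving ownership setup into udf_new_inode() (via inode_init_owner(), per the fs/udf/ialloc.c hunk): callers must now hand over the complete mode at creation instead of patching inode->i_mode afterwards, e.g.:

    inode = udf_new_inode(dir, S_IFDIR | mode, &err);       /* was: S_IFDIR, then fixups */
    inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);  /* was: S_IFLNK */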
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..65412d84a45d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -175,8 +175,7 @@ static const struct super_operations udf_sb_ops = {
 	.alloc_inode	= udf_alloc_inode,
 	.destroy_inode	= udf_destroy_inode,
 	.write_inode	= udf_write_inode,
-	.delete_inode	= udf_delete_inode,
-	.clear_inode	= udf_clear_inode,
+	.evict_inode	= udf_evict_inode,
 	.put_super	= udf_put_super,
 	.sync_fs	= udf_sync_fs,
 	.statfs		= udf_statfs,
@@ -557,6 +556,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 {
 	struct udf_options uopt;
 	struct udf_sb_info *sbi = UDF_SB(sb);
+	int error = 0;
 
 	uopt.flags = sbi->s_flags;
 	uopt.uid   = sbi->s_uid;
@@ -582,17 +582,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 		*flags |= MS_RDONLY;
 	}
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
-		return 0;
-	}
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out_unlock;
+
 	if (*flags & MS_RDONLY)
 		udf_close_lvid(sb);
 	else
 		udf_open_lvid(sb);
 
+out_unlock:
 	unlock_kernel();
-	return 0;
+	return error;
 }
597 597
598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ 598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1578,9 +1578,7 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
 {
 	struct anchorVolDescPtr *anchor;
 	long main_s, main_e, reserve_s, reserve_e;
-	struct udf_sb_info *sbi;
 
-	sbi = UDF_SB(sb);
 	anchor = (struct anchorVolDescPtr *)bh->b_data;
 
 	/* Locate the main sequence */
@@ -1939,7 +1937,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* Fill in the rest of the superblock */
 	sb->s_op = &udf_sb_ops;
 	sb->s_export_op = &udf_export_ops;
-	sb->dq_op = NULL;
+
 	sb->s_dirt = 0;
 	sb->s_magic = UDF_SUPER_MAGIC;
 	sb->s_time_gran = 1000;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 702a1148e702..6995ab1f4305 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -130,9 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 			uint8_t *, uint8_t *);
 
 /* file.c */
-extern int udf_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long);
-extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
+extern long udf_ioctl(struct file *, unsigned int, unsigned long);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
@@ -141,8 +139,7 @@ extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
 extern void udf_truncate(struct inode *);
 extern void udf_read_inode(struct inode *);
-extern void udf_delete_inode(struct inode *);
-extern void udf_clear_inode(struct inode *);
+extern void udf_evict_inode(struct inode *);
 extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..46f7a807bbc1 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/string.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 84 "bit already cleared for fragment %u", i);
86 } 85 }
87 86
88 dquot_free_block(inode, count);
89
90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 87 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
92 uspi->cs_total.cs_nffree += count; 88 uspi->cs_total.cs_nffree += count;
93 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 89 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -118,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	unlock_super (sb);
@@ -195,7 +189,6 @@ do_more:
 	ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 		ufs_clusteracct (sb, ucpi, blkno, 1);
-	dquot_free_block(inode, uspi->s_fpb);
 
 	fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
 	uspi->cs_total.cs_nbfree++;
@@ -212,10 +205,8 @@ do_more:
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 
 	if (overflow) {
 		fragment += count;
@@ -511,7 +502,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, fragno, fragoff, count, fragsize, i;
-	int ret;
 
 	UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
 	     (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +547,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 	fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
 	for (i = oldcount; i < newcount; i++)
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
-	ret = dquot_alloc_block(inode, count);
-	if (ret) {
-		*err = ret;
-		return 0;
-	}
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -569,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -598,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
 	struct ufs_cylinder_group * ucg;
 	unsigned oldcg, i, j, k, allocsize;
 	u64 result;
-	int ret;
 
 	UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
 	     inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +649,6 @@ cg_found:
 	for (i = count; i < uspi->s_fpb; i++)
 		ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
 	i = uspi->s_fpb - count;
-	dquot_free_block(inode, i);
 
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
 	uspi->cs_total.cs_nffree += i;
@@ -679,11 +660,6 @@ cg_found:
 	result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
 	if (result == INVBLOCK)
 		return 0;
-	ret = dquot_alloc_block(inode, count);
-	if (ret) {
-		*err = ret;
-		return 0;
-	}
 	for (i = 0; i < count; i++)
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
 
@@ -698,10 +674,8 @@ cg_found:
 succed:
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	result += cgno * uspi->s_fpg;
@@ -718,7 +692,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
 	struct ufs_super_block_first * usb1;
 	struct ufs_cylinder_group * ucg;
 	u64 result, blkno;
-	int ret;
 
 	UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
 
@@ -752,11 +725,6 @@ gotit:
752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 725 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 726 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
754 ufs_clusteracct (sb, ucpi, blkno, -1); 727 ufs_clusteracct (sb, ucpi, blkno, -1);
755 ret = dquot_alloc_block(inode, uspi->s_fpb);
756 if (ret) {
757 *err = ret;
758 return INVBLOCK;
759 }
760 728
761 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 729 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
762 uspi->cs_total.cs_nbfree--; 730 uspi->cs_total.cs_nbfree--;
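
The balloc.c hunks above also drop the dquot_alloc_block()/dquot_free_block() calls: this merge removes quota support from UFS entirely, so block allocations are no longer charged to the owner. For reference, a minimal sketch of the pattern being deleted, assuming only the generic quota API from <linux/quotaops.h>; the function name and the bitmap step are placeholders, not real UFS code.

#include <linux/fs.h>
#include <linux/quotaops.h>

/* Pre-merge pattern: charge the allocation before committing bitmap
 * changes, and report failure via the UFS convention of returning 0
 * with *err set. */
static u64 example_alloc_frags(struct inode *inode, unsigned count, int *err)
{
	int ret = dquot_alloc_block(inode, count);	/* may return -EDQUOT */

	if (ret) {
		*err = ret;
		return 0;			/* 0 == nothing allocated */
	}
	/* ... clear 'count' bits in the cylinder-group free-fragment map ... */
	return 1;
}

With the quota charge gone, the only failure left on this path is bitmap exhaustion, which is why the local 'ret' variables disappear as well.
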
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..dbc90994715a 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -95,8 +95,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
95 int err; 95 int err;
96 96
97 lock_page(page); 97 lock_page(page);
98 err = __ufs_write_begin(NULL, page->mapping, pos, len, 98 err = ufs_prepare_chunk(page, pos, len);
99 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
100 BUG_ON(err); 99 BUG_ON(err);
101 100
102 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); 101 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
@@ -381,8 +380,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
381got_it: 380got_it:
382 pos = page_offset(page) + 381 pos = page_offset(page) +
383 (char*)de - (char*)page_address(page); 382 (char*)de - (char*)page_address(page);
384 err = __ufs_write_begin(NULL, page->mapping, pos, rec_len, 383 err = ufs_prepare_chunk(page, pos, rec_len);
385 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
386 if (err) 384 if (err)
387 goto out_unlock; 385 goto out_unlock;
388 if (de->d_ino) { 386 if (de->d_ino) {
@@ -518,7 +516,6 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
518 struct page * page) 516 struct page * page)
519{ 517{
520 struct super_block *sb = inode->i_sb; 518 struct super_block *sb = inode->i_sb;
521 struct address_space *mapping = page->mapping;
522 char *kaddr = page_address(page); 519 char *kaddr = page_address(page);
523 unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); 520 unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
524 unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); 521 unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
@@ -549,8 +546,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
549 546
550 pos = page_offset(page) + from; 547 pos = page_offset(page) + from;
551 lock_page(page); 548 lock_page(page);
552 err = __ufs_write_begin(NULL, mapping, pos, to - from, 549 err = ufs_prepare_chunk(page, pos, to - from);
553 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
554 BUG_ON(err); 550 BUG_ON(err);
555 if (pde) 551 if (pde)
556 pde->d_reclen = cpu_to_fs16(sb, to - from); 552 pde->d_reclen = cpu_to_fs16(sb, to - from);
@@ -577,8 +573,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
577 if (!page) 573 if (!page)
578 return -ENOMEM; 574 return -ENOMEM;
579 575
580 err = __ufs_write_begin(NULL, mapping, 0, chunk_size, 576 err = ufs_prepare_chunk(page, 0, chunk_size);
581 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
582 if (err) { 577 if (err) {
583 unlock_page(page); 578 unlock_page(page);
584 goto fail; 579 goto fail;
@@ -666,6 +661,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 661const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 662 .read = generic_read_dir,
668 .readdir = ufs_readdir, 663 .readdir = ufs_readdir,
669 .fsync = simple_fsync, 664 .fsync = generic_file_fsync,
670 .llseek = generic_file_llseek, 665 .llseek = generic_file_llseek,
671}; 666};
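
All four directory-update sites above switch from the write_begin-style __ufs_write_begin(), which needed a mapping, AOP flags and a page out-parameter, to ufs_prepare_chunk(), which takes just the already-locked page. A sketch of the resulting caller shape, simplified from ufs_set_link(); ufs_commit_chunk() is the existing commit-side helper in fs/ufs/dir.c, and its use here is illustrative.

/* Assumes the entry 'de' was just found on 'page'; the real function
 * also updates the entry type and the directory's timestamps. */
static void example_set_link(struct inode *dir, struct ufs_dir_entry *de,
			     struct page *page, struct inode *inode)
{
	loff_t pos = page_offset(page) +
			(char *)de - (char *)page_address(page);
	unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen);
	int err;

	lock_page(page);
	err = ufs_prepare_chunk(page, pos, len);  /* maps buffers, no AOP flags */
	BUG_ON(err);				  /* the page was valid a moment ago */

	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);

	ufs_commit_chunk(page, pos, len);	  /* write back and release */
}
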
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
41 .write = do_sync_write, 40 .write = do_sync_write,
42 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
43 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
44 .open = dquot_file_open, 43 .open = generic_file_open,
45 .fsync = simple_fsync, 44 .fsync = generic_file_fsync,
46 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
47}; 46};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf608026..2eabf04af3de 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/quotaops.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/bitops.h> 32#include <linux/bitops.h>
@@ -95,11 +94,6 @@ void ufs_free_inode (struct inode * inode)
95 94
96 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
97 96
98 dquot_free_inode(inode);
99 dquot_drop(inode);
100
101 clear_inode (inode);
102
103 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 97 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
104 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); 98 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
105 else { 99 else {
@@ -119,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
119 113
120 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 114 ubh_mark_buffer_dirty (USPI_UBH(uspi));
121 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 115 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
122 if (sb->s_flags & MS_SYNCHRONOUS) { 116 if (sb->s_flags & MS_SYNCHRONOUS)
123 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 117 ubh_sync_block(UCPI_UBH(ucpi));
124 ubh_wait_on_buffer (UCPI_UBH(ucpi));
125 }
126 118
127 sb->s_dirt = 1; 119 sb->s_dirt = 1;
128 unlock_super (sb); 120 unlock_super (sb);
@@ -162,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
162 154
163 fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); 155 fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
164 ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); 156 ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
165 if (sb->s_flags & MS_SYNCHRONOUS) { 157 if (sb->s_flags & MS_SYNCHRONOUS)
166 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 158 ubh_sync_block(UCPI_UBH(ucpi));
167 ubh_wait_on_buffer(UCPI_UBH(ucpi));
168 }
169 159
170 UFSD("EXIT\n"); 160 UFSD("EXIT\n");
171} 161}
@@ -296,22 +286,12 @@ cg_found:
296 } 286 }
297 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 287 ubh_mark_buffer_dirty (USPI_UBH(uspi));
298 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 288 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
299 if (sb->s_flags & MS_SYNCHRONOUS) { 289 if (sb->s_flags & MS_SYNCHRONOUS)
300 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 290 ubh_sync_block(UCPI_UBH(ucpi));
301 ubh_wait_on_buffer (UCPI_UBH(ucpi));
302 }
303 sb->s_dirt = 1; 291 sb->s_dirt = 1;
304 292
305 inode->i_ino = cg * uspi->s_ipg + bit; 293 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 294 inode_init_owner(inode, dir, mode);
307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID;
312 } else
313 inode->i_gid = current_fsgid();
314
315 inode->i_blocks = 0; 295 inode->i_blocks = 0;
316 inode->i_generation = 0; 296 inode->i_generation = 0;
317 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 297 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
@@ -355,21 +335,12 @@ cg_found:
355 335
356 unlock_super (sb); 336 unlock_super (sb);
357 337
358 dquot_initialize(inode);
359 err = dquot_alloc_inode(inode);
360 if (err) {
361 dquot_drop(inode);
362 goto fail_without_unlock;
363 }
364
365 UFSD("allocating inode %lu\n", inode->i_ino); 338 UFSD("allocating inode %lu\n", inode->i_ino);
366 UFSD("EXIT\n"); 339 UFSD("EXIT\n");
367 return inode; 340 return inode;
368 341
369fail_remove_inode: 342fail_remove_inode:
370 unlock_super(sb); 343 unlock_super(sb);
371fail_without_unlock:
372 inode->i_flags |= S_NOQUOTA;
373 inode->i_nlink = 0; 344 inode->i_nlink = 0;
374 iput(inode); 345 iput(inode);
375 UFSD("EXIT (FAILED): err %d\n", err); 346 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 80b68c3702d1..2b251f2093af 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40#include <linux/quotaops.h>
41 40
42#include "ufs_fs.h" 41#include "ufs_fs.h"
43#include "ufs.h" 42#include "ufs.h"
@@ -559,20 +558,26 @@ static int ufs_readpage(struct file *file, struct page *page)
559 return block_read_full_page(page,ufs_getfrag_block); 558 return block_read_full_page(page,ufs_getfrag_block);
560} 559}
561 560
562int __ufs_write_begin(struct file *file, struct address_space *mapping, 561int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
563 loff_t pos, unsigned len, unsigned flags,
564 struct page **pagep, void **fsdata)
565{ 562{
566 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 563 return __block_write_begin(page, pos, len, ufs_getfrag_block);
567 ufs_getfrag_block);
568} 564}
569 565
570static int ufs_write_begin(struct file *file, struct address_space *mapping, 566static int ufs_write_begin(struct file *file, struct address_space *mapping,
571 loff_t pos, unsigned len, unsigned flags, 567 loff_t pos, unsigned len, unsigned flags,
572 struct page **pagep, void **fsdata) 568 struct page **pagep, void **fsdata)
573{ 569{
574 *pagep = NULL; 570 int ret;
575 return __ufs_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 571
572 ret = block_write_begin(mapping, pos, len, flags, pagep,
573 ufs_getfrag_block);
574 if (unlikely(ret)) {
575 loff_t isize = mapping->host->i_size;
576 if (pos + len > isize)
577 vmtruncate(mapping->host, isize);
578 }
579
580 return ret;
576} 581}
577 582
578static sector_t ufs_bmap(struct address_space *mapping, sector_t block) 583static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
@@ -603,7 +608,7 @@ static void ufs_set_inode_ops(struct inode *inode)
603 if (!inode->i_blocks) 608 if (!inode->i_blocks)
604 inode->i_op = &ufs_fast_symlink_inode_operations; 609 inode->i_op = &ufs_fast_symlink_inode_operations;
605 else { 610 else {
606 inode->i_op = &page_symlink_inode_operations; 611 inode->i_op = &ufs_symlink_inode_operations;
607 inode->i_mapping->a_ops = &ufs_aops; 612 inode->i_mapping->a_ops = &ufs_aops;
608 } 613 }
609 } else 614 } else
@@ -906,27 +911,33 @@ int ufs_sync_inode (struct inode *inode)
906 return ufs_update_inode (inode, 1); 911 return ufs_update_inode (inode, 1);
907} 912}
908 913
909void ufs_delete_inode (struct inode * inode) 914void ufs_evict_inode(struct inode * inode)
910{ 915{
911 loff_t old_i_size; 916 int want_delete = 0;
912 917
913 if (!is_bad_inode(inode)) 918 if (!inode->i_nlink && !is_bad_inode(inode))
914 dquot_initialize(inode); 919 want_delete = 1;
915 920
916 truncate_inode_pages(&inode->i_data, 0); 921 truncate_inode_pages(&inode->i_data, 0);
917 if (is_bad_inode(inode)) 922 if (want_delete) {
918 goto no_delete; 923 loff_t old_i_size;
919 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
920 lock_kernel(); 925 lock_kernel();
921 mark_inode_dirty(inode); 926 mark_inode_dirty(inode);
922 ufs_update_inode(inode, IS_SYNC(inode)); 927 ufs_update_inode(inode, IS_SYNC(inode));
923 old_i_size = inode->i_size; 928 old_i_size = inode->i_size;
924 inode->i_size = 0; 929 inode->i_size = 0;
925 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 930 if (inode->i_blocks && ufs_truncate(inode, old_i_size))
926 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
927 ufs_free_inode (inode); 932 unlock_kernel();
928 unlock_kernel(); 933 }
929 return; 934
930no_delete: 935 invalidate_inode_buffers(inode);
931 clear_inode(inode); /* We must guarantee clearing of inode... */ 936 end_writeback(inode);
937
938 if (want_delete) {
939 lock_kernel();
940 ufs_free_inode (inode);
941 unlock_kernel();
942 }
932} 943}
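
The inode.c conversion above follows the 2.6.36 VFS change that folds ->delete_inode() and ->clear_inode() into a single ->evict_inode(): the method is now called for every inode leaving the cache, must distinguish final deletion (i_nlink == 0) from plain eviction, and must call end_writeback() itself. (The ufs_write_begin() hunk earlier in this file adopts the matching 2.6.36 convention of trimming blocks instantiated past i_size with vmtruncate() when block_write_begin() fails.) A minimal skeleton of the evict pattern, with the UFS-specific work reduced to comments:

/* Sketch only; truncate_inode_pages(), invalidate_inode_buffers() and
 * end_writeback() are the mandatory VFS calls in this release. */
static void example_evict_inode(struct inode *inode)
{
	int want_delete = !inode->i_nlink && !is_bad_inode(inode);

	truncate_inode_pages(&inode->i_data, 0);
	if (want_delete) {
		/* ... write final metadata, truncate on-disk blocks ... */
	}

	invalidate_inode_buffers(inode);
	end_writeback(inode);		/* detaches the inode from writeback */

	if (want_delete) {
		/* ... free the on-disk inode, now safely out of the caches ... */
	}
}

Note how ufs_free_inode() moves after end_writeback() in the hunk above: the on-disk inode may only be reused once the VFS is done with the in-core one.
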
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 118556243e7a..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
34 33
35#include "ufs_fs.h" 34#include "ufs_fs.h"
36#include "ufs.h" 35#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
86 85
87 UFSD("BEGIN\n"); 86 UFSD("BEGIN\n");
88 87
89 dquot_initialize(dir);
90
91 inode = ufs_new_inode(dir, mode); 88 inode = ufs_new_inode(dir, mode);
92 err = PTR_ERR(inode); 89 err = PTR_ERR(inode);
93 90
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
112 if (!old_valid_dev(rdev)) 109 if (!old_valid_dev(rdev))
113 return -EINVAL; 110 return -EINVAL;
114 111
115 dquot_initialize(dir);
116
117 inode = ufs_new_inode(dir, mode); 112 inode = ufs_new_inode(dir, mode);
118 err = PTR_ERR(inode); 113 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) { 114 if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
138 if (l > sb->s_blocksize) 133 if (l > sb->s_blocksize)
139 goto out_notlocked; 134 goto out_notlocked;
140 135
141 dquot_initialize(dir);
142
143 lock_kernel(); 136 lock_kernel();
144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
145 err = PTR_ERR(inode); 138 err = PTR_ERR(inode);
@@ -148,7 +141,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
148 141
149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { 142 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
150 /* slow symlink */ 143 /* slow symlink */
151 inode->i_op = &page_symlink_inode_operations; 144 inode->i_op = &ufs_symlink_inode_operations;
152 inode->i_mapping->a_ops = &ufs_aops; 145 inode->i_mapping->a_ops = &ufs_aops;
153 err = page_symlink(inode, symname, l); 146 err = page_symlink(inode, symname, l);
154 if (err) 147 if (err)
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
185 return -EMLINK; 178 return -EMLINK;
186 } 179 }
187 180
188 dquot_initialize(dir);
189
190 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
191 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
192 atomic_inc(&inode->i_count); 183 atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
204 if (dir->i_nlink >= UFS_LINK_MAX) 195 if (dir->i_nlink >= UFS_LINK_MAX)
205 goto out; 196 goto out;
206 197
207 dquot_initialize(dir);
208
209 lock_kernel(); 198 lock_kernel();
210 inode_inc_link_count(dir); 199 inode_inc_link_count(dir);
211 200
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
250 struct page *page; 239 struct page *page;
251 int err = -ENOENT; 240 int err = -ENOENT;
252 241
253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page); 242 de = ufs_find_entry(dir, &dentry->d_name, &page);
256 if (!de) 243 if (!de)
257 goto out; 244 goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
296 struct ufs_dir_entry *old_de; 283 struct ufs_dir_entry *old_de;
297 int err = -ENOENT; 284 int err = -ENOENT;
298 285
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 286 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
303 if (!old_de) 287 if (!old_de)
304 goto out; 288 goto out;
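
namei.c loses the dquot_initialize() call at the top of every namespace operation, again a consequence of dropping UFS quota support rather than of a VFS API change. The deleted entry-point pattern, shown here purely for illustration (example_unlink is not real UFS code):

#include <linux/quotaops.h>

static int example_unlink(struct inode *dir, struct dentry *dentry)
{
	dquot_initialize(dir);	/* attach dquots before any charge; now gone */
	/* ... find and remove the directory entry ... */
	return 0;
}
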
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..d510c1b91817 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
77 77
78#include <linux/errno.h> 78#include <linux/errno.h>
79#include <linux/fs.h> 79#include <linux/fs.h>
80#include <linux/quotaops.h>
81#include <linux/slab.h> 80#include <linux/slab.h>
82#include <linux/time.h> 81#include <linux/time.h>
83#include <linux/stat.h> 82#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 917 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 918 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 919 case UFS_MAGIC:
920 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 921 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 922 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 923 case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 927 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 929 case UFS_MAGIC:
930 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 931 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 932 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 933 case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
1045 */ 1046 */
1046 sb->s_op = &ufs_super_ops; 1047 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops; 1048 sb->s_export_op = &ufs_export_ops;
1048 sb->dq_op = NULL; /***/ 1049
1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1050 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
1050 1051
1051 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); 1052 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
1435 kmem_cache_destroy(ufs_inode_cachep); 1436 kmem_cache_destroy(ufs_inode_cachep);
1436} 1437}
1437 1438
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1443#ifdef CONFIG_QUOTA
1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
1446#endif
1447
1448static const struct super_operations ufs_super_ops = { 1439static const struct super_operations ufs_super_ops = {
1449 .alloc_inode = ufs_alloc_inode, 1440 .alloc_inode = ufs_alloc_inode,
1450 .destroy_inode = ufs_destroy_inode, 1441 .destroy_inode = ufs_destroy_inode,
1451 .write_inode = ufs_write_inode, 1442 .write_inode = ufs_write_inode,
1452 .delete_inode = ufs_delete_inode, 1443 .evict_inode = ufs_evict_inode,
1453 .clear_inode = ufs_clear_inode,
1454 .put_super = ufs_put_super, 1444 .put_super = ufs_put_super,
1455 .write_super = ufs_write_super, 1445 .write_super = ufs_write_super,
1456 .sync_fs = ufs_sync_fs, 1446 .sync_fs = ufs_sync_fs,
1457 .statfs = ufs_statfs, 1447 .statfs = ufs_statfs,
1458 .remount_fs = ufs_remount, 1448 .remount_fs = ufs_remount,
1459 .show_options = ufs_show_options, 1449 .show_options = ufs_show_options,
1460#ifdef CONFIG_QUOTA
1461 .quota_read = ufs_quota_read,
1462 .quota_write = ufs_quota_write,
1463#endif
1464}; 1450};
1465 1451
1466#ifdef CONFIG_QUOTA
1467
1468/* Read data from quotafile - avoid pagecache and such because we cannot afford
1469 * acquiring the locks... As quota files are never truncated and quota code
1470 * itself serializes the operations (and no one else should touch the files)
1471 * we don't have to be afraid of races */
1472static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
1473 size_t len, loff_t off)
1474{
1475 struct inode *inode = sb_dqopt(sb)->files[type];
1476 sector_t blk = off >> sb->s_blocksize_bits;
1477 int err = 0;
1478 int offset = off & (sb->s_blocksize - 1);
1479 int tocopy;
1480 size_t toread;
1481 struct buffer_head *bh;
1482 loff_t i_size = i_size_read(inode);
1483
1484 if (off > i_size)
1485 return 0;
1486 if (off+len > i_size)
1487 len = i_size-off;
1488 toread = len;
1489 while (toread > 0) {
1490 tocopy = sb->s_blocksize - offset < toread ?
1491 sb->s_blocksize - offset : toread;
1492
1493 bh = ufs_bread(inode, blk, 0, &err);
1494 if (err)
1495 return err;
1496 if (!bh) /* A hole? */
1497 memset(data, 0, tocopy);
1498 else {
1499 memcpy(data, bh->b_data+offset, tocopy);
1500 brelse(bh);
1501 }
1502 offset = 0;
1503 toread -= tocopy;
1504 data += tocopy;
1505 blk++;
1506 }
1507 return len;
1508}
1509
1510/* Write to quotafile */
1511static ssize_t ufs_quota_write(struct super_block *sb, int type,
1512 const char *data, size_t len, loff_t off)
1513{
1514 struct inode *inode = sb_dqopt(sb)->files[type];
1515 sector_t blk = off >> sb->s_blocksize_bits;
1516 int err = 0;
1517 int offset = off & (sb->s_blocksize - 1);
1518 int tocopy;
1519 size_t towrite = len;
1520 struct buffer_head *bh;
1521
1522 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1523 while (towrite > 0) {
1524 tocopy = sb->s_blocksize - offset < towrite ?
1525 sb->s_blocksize - offset : towrite;
1526
1527 bh = ufs_bread(inode, blk, 1, &err);
1528 if (!bh)
1529 goto out;
1530 lock_buffer(bh);
1531 memcpy(bh->b_data+offset, data, tocopy);
1532 flush_dcache_page(bh->b_page);
1533 set_buffer_uptodate(bh);
1534 mark_buffer_dirty(bh);
1535 unlock_buffer(bh);
1536 brelse(bh);
1537 offset = 0;
1538 towrite -= tocopy;
1539 data += tocopy;
1540 blk++;
1541 }
1542out:
1543 if (len == towrite) {
1544 mutex_unlock(&inode->i_mutex);
1545 return err;
1546 }
1547 if (inode->i_size < off+len-towrite)
1548 i_size_write(inode, off+len-towrite);
1549 inode->i_version++;
1550 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1551 mark_inode_dirty(inode);
1552 mutex_unlock(&inode->i_mutex);
1553 return len - towrite;
1554}
1555
1556#endif
1557
1558static int ufs_get_sb(struct file_system_type *fs_type, 1452static int ufs_get_sb(struct file_system_type *fs_type,
1559 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1560{ 1454{
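
Three separate things happen in super.c: UFS_MAGIC_BW joins both byte-order switches (reportedly the magic written by BorderWare systems), the dq_op/quota_read/quota_write plumbing is deleted along with ufs_clear_inode(), and the super_operations table moves to ->evict_inode. The magic probe itself is just a try-both-byte-orders loop; a stand-alone sketch using only the constants visible in this patch:

#include <stdint.h>

enum probe_result { PROBE_LE, PROBE_BE, PROBE_BAD };

/* Decode the 32-bit magic both ways and report which byte order, if
 * either, yields a known UFS magic. */
static enum probe_result probe_magic(const uint8_t b[4])
{
	uint32_t le = b[0] | b[1] << 8 | b[2] << 16 | (uint32_t)b[3] << 24;
	uint32_t be = (uint32_t)b[0] << 24 | b[1] << 16 | b[2] << 8 | b[3];
	static const uint32_t known[] = {
		0x00011954,	/* UFS_MAGIC */
		0x0f242697,	/* UFS_MAGIC_BW, new in this patch */
		0x19540119,	/* UFS2_MAGIC */
	};
	unsigned i;

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (le == known[i])
			return PROBE_LE;
		if (be == known[i])
			return PROBE_BE;
	}
	return PROBE_BAD;
}
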
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index c0156eda44bc..d283628b4778 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
42const struct inode_operations ufs_fast_symlink_inode_operations = { 42const struct inode_operations ufs_fast_symlink_inode_operations = {
43 .readlink = generic_readlink, 43 .readlink = generic_readlink,
44 .follow_link = ufs_follow_link, 44 .follow_link = ufs_follow_link,
45 .setattr = ufs_setattr,
46};
47
48const struct inode_operations ufs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52 .setattr = ufs_setattr,
45}; 53};
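
UFS now carries two symlink operation tables: the existing "fast" one for targets stored inline in the inode, and a new page-based one that previously fell back to the generic page_symlink_inode_operations. The private copy exists so that both variants can route ->setattr through ufs_setattr() (exported from truncate.c below). For contrast with the page-based table, a sketch of the fast path's follow_link, modeled on ufs_follow_link() in this file; reading the target needs no page-cache I/O at all:

#include <linux/namei.h>

static void *example_fast_follow_link(struct dentry *dentry,
				      struct nameidata *nd)
{
	struct ufs_inode_info *p = UFS_I(dentry->d_inode);

	nd_set_link(nd, (char *)p->i_u1.i_symlink);	/* in-inode target */
	return NULL;					/* nothing to put_link */
}
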
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index d3b6270cb377..a58f9155fc9a 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
48 47
49#include "ufs_fs.h" 48#include "ufs_fs.h"
50#include "ufs.h" 49#include "ufs.h"
@@ -244,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
244 ubh_bforget(ind_ubh); 243 ubh_bforget(ind_ubh);
245 ind_ubh = NULL; 244 ind_ubh = NULL;
246 } 245 }
247 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { 246 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
248 ubh_ll_rw_block(SWRITE, ind_ubh); 247 ubh_sync_block(ind_ubh);
249 ubh_wait_on_buffer (ind_ubh);
250 }
251 ubh_brelse (ind_ubh); 248 ubh_brelse (ind_ubh);
252 249
253 UFSD("EXIT: ino %lu\n", inode->i_ino); 250 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -308,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
308 ubh_bforget(dind_bh); 305 ubh_bforget(dind_bh);
309 dind_bh = NULL; 306 dind_bh = NULL;
310 } 307 }
311 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { 308 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
312 ubh_ll_rw_block(SWRITE, dind_bh); 309 ubh_sync_block(dind_bh);
313 ubh_wait_on_buffer (dind_bh);
314 }
315 ubh_brelse (dind_bh); 310 ubh_brelse (dind_bh);
316 311
317 UFSD("EXIT: ino %lu\n", inode->i_ino); 312 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -368,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
368 ubh_bforget(tind_bh); 363 ubh_bforget(tind_bh);
369 tind_bh = NULL; 364 tind_bh = NULL;
370 } 365 }
371 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { 366 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
372 ubh_ll_rw_block(SWRITE, tind_bh); 367 ubh_sync_block(tind_bh);
373 ubh_wait_on_buffer (tind_bh);
374 }
375 ubh_brelse (tind_bh); 368 ubh_brelse (tind_bh);
376 369
377 UFSD("EXIT: ino %lu\n", inode->i_ino); 370 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -501,14 +494,7 @@ out:
501 return err; 494 return err;
502} 495}
503 496
504 497int ufs_setattr(struct dentry *dentry, struct iattr *attr)
505/*
506 * We don't define our `inode->i_op->truncate', and call it here,
507 * because of:
508 * - there is no way to know old size
509 * - there is no way inform user about error, if it happens in `truncate'
510 */
511static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 498{
513 struct inode *inode = dentry->d_inode; 499 struct inode *inode = dentry->d_inode;
514 unsigned int ia_valid = attr->ia_valid; 500 unsigned int ia_valid = attr->ia_valid;
@@ -518,26 +504,20 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 504 if (error)
519 return error; 505 return error;
520 506
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 507 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr);
524 if (error)
525 return error;
526 }
527 if (ia_valid & ATTR_SIZE &&
528 attr->ia_size != i_size_read(inode)) {
529 loff_t old_i_size = inode->i_size; 508 loff_t old_i_size = inode->i_size;
530 509
531 dquot_initialize(inode); 510 /* XXX(truncate): truncate_setsize should be called last */
511 truncate_setsize(inode, attr->ia_size);
532 512
533 error = vmtruncate(inode, attr->ia_size);
534 if (error)
535 return error;
536 error = ufs_truncate(inode, old_i_size); 513 error = ufs_truncate(inode, old_i_size);
537 if (error) 514 if (error)
538 return error; 515 return error;
539 } 516 }
540 return inode_setattr(inode, attr); 517
518 setattr_copy(inode, attr);
519 mark_inode_dirty(inode);
520 return 0;
541} 521}
542 522
543const struct inode_operations ufs_file_inode_operations = { 523const struct inode_operations ufs_file_inode_operations = {
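
The rewritten ufs_setattr() follows the canonical 2.6.36 ->setattr shape: inode_setattr() and the implicit truncate path are gone, so the filesystem applies a size change by hand with truncate_setsize() and then copies the remaining attributes with setattr_copy(). A generic skeleton of that shape, with the UFS block-freeing step reduced to a comment:

#include <linux/fs.h>
#include <linux/mm.h>

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);	/* permission/limit checks */
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
		truncate_setsize(inode, attr->ia_size);	/* i_size + page cache */
		/* ... free on-disk blocks beyond the new size ... */
	}

	setattr_copy(inode, attr);	/* uid/gid/mode/timestamps */
	mark_inode_dirty(inode);
	return 0;
}

The XXX note in the hunk records a known ordering wart: ideally the in-core size and page cache would only be updated once the on-disk truncate can no longer fail.
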
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 43f9f5d5670e..c08782e1b48a 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -108,7 +108,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
108extern struct inode *ufs_iget(struct super_block *, unsigned long); 108extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, struct writeback_control *); 109extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 110extern int ufs_sync_inode (struct inode *);
111extern void ufs_delete_inode (struct inode *); 111extern void ufs_evict_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); 113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
114 114
@@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __
122 122
123/* symlink.c */ 123/* symlink.c */
124extern const struct inode_operations ufs_fast_symlink_inode_operations; 124extern const struct inode_operations ufs_fast_symlink_inode_operations;
125extern const struct inode_operations ufs_symlink_inode_operations;
125 126
126/* truncate.c */ 127/* truncate.c */
127extern int ufs_truncate (struct inode *, loff_t); 128extern int ufs_truncate (struct inode *, loff_t);
129extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
128 130
129static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) 131static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
130{ 132{
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4e..d2c36d53fe66 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
113 } 113 }
114} 114}
115 115
116void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh) 116void ubh_sync_block(struct ufs_buffer_head *ubh)
117{ 117{
118 if (!ubh) 118 if (ubh) {
119 return; 119 unsigned i;
120 120
121 ll_rw_block(rw, ubh->count, ubh->bh); 121 for (i = 0; i < ubh->count; i++)
122} 122 write_dirty_buffer(ubh->bh[i], WRITE);
123 123
124void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) 124 for (i = 0; i < ubh->count; i++)
125{ 125 wait_on_buffer(ubh->bh[i]);
126 unsigned i; 126 }
127 if (!ubh)
128 return;
129 for ( i = 0; i < ubh->count; i++ )
130 wait_on_buffer (ubh->bh[i]);
131} 127}
132 128
133void ubh_bforget (struct ufs_buffer_head * ubh) 129void ubh_bforget (struct ufs_buffer_head * ubh)
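
ubh_sync_block() collapses the old ubh_ll_rw_block(SWRITE, ...) + ubh_wait_on_buffer() pair: 2.6.36 removed the SWRITE lock-stealing mode from ll_rw_block(), and write_dirty_buffer() is its replacement (it takes the buffer lock and submits the buffer only if it is still dirty). The generic form of the new helper, for a plain buffer_head array:

#include <linux/buffer_head.h>

static void example_sync_buffers(struct buffer_head **bhs, unsigned count)
{
	unsigned i;

	for (i = 0; i < count; i++)
		write_dirty_buffer(bhs[i], WRITE);	/* lock, clear dirty, submit */

	for (i = 0; i < count; i++)
		wait_on_buffer(bhs[i]);			/* then wait for all I/O */
}

Submitting everything before waiting keeps the I/O batched exactly as the two old calls did.
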
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 23ceed8c8fb9..9f8775ce381c 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -257,9 +257,7 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value)
257 257
258extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); 258extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *);
259extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); 259extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t);
260extern int __ufs_write_begin(struct file *file, struct address_space *mapping, 260extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len);
261 loff_t pos, unsigned len, unsigned flags,
262 struct page **pagep, void **fsdata);
263 261
264/* 262/*
265 * These functions manipulate ufs buffers 263 * These functions manipulate ufs buffers
@@ -271,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
271extern void ubh_brelse_uspi (struct ufs_sb_private_info *); 269extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
272extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); 270extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
273extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); 271extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
274extern void ubh_ll_rw_block(int, struct ufs_buffer_head *); 272extern void ubh_sync_block(struct ufs_buffer_head *);
275extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
276extern void ubh_bforget (struct ufs_buffer_head *); 273extern void ubh_bforget (struct ufs_buffer_head *);
277extern int ubh_buffer_dirty (struct ufs_buffer_head *); 274extern int ubh_buffer_dirty (struct ufs_buffer_head *);
278#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) 275#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db5d373..179b58690657 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -126,7 +126,8 @@ out:
126 * must be owner or have write permission. 126 * must be owner or have write permission.
127 * Else, update from *times, must be owner or super user. 127 * Else, update from *times, must be owner or super user.
128 */ 128 */
129long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) 129long do_utimes(int dfd, const char __user *filename, struct timespec *times,
130 int flags)
130{ 131{
131 int error = -EINVAL; 132 int error = -EINVAL;
132 133
@@ -170,7 +171,7 @@ out:
170 return error; 171 return error;
171} 172}
172 173
173SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename, 174SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
174 struct timespec __user *, utimes, int, flags) 175 struct timespec __user *, utimes, int, flags)
175{ 176{
176 struct timespec tstimes[2]; 177 struct timespec tstimes[2];
@@ -188,7 +189,7 @@ SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
188 return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); 189 return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
189} 190}
190 191
191SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename, 192SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
192 struct timeval __user *, utimes) 193 struct timeval __user *, utimes)
193{ 194{
194 struct timeval times[2]; 195 struct timeval times[2];
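
The utimes.c changes are pure const-correctness: the syscall prototypes now declare filename as const char __user *, matching how the string is actually used. Nothing changes for user space; a small runnable illustration of the utimensat() entry point being constified, touching a hypothetical ./example.txt:

#define _POSIX_C_SOURCE 200809L
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <time.h>

int main(void)
{
	struct timespec ts[2] = {
		{ .tv_nsec = UTIME_OMIT },	/* [0] atime: leave untouched */
		{ .tv_nsec = UTIME_NOW },	/* [1] mtime: set to now */
	};

	if (utimensat(AT_FDCWD, "example.txt", ts, 0) != 0) {
		perror("utimensat");
		return 1;
	}
	return 0;
}
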
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
590/* 590/*
591 * Find the xattr_handler with the matching prefix. 591 * Find the xattr_handler with the matching prefix.
592 */ 592 */
593static struct xattr_handler * 593static const struct xattr_handler *
594xattr_resolve_name(struct xattr_handler **handlers, const char **name) 594xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
595{ 595{
596 struct xattr_handler *handler; 596 const struct xattr_handler *handler;
597 597
598 if (!*name) 598 if (!*name)
599 return NULL; 599 return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
614ssize_t 614ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 const struct xattr_handler *handler;
618 618
619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
620 if (!handler) 620 if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
629ssize_t 629ssize_t
630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
631{ 631{
632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; 632 const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
633 unsigned int size = 0; 633 unsigned int size = 0;
634 634
635 if (!buffer) { 635 if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
659int 659int
660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
661{ 661{
662 struct xattr_handler *handler; 662 const struct xattr_handler *handler;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
676int 676int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 const struct xattr_handler *handler;
680 680
681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
682 if (!handler) 682 if (!handler)
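
The xattr.c hunks constify the handler-resolution path: xattr_resolve_name() and the four generic_*xattr() entry points now take const struct xattr_handler pointers, so per-filesystem handler tables can live in read-only data. The matching table shape on the filesystem side, with hypothetical example_get/example_set callbacks:

#include <linux/errno.h>
#include <linux/xattr.h>

static int example_get(struct dentry *dentry, const char *name,
		       void *buffer, size_t size, int handler_flags)
{
	return -EOPNOTSUPP;	/* stub */
}

static int example_set(struct dentry *dentry, const char *name,
		       const void *value, size_t size, int flags,
		       int handler_flags)
{
	return -EOPNOTSUPP;	/* stub */
}

static const struct xattr_handler example_user_handler = {
	.prefix	= XATTR_USER_PREFIX,	/* "user." */
	.get	= example_get,
	.set	= example_set,
};

/* NULL-terminated, fully const table, assignable to sb->s_xattr. */
const struct xattr_handler *example_xattr_handlers[] = {
	&example_user_handler,
	NULL,
};
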
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..0dce969d6cad 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
@@ -86,11 +87,9 @@ xfs-y += xfs_alloc.o \
86 xfs_trans_buf.o \ 87 xfs_trans_buf.o \
87 xfs_trans_extfree.o \ 88 xfs_trans_extfree.o \
88 xfs_trans_inode.o \ 89 xfs_trans_inode.o \
89 xfs_trans_item.o \
90 xfs_utils.o \ 90 xfs_utils.o \
91 xfs_vnodeops.o \ 91 xfs_vnodeops.o \
92 xfs_rw.o \ 92 xfs_rw.o
93 xfs_dmops.o
94 93
95xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o 94xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o
96 95
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index a7bc925c4d60..b2771862fd3d 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -225,7 +225,7 @@ xfs_check_acl(struct inode *inode, int mask)
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 xfs_itrace_entry(ip); 228 trace_xfs_check_acl(ip);
229 229
230 /* 230 /*
231 * If there is no attribute fork no ACL exists on this inode and 231 * If there is no attribute fork no ACL exists on this inode and
@@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
440 return error; 440 return error;
441} 441}
442 442
443struct xattr_handler xfs_xattr_acl_access_handler = { 443const struct xattr_handler xfs_xattr_acl_access_handler = {
444 .prefix = POSIX_ACL_XATTR_ACCESS, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
445 .flags = ACL_TYPE_ACCESS, 445 .flags = ACL_TYPE_ACCESS,
446 .get = xfs_xattr_acl_get, 446 .get = xfs_xattr_acl_get,
447 .set = xfs_xattr_acl_set, 447 .set = xfs_xattr_acl_set,
448}; 448};
449 449
450struct xattr_handler xfs_xattr_acl_default_handler = { 450const struct xattr_handler xfs_xattr_acl_default_handler = {
451 .prefix = POSIX_ACL_XATTR_DEFAULT, 451 .prefix = POSIX_ACL_XATTR_DEFAULT,
452 .flags = ACL_TYPE_DEFAULT, 452 .flags = ACL_TYPE_DEFAULT,
453 .get = xfs_xattr_acl_get, 453 .get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 0f8b9968a803..b552f816de15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,19 +21,12 @@
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_ag.h" 23#include "xfs_ag.h"
24#include "xfs_dir2.h"
25#include "xfs_trans.h" 24#include "xfs_trans.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
29#include "xfs_alloc_btree.h"
30#include "xfs_ialloc_btree.h"
31#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 27#include "xfs_dinode.h"
34#include "xfs_inode.h" 28#include "xfs_inode.h"
35#include "xfs_alloc.h" 29#include "xfs_alloc.h"
36#include "xfs_btree.h"
37#include "xfs_error.h" 30#include "xfs_error.h"
38#include "xfs_rw.h" 31#include "xfs_rw.h"
39#include "xfs_iomap.h" 32#include "xfs_iomap.h"
@@ -45,6 +38,15 @@
45#include <linux/pagevec.h> 38#include <linux/pagevec.h>
46#include <linux/writeback.h> 39#include <linux/writeback.h>
47 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
48 50
49/* 51/*
50 * Prime number of hash buckets since address is used as the key. 52 * Prime number of hash buckets since address is used as the key.
@@ -83,18 +85,15 @@ void
83xfs_count_page_state( 85xfs_count_page_state(
84 struct page *page, 86 struct page *page,
85 int *delalloc, 87 int *delalloc,
86 int *unmapped,
87 int *unwritten) 88 int *unwritten)
88{ 89{
89 struct buffer_head *bh, *head; 90 struct buffer_head *bh, *head;
90 91
91 *delalloc = *unmapped = *unwritten = 0; 92 *delalloc = *unwritten = 0;
92 93
93 bh = head = page_buffers(page); 94 bh = head = page_buffers(page);
94 do { 95 do {
95 if (buffer_uptodate(bh) && !buffer_mapped(bh)) 96 if (buffer_unwritten(bh))
96 (*unmapped) = 1;
97 else if (buffer_unwritten(bh))
98 (*unwritten) = 1; 97 (*unwritten) = 1;
99 else if (buffer_delay(bh)) 98 else if (buffer_delay(bh))
100 (*delalloc) = 1; 99 (*delalloc) = 1;
@@ -103,8 +102,9 @@ xfs_count_page_state(
103 102
104STATIC struct block_device * 103STATIC struct block_device *
105xfs_find_bdev_for_inode( 104xfs_find_bdev_for_inode(
106 struct xfs_inode *ip) 105 struct inode *inode)
107{ 106{
107 struct xfs_inode *ip = XFS_I(inode);
108 struct xfs_mount *mp = ip->i_mount; 108 struct xfs_mount *mp = ip->i_mount;
109 109
110 if (XFS_IS_REALTIME_INODE(ip)) 110 if (XFS_IS_REALTIME_INODE(ip))
@@ -183,7 +183,7 @@ xfs_setfilesize(
183 xfs_fsize_t isize; 183 xfs_fsize_t isize;
184 184
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IOMAP_READ); 186 ASSERT(ioend->io_type != IO_READ);
187 187
188 if (unlikely(ioend->io_error)) 188 if (unlikely(ioend->io_error))
189 return 0; 189 return 0;
@@ -202,23 +202,17 @@ xfs_setfilesize(
202} 202}
203 203
204/* 204/*
205 * Schedule IO completion handling on a xfsdatad if this was 205 * Schedule IO completion handling on the final put of an ioend.
206 * the final hold on this ioend. If we are asked to wait,
207 * flush the workqueue.
208 */ 206 */
209STATIC void 207STATIC void
210xfs_finish_ioend( 208xfs_finish_ioend(
211 xfs_ioend_t *ioend, 209 struct xfs_ioend *ioend)
212 int wait)
213{ 210{
214 if (atomic_dec_and_test(&ioend->io_remaining)) { 211 if (atomic_dec_and_test(&ioend->io_remaining)) {
215 struct workqueue_struct *wq; 212 if (ioend->io_type == IO_UNWRITTEN)
216 213 queue_work(xfsconvertd_workqueue, &ioend->io_work);
217 wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 214 else
218 xfsconvertd_workqueue : xfsdatad_workqueue; 215 queue_work(xfsdatad_workqueue, &ioend->io_work);
219 queue_work(wq, &ioend->io_work);
220 if (wait)
221 flush_workqueue(wq);
222 } 216 }
223} 217}
224 218
@@ -237,7 +231,7 @@ xfs_end_io(
237 * For unwritten extents we need to issue transactions to convert a 231 * For unwritten extents we need to issue transactions to convert a
238 * range to normal written extents after the data I/O has finished. 232 * range to normal written extents after the data I/O has finished.
239 */ 233 */
240 if (ioend->io_type == IOMAP_UNWRITTEN && 234 if (ioend->io_type == IO_UNWRITTEN &&
241 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 235 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
242 236
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 237 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -250,7 +244,7 @@ xfs_end_io(
250 * We might have to update the on-disk file size after extending 244 * We might have to update the on-disk file size after extending
251 * writes. 245 * writes.
252 */ 246 */
253 if (ioend->io_type != IOMAP_READ) { 247 if (ioend->io_type != IO_READ) {
254 error = xfs_setfilesize(ioend); 248 error = xfs_setfilesize(ioend);
255 ASSERT(!error || error == EAGAIN); 249 ASSERT(!error || error == EAGAIN);
256 } 250 }
@@ -262,11 +256,25 @@ xfs_end_io(
262 */ 256 */
263 if (error == EAGAIN) { 257 if (error == EAGAIN) {
264 atomic_inc(&ioend->io_remaining); 258 atomic_inc(&ioend->io_remaining);
265 xfs_finish_ioend(ioend, 0); 259 xfs_finish_ioend(ioend);
266 /* ensure we don't spin on blocked ioends */ 260 /* ensure we don't spin on blocked ioends */
267 delay(1); 261 delay(1);
268 } else 262 } else {
263 if (ioend->io_iocb)
264 aio_complete(ioend->io_iocb, ioend->io_result, 0);
269 xfs_destroy_ioend(ioend); 265 xfs_destroy_ioend(ioend);
266 }
267}
268
269/*
270 * Call IO completion handling in caller context on the final put of an ioend.
271 */
272STATIC void
273xfs_finish_ioend_sync(
274 struct xfs_ioend *ioend)
275{
276 if (atomic_dec_and_test(&ioend->io_remaining))
277 xfs_end_io(&ioend->io_work);
270} 278}
271 279
272/* 280/*
@@ -299,6 +307,8 @@ xfs_alloc_ioend(
299 atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); 307 atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
300 ioend->io_offset = 0; 308 ioend->io_offset = 0;
301 ioend->io_size = 0; 309 ioend->io_size = 0;
310 ioend->io_iocb = NULL;
311 ioend->io_result = 0;
302 312
303 INIT_WORK(&ioend->io_work, xfs_end_io); 313 INIT_WORK(&ioend->io_work, xfs_end_io);
304 return ioend; 314 return ioend;
@@ -309,21 +319,25 @@ xfs_map_blocks(
309 struct inode *inode, 319 struct inode *inode,
310 loff_t offset, 320 loff_t offset,
311 ssize_t count, 321 ssize_t count,
312 xfs_iomap_t *mapp, 322 struct xfs_bmbt_irec *imap,
313 int flags) 323 int flags)
314{ 324{
315 int nmaps = 1; 325 int nmaps = 1;
326 int new = 0;
316 327
317 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
318} 329}
319 330
320STATIC int 331STATIC int
321xfs_iomap_valid( 332xfs_imap_valid(
322 xfs_iomap_t *iomapp, 333 struct inode *inode,
323 loff_t offset) 334 struct xfs_bmbt_irec *imap,
335 xfs_off_t offset)
324{ 336{
325 return offset >= iomapp->iomap_offset && 337 offset >>= inode->i_blkbits;
326 offset < iomapp->iomap_offset + iomapp->iomap_bsize; 338
339 return offset >= imap->br_startoff &&
340 offset < imap->br_startoff + imap->br_blockcount;
327} 341}
328 342
329/* 343/*
@@ -344,7 +358,7 @@ xfs_end_bio(
344 bio->bi_end_io = NULL; 358 bio->bi_end_io = NULL;
345 bio_put(bio); 359 bio_put(bio);
346 360
347 xfs_finish_ioend(ioend, 0); 361 xfs_finish_ioend(ioend);
348} 362}
349 363
350STATIC void 364STATIC void
@@ -486,7 +500,7 @@ xfs_submit_ioend(
486 } 500 }
487 if (bio) 501 if (bio)
488 xfs_submit_ioend_bio(wbc, ioend, bio); 502 xfs_submit_ioend_bio(wbc, ioend, bio);
489 xfs_finish_ioend(ioend, 0); 503 xfs_finish_ioend(ioend);
490 } while ((ioend = next) != NULL); 504 } while ((ioend = next) != NULL);
491} 505}
492 506
@@ -554,19 +568,23 @@ xfs_add_to_ioend(
554 568
555STATIC void 569STATIC void
556xfs_map_buffer( 570xfs_map_buffer(
571 struct inode *inode,
557 struct buffer_head *bh, 572 struct buffer_head *bh,
558 xfs_iomap_t *mp, 573 struct xfs_bmbt_irec *imap,
559 xfs_off_t offset, 574 xfs_off_t offset)
560 uint block_bits)
561{ 575{
562 sector_t bn; 576 sector_t bn;
577 struct xfs_mount *m = XFS_I(inode)->i_mount;
578 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
579 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
563 580
564 ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); 581 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
565 583
566 bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + 584 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
567 ((offset - mp->iomap_offset) >> block_bits); 585 ((offset - iomap_offset) >> inode->i_blkbits);
568 586
569 ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); 587 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
570 588
571 bh->b_blocknr = bn; 589 bh->b_blocknr = bn;
572 set_buffer_mapped(bh); 590 set_buffer_mapped(bh);
@@ -574,17 +592,17 @@ xfs_map_buffer(
574 592
575STATIC void 593STATIC void
576xfs_map_at_offset( 594xfs_map_at_offset(
595 struct inode *inode,
577 struct buffer_head *bh, 596 struct buffer_head *bh,
578 loff_t offset, 597 struct xfs_bmbt_irec *imap,
579 int block_bits, 598 xfs_off_t offset)
580 xfs_iomap_t *iomapp)
581{ 599{
582 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 600 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
583 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
584 602
585 lock_buffer(bh); 603 lock_buffer(bh);
586 xfs_map_buffer(bh, iomapp, offset, block_bits); 604 xfs_map_buffer(inode, bh, imap, offset);
587 bh->b_bdev = iomapp->iomap_target->bt_bdev; 605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
588 set_buffer_mapped(bh); 606 set_buffer_mapped(bh);
589 clear_buffer_delay(bh); 607 clear_buffer_delay(bh);
590 clear_buffer_unwritten(bh); 608 clear_buffer_unwritten(bh);
@@ -596,31 +614,30 @@ xfs_map_at_offset(
596STATIC unsigned int 614STATIC unsigned int
597xfs_probe_page( 615xfs_probe_page(
598 struct page *page, 616 struct page *page,
599 unsigned int pg_offset, 617 unsigned int pg_offset)
600 int mapped)
601{ 618{
619 struct buffer_head *bh, *head;
602 int ret = 0; 620 int ret = 0;
603 621
604 if (PageWriteback(page)) 622 if (PageWriteback(page))
605 return 0; 623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
606 630
607 if (page->mapping && PageDirty(page)) { 631 bh = head = page_buffers(page);
608 if (page_has_buffers(page)) { 632 do {
609 struct buffer_head *bh, *head; 633 if (!buffer_uptodate(bh))
610 634 break;
611 bh = head = page_buffers(page); 635 if (!buffer_mapped(bh))
612 do { 636 break;
613 if (!buffer_uptodate(bh)) 637 ret += bh->b_size;
614 break; 638 if (ret >= pg_offset)
615 if (mapped != buffer_mapped(bh)) 639 break;
616 break; 640 } while ((bh = bh->b_this_page) != head);
617 ret += bh->b_size;
618 if (ret >= pg_offset)
619 break;
620 } while ((bh = bh->b_this_page) != head);
621 } else
622 ret = mapped ? 0 : PAGE_CACHE_SIZE;
623 }
624 641
625 return ret; 642 return ret;
626} 643}
@@ -630,8 +647,7 @@ xfs_probe_cluster(
630 struct inode *inode, 647 struct inode *inode,
631 struct page *startpage, 648 struct page *startpage,
632 struct buffer_head *bh, 649 struct buffer_head *bh,
633 struct buffer_head *head, 650 struct buffer_head *head)
634 int mapped)
635{ 651{
636 struct pagevec pvec; 652 struct pagevec pvec;
637 pgoff_t tindex, tlast, tloff; 653 pgoff_t tindex, tlast, tloff;
@@ -640,7 +656,7 @@ xfs_probe_cluster(
640 656
641 /* First sum forwards in this page */ 657 /* First sum forwards in this page */
642 do { 658 do {
643 if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) 659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
644 return total; 660 return total;
645 total += bh->b_size; 661 total += bh->b_size;
646 } while ((bh = bh->b_this_page) != head); 662 } while ((bh = bh->b_this_page) != head);
@@ -674,7 +690,7 @@ xfs_probe_cluster(
674 pg_offset = PAGE_CACHE_SIZE; 690 pg_offset = PAGE_CACHE_SIZE;
675 691
676 if (page->index == tindex && trylock_page(page)) { 692 if (page->index == tindex && trylock_page(page)) {
677 pg_len = xfs_probe_page(page, pg_offset, mapped); 693 pg_len = xfs_probe_page(page, pg_offset);
678 unlock_page(page); 694 unlock_page(page);
679 } 695 }
680 696
@@ -713,11 +729,11 @@ xfs_is_delayed_page(
713 bh = head = page_buffers(page); 729 bh = head = page_buffers(page);
714 do { 730 do {
715 if (buffer_unwritten(bh)) 731 if (buffer_unwritten(bh))
716 acceptable = (type == IOMAP_UNWRITTEN); 732 acceptable = (type == IO_UNWRITTEN);
717 else if (buffer_delay(bh)) 733 else if (buffer_delay(bh))
718 acceptable = (type == IOMAP_DELAY); 734 acceptable = (type == IO_DELAY);
719 else if (buffer_dirty(bh) && buffer_mapped(bh)) 735 else if (buffer_dirty(bh) && buffer_mapped(bh))
720 acceptable = (type == IOMAP_NEW); 736 acceptable = (type == IO_NEW);
721 else 737 else
722 break; 738 break;
723 } while ((bh = bh->b_this_page) != head); 739 } while ((bh = bh->b_this_page) != head);
@@ -740,17 +756,15 @@ xfs_convert_page(
740 struct inode *inode, 756 struct inode *inode,
741 struct page *page, 757 struct page *page,
742 loff_t tindex, 758 loff_t tindex,
743 xfs_iomap_t *mp, 759 struct xfs_bmbt_irec *imap,
744 xfs_ioend_t **ioendp, 760 xfs_ioend_t **ioendp,
745 struct writeback_control *wbc, 761 struct writeback_control *wbc,
746 int startio,
747 int all_bh) 762 int all_bh)
748{ 763{
749 struct buffer_head *bh, *head; 764 struct buffer_head *bh, *head;
750 xfs_off_t end_offset; 765 xfs_off_t end_offset;
751 unsigned long p_offset; 766 unsigned long p_offset;
752 unsigned int type; 767 unsigned int type;
753 int bbits = inode->i_blkbits;
754 int len, page_dirty; 768 int len, page_dirty;
755 int count = 0, done = 0, uptodate = 1; 769 int count = 0, done = 0, uptodate = 1;
756 xfs_off_t offset = page_offset(page); 770 xfs_off_t offset = page_offset(page);
@@ -802,32 +816,27 @@ xfs_convert_page(
802 816
803 if (buffer_unwritten(bh) || buffer_delay(bh)) { 817 if (buffer_unwritten(bh) || buffer_delay(bh)) {
804 if (buffer_unwritten(bh)) 818 if (buffer_unwritten(bh))
805 type = IOMAP_UNWRITTEN; 819 type = IO_UNWRITTEN;
806 else 820 else
807 type = IOMAP_DELAY; 821 type = IO_DELAY;
808 822
809 if (!xfs_iomap_valid(mp, offset)) { 823 if (!xfs_imap_valid(inode, imap, offset)) {
810 done = 1; 824 done = 1;
811 continue; 825 continue;
812 } 826 }
813 827
814 ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); 828 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
815 ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); 829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
830
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done);
816 834
817 xfs_map_at_offset(bh, offset, bbits, mp);
818 if (startio) {
819 xfs_add_to_ioend(inode, bh, offset,
820 type, ioendp, done);
821 } else {
822 set_buffer_dirty(bh);
823 unlock_buffer(bh);
824 mark_buffer_dirty(bh);
825 }
826 page_dirty--; 835 page_dirty--;
827 count++; 836 count++;
828 } else { 837 } else {
829 type = IOMAP_NEW; 838 type = IO_NEW;
830 if (buffer_mapped(bh) && all_bh && startio) { 839 if (buffer_mapped(bh) && all_bh) {
831 lock_buffer(bh); 840 lock_buffer(bh);
832 xfs_add_to_ioend(inode, bh, offset, 841 xfs_add_to_ioend(inode, bh, offset,
833 type, ioendp, done); 842 type, ioendp, done);
@@ -842,14 +851,12 @@ xfs_convert_page(
842 if (uptodate && bh == head) 851 if (uptodate && bh == head)
843 SetPageUptodate(page); 852 SetPageUptodate(page);
844 853
845 if (startio) { 854 if (count) {
846 if (count) { 855 if (--wbc->nr_to_write <= 0 &&
847 wbc->nr_to_write--; 856 wbc->sync_mode == WB_SYNC_NONE)
848 if (wbc->nr_to_write <= 0) 857 done = 1;
849 done = 1;
850 }
851 xfs_start_page_writeback(page, !page_dirty, count);
852 } 858 }
859 xfs_start_page_writeback(page, !page_dirty, count);
853 860
854 return done; 861 return done;
855 fail_unlock_page: 862 fail_unlock_page:
@@ -866,10 +873,9 @@ STATIC void
866xfs_cluster_write( 873xfs_cluster_write(
867 struct inode *inode, 874 struct inode *inode,
868 pgoff_t tindex, 875 pgoff_t tindex,
869 xfs_iomap_t *iomapp, 876 struct xfs_bmbt_irec *imap,
870 xfs_ioend_t **ioendp, 877 xfs_ioend_t **ioendp,
871 struct writeback_control *wbc, 878 struct writeback_control *wbc,
872 int startio,
873 int all_bh, 879 int all_bh,
874 pgoff_t tlast) 880 pgoff_t tlast)
875{ 881{
@@ -885,7 +891,7 @@ xfs_cluster_write(
885 891
886 for (i = 0; i < pagevec_count(&pvec); i++) { 892 for (i = 0; i < pagevec_count(&pvec); i++) {
887 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 893 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
888 iomapp, ioendp, wbc, startio, all_bh); 894 imap, ioendp, wbc, all_bh);
889 if (done) 895 if (done)
890 break; 896 break;
891 } 897 }
@@ -930,7 +936,7 @@ xfs_aops_discard_page(
930 loff_t offset = page_offset(page); 936 loff_t offset = page_offset(page);
931 ssize_t len = 1 << inode->i_blkbits; 937 ssize_t len = 1 << inode->i_blkbits;
932 938
933 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 939 if (!xfs_is_delayed_page(page, IO_DELAY))
934 goto out_invalidate; 940 goto out_invalidate;
935 941
936 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 942 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -964,7 +970,7 @@ xfs_aops_discard_page(
964 */ 970 */
965 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
966 XFS_BMAPI_ENTIRE, NULL, 0, &imap, 972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
967 &nimaps, NULL, NULL); 973 &nimaps, NULL);
968 974
969 if (error) { 975 if (error) {
970 /* something screwed, just bail */ 976 /* something screwed, just bail */
@@ -992,7 +998,7 @@ xfs_aops_discard_page(
992 */ 998 */
993 xfs_bmap_init(&flist, &firstblock); 999 xfs_bmap_init(&flist, &firstblock);
994 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, 1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
995 &flist, NULL, &done); 1001 &flist, &done);
996 1002
997 ASSERT(!flist.xbf_count && !flist.xbf_first); 1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
998 if (error) { 1004 if (error) {
@@ -1015,50 +1021,66 @@ out_invalidate:
1015} 1021}
1016 1022
1017/* 1023/*
1018 * Calling this without startio set means we are being asked to make a dirty 1024 * Write out a dirty page.
1019 * page ready for freeing its buffers. When called with startio set then
1020 * we are coming from writepage.
1021 * 1025 *
1022 * When called with startio set it is important that we write the WHOLE 1026 * For delalloc space on the page we need to allocate space and flush it.
1023 * page if possible. 1027 * For unwritten space on the page we need to start the conversion to
1024 * The bh->b_state's cannot know if any of the blocks or which block for 1028 * regular allocated space.
1025 * that matter are dirty due to mmap writes, and therefore bh uptodate is 1029 * For any other dirty buffer heads on the page we should flush them.
1026 * only valid if the page itself isn't completely uptodate. Some layers 1030 *
1027 * may clear the page dirty flag prior to calling write page, under the 1031 * If we detect that a transaction would be required to flush the page, we
1028 * assumption the entire page will be written out; by not writing out the 1032 * have to check the process flags first, if we are already in a transaction
1029 * whole page the page can be reused before all valid dirty data is 1033 * or disk I/O during allocations is off, we need to fail the writepage and
1030 * written out. Note: in the case of a page that has been dirtied by 1034 * redirty the page.
1031 * mapwrite but only partially set up by block_prepare_write, the
1032 * bh->b_state flags will not agree, and only the ones set up by BPW/BCW will have
1033 * valid state; thus the whole page must be written out.
1034 */ 1035 */
1035
1036STATIC int 1036STATIC int
1037xfs_page_state_convert( 1037xfs_vm_writepage(
1038 struct inode *inode, 1038 struct page *page,
1039 struct page *page, 1039 struct writeback_control *wbc)
1040 struct writeback_control *wbc,
1041 int startio,
1042 int unmapped) /* also implies page uptodate */
1043{ 1040{
1041 struct inode *inode = page->mapping->host;
1042 int delalloc, unwritten;
1044 struct buffer_head *bh, *head; 1043 struct buffer_head *bh, *head;
1045 xfs_iomap_t iomap; 1044 struct xfs_bmbt_irec imap;
1046 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1045 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1047 loff_t offset; 1046 loff_t offset;
1048 unsigned long p_offset = 0;
1049 unsigned int type; 1047 unsigned int type;
1050 __uint64_t end_offset; 1048 __uint64_t end_offset;
1051 pgoff_t end_index, last_index, tlast; 1049 pgoff_t end_index, last_index;
1052 ssize_t size, len; 1050 ssize_t size, len;
1053 int flags, err, iomap_valid = 0, uptodate = 1; 1051 int flags, err, imap_valid = 0, uptodate = 1;
1054 int page_dirty, count = 0; 1052 int count = 0;
1055 int trylock = 0; 1053 int all_bh = 0;
1056 int all_bh = unmapped; 1054
1057 1055 trace_xfs_writepage(inode, page, 0);
1058 if (startio) { 1056
1059 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) 1057 ASSERT(page_has_buffers(page));
1060 trylock |= BMAPI_TRYLOCK; 1058
1061 } 1059 /*
1060 * Refuse to write the page out if we are called from reclaim context.
1061 *
1062 * This avoids stack overflows when called from deeply used stacks in
1063 * random callers for direct reclaim or memcg reclaim. We explicitly
1064 * allow reclaim from kswapd as the stack usage there is relatively low.
1065 *
1066 * This should really be done by the core VM, but until that happens
1067 * filesystems like XFS, btrfs and ext4 have to take care of this
1068 * by themselves.
1069 */
1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
1071 goto redirty;
1072
1073 /*
1074 * We need a transaction if there are delalloc or unwritten buffers
1075 * on the page.
1076 *
1077 * If we need a transaction and the process flags say we are already
1078 * in a transaction, or no IO is allowed then mark the page dirty
1079 * again and leave the page as is.
1080 */
1081 xfs_count_page_state(page, &delalloc, &unwritten);
1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
1083 goto redirty;
1062 1084
1063 /* Is this page beyond the end of the file? */ 1085 /* Is this page beyond the end of the file? */
1064 offset = i_size_read(inode); 1086 offset = i_size_read(inode);
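/*
 * Illustrative sketch (not part of the patch): the reclaim guard added
 * above reduces to this flag test. PF_MEMALLOC is set for every task
 * doing memory reclaim; kswapd additionally runs with PF_KSWAPD set, so
 * the test is true only for direct or memcg reclaim, and kswapd is
 * still allowed to issue writeback. The helper name is hypothetical.
 */
static inline bool xfs_called_from_direct_reclaim(void)
{
	return (current->flags & (PF_MEMALLOC | PF_KSWAPD)) == PF_MEMALLOC;
}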
@@ -1067,92 +1089,64 @@ xfs_page_state_convert(
1067 if (page->index >= end_index) { 1089 if (page->index >= end_index) {
1068 if ((page->index >= end_index + 1) || 1090 if ((page->index >= end_index + 1) ||
1069 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { 1091 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
1070 if (startio) 1092 unlock_page(page);
1071 unlock_page(page);
1072 return 0; 1093 return 0;
1073 } 1094 }
1074 } 1095 }
1075 1096
1076 /*
1077 * page_dirty is initially a count of buffers on the page before
1078 * EOF and is decremented as we move each into a cleanable state.
1079 *
1080 * Derivation:
1081 *
1082 * End offset is the highest offset that this page should represent.
1083 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
1084 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
1085 * hence give us the correct page_dirty count. On any other page,
1086 * it will be zero and in that case we need page_dirty to be the
1087 * count of buffers on the page.
1088 */
1089 end_offset = min_t(unsigned long long, 1097 end_offset = min_t(unsigned long long,
1090 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); 1098 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
1099 offset);
1091 len = 1 << inode->i_blkbits; 1100 len = 1 << inode->i_blkbits;
1092 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
1093 PAGE_CACHE_SIZE);
1094 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
1095 page_dirty = p_offset / len;
1096 1101
1097 bh = head = page_buffers(page); 1102 bh = head = page_buffers(page);
1098 offset = page_offset(page); 1103 offset = page_offset(page);
1099 flags = BMAPI_READ; 1104 flags = BMAPI_READ;
1100 type = IOMAP_NEW; 1105 type = IO_NEW;
1101
1102 /* TODO: cleanup count and page_dirty */
1103 1106
1104 do { 1107 do {
1105 if (offset >= end_offset) 1108 if (offset >= end_offset)
1106 break; 1109 break;
1107 if (!buffer_uptodate(bh)) 1110 if (!buffer_uptodate(bh))
1108 uptodate = 0; 1111 uptodate = 0;
1109 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { 1112
1110 /* 1113 /*
1111 * the iomap is actually still valid, but the ioend 1114 * A hole may still be marked uptodate because discard_buffer
1112 * isn't. Shouldn't happen too often. 1115 * leaves the flag set.
1113 */ 1116 */
1114 iomap_valid = 0; 1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0;
1115 continue; 1120 continue;
1116 } 1121 }
1117 1122
1118 if (iomap_valid) 1123 if (imap_valid)
1119 iomap_valid = xfs_iomap_valid(&iomap, offset); 1124 imap_valid = xfs_imap_valid(inode, &imap, offset);
1120 1125
1121 /* 1126 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1122 * First case, map an unwritten extent and prepare for
1123 * extent state conversion transaction on completion.
1124 *
1125 * Second case, allocate space for a delalloc buffer.
1126 * We can return EAGAIN here in the release page case.
1127 *
1128 * Third case, an unmapped buffer was found, and we are
1129 * in a path where we need to write the whole page out.
1130 */
1131 if (buffer_unwritten(bh) || buffer_delay(bh) ||
1132 ((buffer_uptodate(bh) || PageUptodate(page)) &&
1133 !buffer_mapped(bh) && (unmapped || startio))) {
1134 int new_ioend = 0; 1127 int new_ioend = 0;
1135 1128
1136 /* 1129 /*
1137 * Make sure we don't use a read-only iomap 1130 * Make sure we don't use a read-only iomap
1138 */ 1131 */
1139 if (flags == BMAPI_READ) 1132 if (flags == BMAPI_READ)
1140 iomap_valid = 0; 1133 imap_valid = 0;
1141 1134
1142 if (buffer_unwritten(bh)) { 1135 if (buffer_unwritten(bh)) {
1143 type = IOMAP_UNWRITTEN; 1136 type = IO_UNWRITTEN;
1144 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1137 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1145 } else if (buffer_delay(bh)) { 1138 } else if (buffer_delay(bh)) {
1146 type = IOMAP_DELAY; 1139 type = IO_DELAY;
1147 flags = BMAPI_ALLOCATE | trylock; 1140 flags = BMAPI_ALLOCATE;
1148 } else { 1141
1149 type = IOMAP_NEW; 1142 if (wbc->sync_mode == WB_SYNC_NONE &&
1150 flags = BMAPI_WRITE | BMAPI_MMAP; 1143 wbc->nonblocking)
1144 flags |= BMAPI_TRYLOCK;
1151 } 1145 }
1152 1146
1153 if (!iomap_valid) { 1147 if (!imap_valid) {
1154 /* 1148 /*
1155 * if we didn't have a valid mapping then we 1149 * If we didn't have a valid mapping then we
1156 * need to ensure that we put the new mapping 1150 * need to ensure that we put the new mapping
1157 * in a new ioend structure. This needs to be 1151 * in a new ioend structure. This needs to be
1158 * done to ensure that the ioends correctly 1152 * done to ensure that the ioends correctly
@@ -1160,74 +1154,57 @@ xfs_page_state_convert(
1160 * for unwritten extent conversion. 1154 * for unwritten extent conversion.
1161 */ 1155 */
1162 new_ioend = 1; 1156 new_ioend = 1;
1163 if (type == IOMAP_NEW) { 1157 err = xfs_map_blocks(inode, offset, len,
1164 size = xfs_probe_cluster(inode, 1158 &imap, flags);
1165 page, bh, head, 0);
1166 } else {
1167 size = len;
1168 }
1169
1170 err = xfs_map_blocks(inode, offset, size,
1171 &iomap, flags);
1172 if (err) 1159 if (err)
1173 goto error; 1160 goto error;
1174 iomap_valid = xfs_iomap_valid(&iomap, offset); 1161 imap_valid = xfs_imap_valid(inode, &imap,
1162 offset);
1175 } 1163 }
1176 if (iomap_valid) { 1164 if (imap_valid) {
1177 xfs_map_at_offset(bh, offset, 1165 xfs_map_at_offset(inode, bh, &imap, offset);
1178 inode->i_blkbits, &iomap); 1166 xfs_add_to_ioend(inode, bh, offset, type,
1179 if (startio) { 1167 &ioend, new_ioend);
1180 xfs_add_to_ioend(inode, bh, offset,
1181 type, &ioend,
1182 new_ioend);
1183 } else {
1184 set_buffer_dirty(bh);
1185 unlock_buffer(bh);
1186 mark_buffer_dirty(bh);
1187 }
1188 page_dirty--;
1189 count++; 1168 count++;
1190 } 1169 }
1191 } else if (buffer_uptodate(bh) && startio) { 1170 } else if (buffer_uptodate(bh)) {
1192 /* 1171 /*
1193 * we got here because the buffer is already mapped. 1172 * we got here because the buffer is already mapped.
1194 * That means it must already have extents allocated 1173 * That means it must already have extents allocated
1195 * underneath it. Map the extent by reading it. 1174 * underneath it. Map the extent by reading it.
1196 */ 1175 */
1197 if (!iomap_valid || flags != BMAPI_READ) { 1176 if (!imap_valid || flags != BMAPI_READ) {
1198 flags = BMAPI_READ; 1177 flags = BMAPI_READ;
1199 size = xfs_probe_cluster(inode, page, bh, 1178 size = xfs_probe_cluster(inode, page, bh, head);
1200 head, 1);
1201 err = xfs_map_blocks(inode, offset, size, 1179 err = xfs_map_blocks(inode, offset, size,
1202 &iomap, flags); 1180 &imap, flags);
1203 if (err) 1181 if (err)
1204 goto error; 1182 goto error;
1205 iomap_valid = xfs_iomap_valid(&iomap, offset); 1183 imap_valid = xfs_imap_valid(inode, &imap,
1184 offset);
1206 } 1185 }
1207 1186
1208 /* 1187 /*
1209 * We set the type to IOMAP_NEW in case we are doing a 1188 * We set the type to IO_NEW in case we are doing a
1210 * small write at EOF that is extending the file but 1189 * small write at EOF that is extending the file but
1211 * without needing an allocation. We need to update the 1190 * without needing an allocation. We need to update the
1212 * file size on I/O completion in this case so it is 1191 * file size on I/O completion in this case so it is
1213 * the same case as having just allocated a new extent 1192 * the same case as having just allocated a new extent
1214 * that we are writing into for the first time. 1193 * that we are writing into for the first time.
1215 */ 1194 */
1216 type = IOMAP_NEW; 1195 type = IO_NEW;
1217 if (trylock_buffer(bh)) { 1196 if (trylock_buffer(bh)) {
1218 ASSERT(buffer_mapped(bh)); 1197 if (imap_valid)
1219 if (iomap_valid)
1220 all_bh = 1; 1198 all_bh = 1;
1221 xfs_add_to_ioend(inode, bh, offset, type, 1199 xfs_add_to_ioend(inode, bh, offset, type,
1222 &ioend, !iomap_valid); 1200 &ioend, !imap_valid);
1223 page_dirty--;
1224 count++; 1201 count++;
1225 } else { 1202 } else {
1226 iomap_valid = 0; 1203 imap_valid = 0;
1227 } 1204 }
1228 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1205 } else if (PageUptodate(page)) {
1229 (unmapped || startio)) { 1206 ASSERT(buffer_mapped(bh));
1230 iomap_valid = 0; 1207 imap_valid = 0;
1231 } 1208 }
1232 1209
1233 if (!iohead) 1210 if (!iohead)
@@ -1238,132 +1215,48 @@ xfs_page_state_convert(
1238 if (uptodate && bh == head) 1215 if (uptodate && bh == head)
1239 SetPageUptodate(page); 1216 SetPageUptodate(page);
1240 1217
1241 if (startio) 1218 xfs_start_page_writeback(page, 1, count);
1242 xfs_start_page_writeback(page, 1, count);
1243
1244 if (ioend && iomap_valid) {
1245 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
1246 PAGE_CACHE_SHIFT;
1247 tlast = min_t(pgoff_t, offset, last_index);
1248 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
1249 wbc, startio, all_bh, tlast);
1250 }
1251
1252 if (iohead)
1253 xfs_submit_ioend(wbc, iohead);
1254
1255 return page_dirty;
1256
1257error:
1258 if (iohead)
1259 xfs_cancel_ioend(iohead);
1260 1219
1261 /* 1220 if (ioend && imap_valid) {
1262 * If it's delalloc and we have nowhere to put it, 1221 xfs_off_t end_index;
1263 * throw it away, unless the lower layers told
1264 * us to try again.
1265 */
1266 if (err != -EAGAIN) {
1267 if (!unmapped)
1268 xfs_aops_discard_page(page);
1269 ClearPageUptodate(page);
1270 }
1271 return err;
1272}
1273 1222
1274/* 1223 end_index = imap.br_startoff + imap.br_blockcount;
1275 * writepage: Called from one of two places:
1276 *
1277 * 1. we are flushing a delalloc buffer head.
1278 *
1279 * 2. we are writing out a dirty page. Typically the page dirty
1280 * state is cleared before we get here. In this case it is
1281 * conceivable we have no buffer heads.
1282 *
1283 * For delalloc space on the page we need to allocate space and
1284 * flush it. For unmapped buffer heads on the page we should
1285 * allocate space if the page is uptodate. For any other dirty
1286 * buffer heads on the page we should flush them.
1287 *
1288 * If we detect that a transaction would be required to flush
1289 * the page, we have to check the process flags first, if we
1290 * are already in a transaction or disk I/O during allocations
1291 * is off, we need to fail the writepage and redirty the page.
1292 */
1293 1224
1294STATIC int 1225 /* to bytes */
1295xfs_vm_writepage( 1226 end_index <<= inode->i_blkbits;
1296 struct page *page,
1297 struct writeback_control *wbc)
1298{
1299 int error;
1300 int need_trans;
1301 int delalloc, unmapped, unwritten;
1302 struct inode *inode = page->mapping->host;
1303 1227
1304 trace_xfs_writepage(inode, page, 0); 1228 /* to pages */
1229 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1305 1230
1306 /* 1231 /* check against file size */
1307 * We need a transaction if: 1232 if (end_index > last_index)
1308 * 1. There are delalloc buffers on the page 1233 end_index = last_index;
1309 * 2. The page is uptodate and we have unmapped buffers
1310 * 3. The page is uptodate and we have no buffers
1311 * 4. There are unwritten buffers on the page
1312 */
1313 1234
1314 if (!page_has_buffers(page)) { 1235 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1315 unmapped = 1; 1236 wbc, all_bh, end_index);
1316 need_trans = 1;
1317 } else {
1318 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1319 if (!PageUptodate(page))
1320 unmapped = 0;
1321 need_trans = delalloc + unmapped + unwritten;
1322 } 1237 }
1323 1238
1324 /* 1239 if (iohead)
1325 * If we need a transaction and the process flags say 1240 xfs_submit_ioend(wbc, iohead);
1326 * we are already in a transaction, or no IO is allowed
1327 * then mark the page dirty again and leave the page
1328 * as is.
1329 */
1330 if (current_test_flags(PF_FSTRANS) && need_trans)
1331 goto out_fail;
1332
1333 /*
1334 * Delay hooking up buffer heads until we have
1335 * made our go/no-go decision.
1336 */
1337 if (!page_has_buffers(page))
1338 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1339 1241
1242 return 0;
1340 1243
1341 /* 1244error:
1342 * VM calculation for nr_to_write seems off. Bump it way 1245 if (iohead)
1343 * up, this gets simple streaming writes zippy again. 1246 xfs_cancel_ioend(iohead);
1344 * To be reviewed again after Jens' writeback changes.
1345 */
1346 wbc->nr_to_write *= 4;
1347 1247
1348 /* 1248 if (err == -EAGAIN)
1349 * Convert delayed allocate, unwritten or unmapped space 1249 goto redirty;
1350 * to real space and flush out to disk.
1351 */
1352 error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
1353 if (error == -EAGAIN)
1354 goto out_fail;
1355 if (unlikely(error < 0))
1356 goto out_unlock;
1357 1250
1358 return 0; 1251 xfs_aops_discard_page(page);
1252 ClearPageUptodate(page);
1253 unlock_page(page);
1254 return err;
1359 1255
1360out_fail: 1256redirty:
1361 redirty_page_for_writepage(wbc, page); 1257 redirty_page_for_writepage(wbc, page);
1362 unlock_page(page); 1258 unlock_page(page);
1363 return 0; 1259 return 0;
1364out_unlock:
1365 unlock_page(page);
1366 return error;
1367} 1260}
1368 1261
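/*
 * Worked example for the cluster bounds above (illustrative numbers,
 * not from the patch): with 4 KiB blocks and pages (i_blkbits =
 * PAGE_CACHE_SHIFT = 12), an extent with br_startoff = 10 and
 * br_blockcount = 16 ends at block 26, i.e. byte 26 << 12 = 106496.
 * The last page index it covers is (106496 - 1) >> 12 = 25, which is
 * then clamped to last_index so the additional writeback started by
 * xfs_cluster_write() never runs past EOF.
 */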
1369STATIC int 1262STATIC int
@@ -1377,65 +1270,27 @@ xfs_vm_writepages(
1377 1270
1378/* 1271/*
1379 * Called to move a page into cleanable state - and from there 1272 * Called to move a page into cleanable state - and from there
1380 * to be released. Possibly the page is already clean. We always 1273 * to be released. The page should already be clean. We always
1381 * have buffer heads in this call. 1274 * have buffer heads in this call.
1382 * 1275 *
1383 * Returns 0 if the page is ok to release, 1 otherwise. 1276 * Returns 1 if the page is ok to release, 0 otherwise.
1384 *
1385 * Possible scenarios are:
1386 *
1387 * 1. We are being called to release a page which has been written
1388 * to via regular I/O. buffer heads will be dirty and possibly
1389 * delalloc. If no delalloc buffer heads in this case then we
1390 * can just return zero.
1391 *
1392 * 2. We are called to release a page which has been written via
1393 * mmap, all we need to do is ensure there is no delalloc
1394 * state in the buffer heads, if not we can let the caller
1395 * free them and we should come back later via writepage.
1396 */ 1277 */
1397STATIC int 1278STATIC int
1398xfs_vm_releasepage( 1279xfs_vm_releasepage(
1399 struct page *page, 1280 struct page *page,
1400 gfp_t gfp_mask) 1281 gfp_t gfp_mask)
1401{ 1282{
1402 struct inode *inode = page->mapping->host; 1283 int delalloc, unwritten;
1403 int dirty, delalloc, unmapped, unwritten;
1404 struct writeback_control wbc = {
1405 .sync_mode = WB_SYNC_ALL,
1406 .nr_to_write = 1,
1407 };
1408 1284
1409 trace_xfs_releasepage(inode, page, 0); 1285 trace_xfs_releasepage(page->mapping->host, page, 0);
1410
1411 if (!page_has_buffers(page))
1412 return 0;
1413 1286
1414 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); 1287 xfs_count_page_state(page, &delalloc, &unwritten);
1415 if (!delalloc && !unwritten)
1416 goto free_buffers;
1417 1288
1418 if (!(gfp_mask & __GFP_FS)) 1289 if (WARN_ON(delalloc))
1419 return 0; 1290 return 0;
1420 1291 if (WARN_ON(unwritten))
1421 /* If we are already inside a transaction or the thread cannot
1422 * do I/O, we cannot release this page.
1423 */
1424 if (current_test_flags(PF_FSTRANS))
1425 return 0; 1292 return 0;
1426 1293
1427 /*
1428 * Convert delalloc space to real space, do not flush the
1429 * data out to disk, that will be done by the caller.
1430 * Never need to allocate space here - we will always
1431 * come back to writepage in that case.
1432 */
1433 dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
1434 if (dirty == 0 && !unwritten)
1435 goto free_buffers;
1436 return 0;
1437
1438free_buffers:
1439 return try_to_free_buffers(page); 1294 return try_to_free_buffers(page);
1440} 1295}
1441 1296
@@ -1445,13 +1300,14 @@ __xfs_get_blocks(
1445 sector_t iblock, 1300 sector_t iblock,
1446 struct buffer_head *bh_result, 1301 struct buffer_head *bh_result,
1447 int create, 1302 int create,
1448 int direct, 1303 int direct)
1449 bmapi_flags_t flags)
1450{ 1304{
1451 xfs_iomap_t iomap; 1305 int flags = create ? BMAPI_WRITE : BMAPI_READ;
1306 struct xfs_bmbt_irec imap;
1452 xfs_off_t offset; 1307 xfs_off_t offset;
1453 ssize_t size; 1308 ssize_t size;
1454 int niomap = 1; 1309 int nimap = 1;
1310 int new = 0;
1455 int error; 1311 int error;
1456 1312
1457 offset = (xfs_off_t)iblock << inode->i_blkbits; 1313 offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1461,23 +1317,25 @@ __xfs_get_blocks(
1461 if (!create && direct && offset >= i_size_read(inode)) 1317 if (!create && direct && offset >= i_size_read(inode))
1462 return 0; 1318 return 0;
1463 1319
1464 error = xfs_iomap(XFS_I(inode), offset, size, 1320 if (direct && create)
1465 create ? flags : BMAPI_READ, &iomap, &niomap); 1321 flags |= BMAPI_DIRECT;
1322
1323 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
1324 &new);
1466 if (error) 1325 if (error)
1467 return -error; 1326 return -error;
1468 if (niomap == 0) 1327 if (nimap == 0)
1469 return 0; 1328 return 0;
1470 1329
1471 if (iomap.iomap_bn != IOMAP_DADDR_NULL) { 1330 if (imap.br_startblock != HOLESTARTBLOCK &&
1331 imap.br_startblock != DELAYSTARTBLOCK) {
1472 /* 1332 /*
1473 * For unwritten extents do not report a disk address on 1333 * For unwritten extents do not report a disk address on
1474 * the read case (treat as if we're reading into a hole). 1334 * the read case (treat as if we're reading into a hole).
1475 */ 1335 */
1476 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { 1336 if (create || !ISUNWRITTEN(&imap))
1477 xfs_map_buffer(bh_result, &iomap, offset, 1337 xfs_map_buffer(inode, bh_result, &imap, offset);
1478 inode->i_blkbits); 1338 if (create && ISUNWRITTEN(&imap)) {
1479 }
1480 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1481 if (direct) 1339 if (direct)
1482 bh_result->b_private = inode; 1340 bh_result->b_private = inode;
1483 set_buffer_unwritten(bh_result); 1341 set_buffer_unwritten(bh_result);
@@ -1488,7 +1346,7 @@ __xfs_get_blocks(
1488 * If this is a realtime file, data may be on a different device 1346 * If this is a realtime file, data may be on a different device
1489 * to that pointed to from the buffer_head b_bdev currently. 1347 * to that pointed to from the buffer_head b_bdev currently.
1490 */ 1348 */
1491 bh_result->b_bdev = iomap.iomap_target->bt_bdev; 1349 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1492 1350
1493 /* 1351 /*
1494 * If we previously allocated a block out beyond eof and we are now 1352 * If we previously allocated a block out beyond eof and we are now
@@ -1502,10 +1360,10 @@ __xfs_get_blocks(
1502 if (create && 1360 if (create &&
1503 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1361 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1504 (offset >= i_size_read(inode)) || 1362 (offset >= i_size_read(inode)) ||
1505 (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) 1363 (new || ISUNWRITTEN(&imap))))
1506 set_buffer_new(bh_result); 1364 set_buffer_new(bh_result);
1507 1365
1508 if (iomap.iomap_flags & IOMAP_DELAY) { 1366 if (imap.br_startblock == DELAYSTARTBLOCK) {
1509 BUG_ON(direct); 1367 BUG_ON(direct);
1510 if (create) { 1368 if (create) {
1511 set_buffer_uptodate(bh_result); 1369 set_buffer_uptodate(bh_result);
@@ -1514,11 +1372,23 @@ __xfs_get_blocks(
1514 } 1372 }
1515 } 1373 }
1516 1374
1375 /*
1376 * If this is O_DIRECT or the mpage code calling, tell them how large
1377 * the mapping is, so that we can avoid repeated get_blocks calls.
1378 */
1517 if (direct || size > (1 << inode->i_blkbits)) { 1379 if (direct || size > (1 << inode->i_blkbits)) {
1518 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1380 xfs_off_t mapping_size;
1519 offset = min_t(xfs_off_t, 1381
1520 iomap.iomap_bsize - iomap.iomap_delta, size); 1382 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1521 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); 1383 mapping_size <<= inode->i_blkbits;
1384
1385 ASSERT(mapping_size > 0);
1386 if (mapping_size > size)
1387 mapping_size = size;
1388 if (mapping_size > LONG_MAX)
1389 mapping_size = LONG_MAX;
1390
1391 bh_result->b_size = mapping_size;
1522 } 1392 }
1523 1393
1524 return 0; 1394 return 0;
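/*
 * Worked example for the mapping_size calculation above (illustrative
 * numbers, not from the patch): with 4 KiB blocks (i_blkbits = 12), an
 * extent with br_startoff = 100 and br_blockcount = 8, queried at
 * iblock = 102, gives
 *
 *	mapping_size = (100 + 8 - 102) << 12 = 6 << 12 = 24576 bytes
 *
 * so a single get_blocks call maps the six remaining blocks of the
 * extent, subject to the clamps against the requested size and LONG_MAX.
 */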
@@ -1531,8 +1401,7 @@ xfs_get_blocks(
1531 struct buffer_head *bh_result, 1401 struct buffer_head *bh_result,
1532 int create) 1402 int create)
1533{ 1403{
1534 return __xfs_get_blocks(inode, iblock, 1404 return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
1535 bh_result, create, 0, BMAPI_WRITE);
1536} 1405}
1537 1406
1538STATIC int 1407STATIC int
@@ -1542,61 +1411,59 @@ xfs_get_blocks_direct(
1542 struct buffer_head *bh_result, 1411 struct buffer_head *bh_result,
1543 int create) 1412 int create)
1544{ 1413{
1545 return __xfs_get_blocks(inode, iblock, 1414 return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
1546 bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
1547} 1415}
1548 1416
1417/*
1418 * Complete a direct I/O write request.
1419 *
1420 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1421 * need to issue a transaction to convert the range from unwritten to written
1422 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1423 * to do this and we are done. But in case this was a successful AIO
1424 * request this handler is called from interrupt context, from which we
1425 * can't start transactions. In that case offload the I/O completion to
1426 * the workqueues we also use for buffered I/O completion.
1427 */
1549STATIC void 1428STATIC void
1550xfs_end_io_direct( 1429xfs_end_io_direct_write(
1551 struct kiocb *iocb, 1430 struct kiocb *iocb,
1552 loff_t offset, 1431 loff_t offset,
1553 ssize_t size, 1432 ssize_t size,
1554 void *private) 1433 void *private,
1434 int ret,
1435 bool is_async)
1555{ 1436{
1556 xfs_ioend_t *ioend = iocb->private; 1437 struct xfs_ioend *ioend = iocb->private;
1557 1438
1558 /* 1439 /*
1559 * Non-NULL private data means we need to issue a transaction to 1440 * blockdev_direct_IO can return an error even after the I/O
1560 * convert a range from unwritten to written extents. This needs 1441 * completion handler was called. Thus we need to protect
1561 * to happen from process context but aio+dio I/O completion 1442 * against double-freeing.
1562 * happens from irq context so we need to defer it to a workqueue.
1563 * This is not necessary for synchronous direct I/O, but we do
1564 * it anyway to keep the code uniform and simpler.
1565 *
1566 * Well, if only it were that simple. Because synchronous direct I/O
1567 * requires extent conversion to occur *before* we return to userspace,
1568 * we have to wait for extent conversion to complete. Look at the
1569 * iocb that has been passed to us to determine if this is AIO or
1570 * not. If it is synchronous, tell xfs_finish_ioend() to kick the
1571 * workqueue and wait for it to complete.
1572 *
1573 * The core direct I/O code might be changed to always call the
1574 * completion handler in the future, in which case all this can
1575 * go away.
1576 */ 1443 */
1444 iocb->private = NULL;
1445
1577 ioend->io_offset = offset; 1446 ioend->io_offset = offset;
1578 ioend->io_size = size; 1447 ioend->io_size = size;
1579 if (ioend->io_type == IOMAP_READ) { 1448 if (private && size > 0)
1580 xfs_finish_ioend(ioend, 0); 1449 ioend->io_type = IO_UNWRITTEN;
1581 } else if (private && size > 0) { 1450
1582 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1451 if (is_async) {
1583 } else {
1584 /* 1452 /*
1585 * A direct I/O write ioend starts its life in unwritten 1453 * If we are converting an unwritten extent we need to delay
1586 * state in case they map an unwritten extent. This write 1454 * the AIO completion until after the unwritten extent
1587 * didn't map an unwritten extent so switch its completion 1455 * conversion has completed, otherwise do it ASAP.
1588 * handler.
1589 */ 1456 */
1590 ioend->io_type = IOMAP_NEW; 1457 if (ioend->io_type == IO_UNWRITTEN) {
1591 xfs_finish_ioend(ioend, 0); 1458 ioend->io_iocb = iocb;
1459 ioend->io_result = ret;
1460 } else {
1461 aio_complete(iocb, ret, 0);
1462 }
1463 xfs_finish_ioend(ioend);
1464 } else {
1465 xfs_finish_ioend_sync(ioend);
1592 } 1466 }
1593
1594 /*
1595 * blockdev_direct_IO can return an error even after the I/O
1596 * completion handler was called. Thus we need to protect
1597 * against double-freeing.
1598 */
1599 iocb->private = NULL;
1600} 1467}
1601 1468
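/*
 * Illustrative summary (not from the patch) of the completion paths
 * chosen by xfs_end_io_direct_write() above:
 *
 *	synchronous I/O         -> xfs_finish_ioend_sync(); the caller
 *	                           waits for any extent conversion
 *	async, written extent   -> aio_complete() immediately, then
 *	                           xfs_finish_ioend()
 *	async, unwritten extent -> stash the iocb and result in the
 *	                           ioend; aio_complete() runs only after
 *	                           the conversion transaction completes
 */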
1602STATIC ssize_t 1469STATIC ssize_t
@@ -1607,26 +1474,45 @@ xfs_vm_direct_IO(
1607 loff_t offset, 1474 loff_t offset,
1608 unsigned long nr_segs) 1475 unsigned long nr_segs)
1609{ 1476{
1610 struct file *file = iocb->ki_filp; 1477 struct inode *inode = iocb->ki_filp->f_mapping->host;
1611 struct inode *inode = file->f_mapping->host; 1478 struct block_device *bdev = xfs_find_bdev_for_inode(inode);
1612 struct block_device *bdev; 1479 ssize_t ret;
1613 ssize_t ret;
1614
1615 bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1616 1480
1617 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1481 if (rw & WRITE) {
1618 IOMAP_UNWRITTEN : IOMAP_READ); 1482 iocb->private = xfs_alloc_ioend(inode, IO_NEW);
1619 1483
1620 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1484 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1485 offset, nr_segs,
1486 xfs_get_blocks_direct,
1487 xfs_end_io_direct_write, NULL, 0);
1488 if (ret != -EIOCBQUEUED && iocb->private)
1489 xfs_destroy_ioend(iocb->private);
1490 } else {
1491 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1621 offset, nr_segs, 1492 offset, nr_segs,
1622 xfs_get_blocks_direct, 1493 xfs_get_blocks_direct,
1623 xfs_end_io_direct); 1494 NULL, NULL, 0);
1495 }
1624 1496
1625 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1626 xfs_destroy_ioend(iocb->private);
1627 return ret; 1497 return ret;
1628} 1498}
1629 1499
1500STATIC void
1501xfs_vm_write_failed(
1502 struct address_space *mapping,
1503 loff_t to)
1504{
1505 struct inode *inode = mapping->host;
1506
1507 if (to > inode->i_size) {
1508 struct iattr ia = {
1509 .ia_valid = ATTR_SIZE | ATTR_FORCE,
1510 .ia_size = inode->i_size,
1511 };
1512 xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
1513 }
1514}
1515
1630STATIC int 1516STATIC int
1631xfs_vm_write_begin( 1517xfs_vm_write_begin(
1632 struct file *file, 1518 struct file *file,
@@ -1637,9 +1523,31 @@ xfs_vm_write_begin(
1637 struct page **pagep, 1523 struct page **pagep,
1638 void **fsdata) 1524 void **fsdata)
1639{ 1525{
1640 *pagep = NULL; 1526 int ret;
1641 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1527
1642 xfs_get_blocks); 1528 ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
1529 pagep, xfs_get_blocks);
1530 if (unlikely(ret))
1531 xfs_vm_write_failed(mapping, pos + len);
1532 return ret;
1533}
1534
1535STATIC int
1536xfs_vm_write_end(
1537 struct file *file,
1538 struct address_space *mapping,
1539 loff_t pos,
1540 unsigned len,
1541 unsigned copied,
1542 struct page *page,
1543 void *fsdata)
1544{
1545 int ret;
1546
1547 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1548 if (unlikely(ret < len))
1549 xfs_vm_write_failed(mapping, pos + len);
1550 return ret;
1643} 1551}
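/*
 * Worked failure example (illustrative numbers, not from the patch):
 * with i_size = 4096, a write at pos = 4096 for len = 8192 that copies
 * only 4096 bytes makes generic_write_end() return 4096 < len and
 * update i_size to 8192. xfs_vm_write_failed(mapping, 12288) then sees
 * to > i_size and issues the ATTR_SIZE|ATTR_FORCE setattr, trimming
 * any blocks speculatively allocated between 8192 and 12288 so they
 * are not leaked past EOF.
 */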
1644 1552
1645STATIC sector_t 1553STATIC sector_t
@@ -1650,7 +1558,7 @@ xfs_vm_bmap(
1650 struct inode *inode = (struct inode *)mapping->host; 1558 struct inode *inode = (struct inode *)mapping->host;
1651 struct xfs_inode *ip = XFS_I(inode); 1559 struct xfs_inode *ip = XFS_I(inode);
1652 1560
1653 xfs_itrace_entry(XFS_I(inode)); 1561 trace_xfs_vm_bmap(XFS_I(inode));
1654 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1562 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1655 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); 1563 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
1656 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1564 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -1684,7 +1592,7 @@ const struct address_space_operations xfs_address_space_operations = {
1684 .releasepage = xfs_vm_releasepage, 1592 .releasepage = xfs_vm_releasepage,
1685 .invalidatepage = xfs_vm_invalidatepage, 1593 .invalidatepage = xfs_vm_invalidatepage,
1686 .write_begin = xfs_vm_write_begin, 1594 .write_begin = xfs_vm_write_begin,
1687 .write_end = generic_write_end, 1595 .write_end = xfs_vm_write_end,
1688 .bmap = xfs_vm_bmap, 1596 .bmap = xfs_vm_bmap,
1689 .direct_IO = xfs_vm_direct_IO, 1597 .direct_IO = xfs_vm_direct_IO,
1690 .migratepage = buffer_migrate_page, 1598 .migratepage = buffer_migrate_page,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 4cfc6ea87df8..c5057fb6237a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -37,6 +37,8 @@ typedef struct xfs_ioend {
37 size_t io_size; /* size of the extent */ 37 size_t io_size; /* size of the extent */
38 xfs_off_t io_offset; /* offset in the file */ 38 xfs_off_t io_offset; /* offset in the file */
39 struct work_struct io_work; /* xfsdatad work queue */ 39 struct work_struct io_work; /* xfsdatad work queue */
40 struct kiocb *io_iocb;
41 int io_result;
40} xfs_ioend_t; 42} xfs_ioend_t;
41 43
42extern const struct address_space_operations xfs_address_space_operations; 44extern const struct address_space_operations xfs_address_space_operations;
@@ -45,6 +47,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45extern void xfs_ioend_init(void); 47extern void xfs_ioend_init(void);
46extern void xfs_ioend_wait(struct xfs_inode *); 48extern void xfs_ioend_wait(struct xfs_inode *);
47 49
48extern void xfs_count_page_state(struct page *, int *, int *, int *); 50extern void xfs_count_page_state(struct page *, int *, int *);
49 51
50#endif /* __XFS_AOPS_H__ */ 52#endif /* __XFS_AOPS_H__ */
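/*
 * Hypothetical sketch (the real call site is outside the hunks shown
 * here): once the unwritten-extent conversion has completed in process
 * context, the iocb stashed in the new io_iocb/io_result fields above
 * can be finished. The helper name is illustrative only.
 */
static void xfs_example_finish_deferred_aio(struct xfs_ioend *ioend)
{
	if (ioend->io_iocb)
		aio_complete(ioend->io_iocb, ioend->io_result, 0);
}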
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 44c2b0ef9a41..286e36e21dae 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,14 +37,14 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h"
42#include "xfs_mount.h" 42#include "xfs_mount.h"
43#include "xfs_trace.h" 43#include "xfs_trace.h"
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(int, gfp_t); 47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = { 49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup, 50 .shrink = xfsbufd_wakeup,
@@ -339,7 +339,7 @@ _xfs_buf_lookup_pages(
339 __func__, gfp_mask); 339 __func__, gfp_mask);
340 340
341 XFS_STATS_INC(xb_page_retries); 341 XFS_STATS_INC(xb_page_retries);
342 xfsbufd_wakeup(0, gfp_mask); 342 xfsbufd_wakeup(NULL, 0, gfp_mask);
343 congestion_wait(BLK_RW_ASYNC, HZ/50); 343 congestion_wait(BLK_RW_ASYNC, HZ/50);
344 goto retry; 344 goto retry;
345 } 345 }
@@ -440,12 +440,7 @@ _xfs_buf_find(
440 ASSERT(btp == bp->b_target); 440 ASSERT(btp == bp->b_target);
441 if (bp->b_file_offset == range_base && 441 if (bp->b_file_offset == range_base &&
442 bp->b_buffer_length == range_length) { 442 bp->b_buffer_length == range_length) {
443 /*
444 * If we look at something, bring it to the
445 * front of the list for next time.
446 */
447 atomic_inc(&bp->b_hold); 443 atomic_inc(&bp->b_hold);
448 list_move(&bp->b_hash_list, &hash->bh_list);
449 goto found; 444 goto found;
450 } 445 }
451 } 446 }
@@ -578,9 +573,9 @@ _xfs_buf_read(
578 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 573 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
579 574
580 status = xfs_buf_iorequest(bp); 575 status = xfs_buf_iorequest(bp);
581 if (!status && !(flags & XBF_ASYNC)) 576 if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC))
582 status = xfs_buf_iowait(bp); 577 return status;
583 return status; 578 return xfs_buf_iowait(bp);
584} 579}
585 580
586xfs_buf_t * 581xfs_buf_t *
@@ -850,6 +845,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 845 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 846 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 847 * synchronizing independent access to the underlying pages.
848 *
849 * If we come across a stale, pinned, locked buffer, we know that we
850 * are being asked to lock a buffer that has been reallocated. Because
851 * it is pinned, we know that the log has not been pushed to disk and
852 * hence it will still be locked. Rather than sleeping until someone
853 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 854 */
854void 855void
855xfs_buf_lock( 856xfs_buf_lock(
@@ -857,6 +858,8 @@ xfs_buf_lock(
857{ 858{
858 trace_xfs_buf_lock(bp, _RET_IP_); 859 trace_xfs_buf_lock(bp, _RET_IP_);
859 860
861 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
862 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 863 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 864 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 865 down(&bp->b_sema);
@@ -888,36 +891,6 @@ xfs_buf_unlock(
888 trace_xfs_buf_unlock(bp, _RET_IP_); 891 trace_xfs_buf_unlock(bp, _RET_IP_);
889} 892}
890 893
891
892/*
893 * Pinning Buffer Storage in Memory
894 * Ensure that no attempt to force a buffer to disk will succeed.
895 */
896void
897xfs_buf_pin(
898 xfs_buf_t *bp)
899{
900 trace_xfs_buf_pin(bp, _RET_IP_);
901 atomic_inc(&bp->b_pin_count);
902}
903
904void
905xfs_buf_unpin(
906 xfs_buf_t *bp)
907{
908 trace_xfs_buf_unpin(bp, _RET_IP_);
909
910 if (atomic_dec_and_test(&bp->b_pin_count))
911 wake_up_all(&bp->b_waiters);
912}
913
914int
915xfs_buf_ispin(
916 xfs_buf_t *bp)
917{
918 return atomic_read(&bp->b_pin_count);
919}
920
921STATIC void 894STATIC void
922xfs_buf_wait_unpin( 895xfs_buf_wait_unpin(
923 xfs_buf_t *bp) 896 xfs_buf_t *bp)
@@ -1007,25 +980,19 @@ xfs_bwrite(
1007 struct xfs_mount *mp, 980 struct xfs_mount *mp,
1008 struct xfs_buf *bp) 981 struct xfs_buf *bp)
1009{ 982{
1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 983 int error;
1011 int error = 0;
1012 984
1013 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp; 985 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE; 986 bp->b_flags |= XBF_WRITE;
1016 if (!iowait) 987 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1018 988
1019 xfs_buf_delwri_dequeue(bp); 989 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp); 990 xfs_bdstrat_cb(bp);
1021
1022 if (iowait) {
1023 error = xfs_buf_iowait(bp);
1024 if (error)
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1026 xfs_buf_relse(bp);
1027 }
1028 991
992 error = xfs_buf_iowait(bp);
993 if (error)
994 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
995 xfs_buf_relse(bp);
1029 return error; 996 return error;
1030} 997}
1031 998
@@ -1036,7 +1003,6 @@ xfs_bdwrite(
1036{ 1003{
1037 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1004 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1038 1005
1039 bp->b_strat = xfs_bdstrat_cb;
1040 bp->b_mount = mp; 1006 bp->b_mount = mp;
1041 1007
1042 bp->b_flags &= ~XBF_READ; 1008 bp->b_flags &= ~XBF_READ;
@@ -1071,7 +1037,6 @@ xfs_bioerror(
1071 XFS_BUF_UNDONE(bp); 1037 XFS_BUF_UNDONE(bp);
1072 XFS_BUF_STALE(bp); 1038 XFS_BUF_STALE(bp);
1073 1039
1074 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1075 xfs_biodone(bp); 1040 xfs_biodone(bp);
1076 1041
1077 return EIO; 1042 return EIO;
@@ -1101,7 +1066,6 @@ xfs_bioerror_relse(
1101 XFS_BUF_DONE(bp); 1066 XFS_BUF_DONE(bp);
1102 XFS_BUF_STALE(bp); 1067 XFS_BUF_STALE(bp);
1103 XFS_BUF_CLR_IODONE_FUNC(bp); 1068 XFS_BUF_CLR_IODONE_FUNC(bp);
1104 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1105 if (!(fl & XBF_ASYNC)) { 1069 if (!(fl & XBF_ASYNC)) {
1106 /* 1070 /*
1107 * Mark b_error and B_ERROR _both_. 1071 * Mark b_error and B_ERROR _both_.
@@ -1307,8 +1271,19 @@ submit_io:
1307 if (size) 1271 if (size)
1308 goto next_chunk; 1272 goto next_chunk;
1309 } else { 1273 } else {
1310 bio_put(bio); 1274 /*
1275 * if we get here, no pages were added to the bio. However,
1276 * we can't just error out here - if the pages are locked then
1277 * we have to unlock them otherwise we can hang on a later
1278 * access to the page.
1279 */
1311 xfs_buf_ioerror(bp, EIO); 1280 xfs_buf_ioerror(bp, EIO);
1281 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1282 int i;
1283 for (i = 0; i < bp->b_page_count; i++)
1284 unlock_page(bp->b_pages[i]);
1285 }
1286 bio_put(bio);
1312 } 1287 }
1313} 1288}
1314 1289
@@ -1463,8 +1438,7 @@ xfs_alloc_bufhash(
1463{ 1438{
1464 unsigned int i; 1439 unsigned int i;
1465 1440
1466 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1441 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */
1467 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1468 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1442 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1469 sizeof(xfs_bufhash_t)); 1443 sizeof(xfs_bufhash_t));
1470 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1444 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
@@ -1614,7 +1588,8 @@ xfs_mapping_buftarg(
1614 1588
1615STATIC int 1589STATIC int
1616xfs_alloc_delwrite_queue( 1590xfs_alloc_delwrite_queue(
1617 xfs_buftarg_t *btp) 1591 xfs_buftarg_t *btp,
1592 const char *fsname)
1618{ 1593{
1619 int error = 0; 1594 int error = 0;
1620 1595
@@ -1622,7 +1597,7 @@ xfs_alloc_delwrite_queue(
1622 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1597 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1623 spin_lock_init(&btp->bt_delwrite_lock); 1598 spin_lock_init(&btp->bt_delwrite_lock);
1624 btp->bt_flags = 0; 1599 btp->bt_flags = 0;
1625 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1600 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1626 if (IS_ERR(btp->bt_task)) { 1601 if (IS_ERR(btp->bt_task)) {
1627 error = PTR_ERR(btp->bt_task); 1602 error = PTR_ERR(btp->bt_task);
1628 goto out_error; 1603 goto out_error;
@@ -1635,7 +1610,8 @@ out_error:
1635xfs_buftarg_t * 1610xfs_buftarg_t *
1636xfs_alloc_buftarg( 1611xfs_alloc_buftarg(
1637 struct block_device *bdev, 1612 struct block_device *bdev,
1638 int external) 1613 int external,
1614 const char *fsname)
1639{ 1615{
1640 xfs_buftarg_t *btp; 1616 xfs_buftarg_t *btp;
1641 1617
@@ -1647,7 +1623,7 @@ xfs_alloc_buftarg(
1647 goto error; 1623 goto error;
1648 if (xfs_mapping_buftarg(btp, bdev)) 1624 if (xfs_mapping_buftarg(btp, bdev))
1649 goto error; 1625 goto error;
1650 if (xfs_alloc_delwrite_queue(btp)) 1626 if (xfs_alloc_delwrite_queue(btp, fsname))
1651 goto error; 1627 goto error;
1652 xfs_alloc_bufhash(btp, external); 1628 xfs_alloc_bufhash(btp, external);
1653 return btp; 1629 return btp;
@@ -1756,6 +1732,7 @@ xfs_buf_runall_queues(
1756 1732
1757STATIC int 1733STATIC int
1758xfsbufd_wakeup( 1734xfsbufd_wakeup(
1735 struct shrinker *shrink,
1759 int priority, 1736 int priority,
1760 gfp_t mask) 1737 gfp_t mask)
1761{ 1738{
@@ -1797,7 +1774,7 @@ xfs_buf_delwri_split(
1797 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1774 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1798 ASSERT(bp->b_flags & XBF_DELWRI); 1775 ASSERT(bp->b_flags & XBF_DELWRI);
1799 1776
1800 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1777 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
1801 if (!force && 1778 if (!force &&
1802 time_before(jiffies, bp->b_queuetime + age)) { 1779 time_before(jiffies, bp->b_queuetime + age)) {
1803 xfs_buf_unlock(bp); 1780 xfs_buf_unlock(bp);
@@ -1882,7 +1859,7 @@ xfsbufd(
1882 struct xfs_buf *bp; 1859 struct xfs_buf *bp;
1883 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1860 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1884 list_del_init(&bp->b_list); 1861 list_del_init(&bp->b_list);
1885 xfs_buf_iostrategy(bp); 1862 xfs_bdstrat_cb(bp);
1886 count++; 1863 count++;
1887 } 1864 }
1888 if (count) 1865 if (count)
@@ -1929,7 +1906,7 @@ xfs_flush_buftarg(
1929 bp->b_flags &= ~XBF_ASYNC; 1906 bp->b_flags &= ~XBF_ASYNC;
1930 list_add(&bp->b_list, &wait_list); 1907 list_add(&bp->b_list, &wait_list);
1931 } 1908 }
1932 xfs_buf_iostrategy(bp); 1909 xfs_bdstrat_cb(bp);
1933 } 1910 }
1934 1911
1935 if (wait) { 1912 if (wait) {
@@ -1955,7 +1932,8 @@ xfs_buf_init(void)
1955 if (!xfs_buf_zone) 1932 if (!xfs_buf_zone)
1956 goto out; 1933 goto out;
1957 1934
1958 xfslogd_workqueue = create_workqueue("xfslogd"); 1935 xfslogd_workqueue = alloc_workqueue("xfslogd",
1936 WQ_RESCUER | WQ_HIGHPRI, 1);
1959 if (!xfslogd_workqueue) 1937 if (!xfslogd_workqueue)
1960 goto out_free_buf_zone; 1938 goto out_free_buf_zone;
1961 1939
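/*
 * Illustrative sketch (not from the patch): the extra argument to
 * xfsbufd_wakeup() above follows the 2.6.35+ shrinker API, which hands
 * the callback its own struct shrinker so per-instance state can be
 * recovered. A minimal registration looks like this:
 */
static int example_shrink(struct shrinker *shrink, int nr_to_scan,
			  gfp_t gfp_mask)
{
	/* nr_to_scan == 0 is a query for the cache size */
	if (!nr_to_scan)
		return 0;
	return -1;		/* nothing reclaimable in this example */
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};
/* register_shrinker(&example_shrinker) at init,
 * unregister_shrinker(&example_shrinker) at teardown. */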
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e7361e50e..2a05614f0b92 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -44,57 +44,57 @@ typedef enum {
44 XBRW_ZERO = 3, /* Zero target memory */ 44 XBRW_ZERO = 3, /* Zero target memory */
45} xfs_buf_rw_t; 45} xfs_buf_rw_t;
46 46
47typedef enum { 47#define XBF_READ (1 << 0) /* buffer intended for reading from device */
48 XBF_READ = (1 << 0), /* buffer intended for reading from device */ 48#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
49 XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ 49#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */
50 XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ 50#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
51 XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52 XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53 XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54 XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ 54#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ 55#define XBF_ORDERED (1 << 11)/* use ordered writes */
56 XBF_ORDERED = (1 << 11), /* use ordered writes */ 56#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ 57#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
58 XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */ 58
59 59/* flags used only as arguments to access routines */
60 /* flags used only as arguments to access routines */ 60#define XBF_LOCK (1 << 14)/* lock requested */
61 XBF_LOCK = (1 << 14), /* lock requested */ 61#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */
62 XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ 62#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
63 XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ 63
64 64/* flags used only internally */
65 /* flags used only internally */ 65#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
66 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ 66#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
67 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ 67#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
68 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ 68#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
69 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ 69
70 70/*
71 /* 71 * Special flag for supporting metadata blocks smaller than a FSB.
72 * Special flag for supporting metadata blocks smaller than a FSB. 72 *
73 * 73 * In this case we can have multiple xfs_buf_t on a single page and
74 * In this case we can have multiple xfs_buf_t on a single page and 74 * need to lock out concurrent xfs_buf_t readers as they only
75 * need to lock out concurrent xfs_buf_t readers as they only 75 * serialise access to the buffer.
76 * serialise access to the buffer. 76 *
77 * 77 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
78 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation 78 * between reads of the page. Hence we can have one thread read the
79 * between reads of the page. Hence we can have one thread read the 79 * page and modify it, but then race with another thread that thinks
80 * page and modify it, but then race with another thread that thinks 80 * the page is not up-to-date and hence reads it again.
81 * the page is not up-to-date and hence reads it again. 81 *
82 * 82 * The result is that the first modifcation to the page is lost.
83 * The result is that the first modifcation to the page is lost. 83 * This sort of AGF/AGI reading race can happen when unlinking inodes
84 * This sort of AGF/AGI reading race can happen when unlinking inodes 84 * that require truncation and results in the AGI unlinked list
85 * that require truncation and results in the AGI unlinked list 85 * modifications being lost.
86 * modifications being lost. 86 */
87 */ 87#define _XBF_PAGE_LOCKED (1 << 22)
88 _XBF_PAGE_LOCKED = (1 << 22), 88
89 89/*
90 /* 90 * If we try a barrier write, but it fails we have to communicate
91 * If we try a barrier write, but it fails we have to communicate 91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * this to the upper layers. Unfortunately b_error gets overwritten 92 * when the buffer is re-issued so we have to add another flag to
93 * when the buffer is re-issued so we have to add another flag to 93 * keep this information.
94 * keep this information. 94 */
95 */ 95#define _XFS_BARRIER_FAILED (1 << 23)
96 _XFS_BARRIER_FAILED = (1 << 23), 96
97} xfs_buf_flags_t; 97typedef unsigned int xfs_buf_flags_t;
98 98
99#define XFS_BUF_FLAGS \ 99#define XFS_BUF_FLAGS \
100 { XBF_READ, "READ" }, \ 100 { XBF_READ, "READ" }, \
@@ -137,7 +137,6 @@ typedef struct xfs_buftarg {
137 size_t bt_smask; 137 size_t bt_smask;
138 138
139 /* per device buffer hash table */ 139 /* per device buffer hash table */
140 uint bt_hashmask;
141 uint bt_hashshift; 140 uint bt_hashshift;
142 xfs_bufhash_t *bt_hash; 141 xfs_bufhash_t *bt_hash;
143 142
@@ -187,7 +186,6 @@ typedef struct xfs_buf {
187 atomic_t b_io_remaining; /* #outstanding I/O requests */ 186 atomic_t b_io_remaining; /* #outstanding I/O requests */
188 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 187 xfs_buf_iodone_t b_iodone; /* I/O completion function */
189 xfs_buf_relse_t b_relse; /* releasing function */ 188 xfs_buf_relse_t b_relse; /* releasing function */
190 xfs_buf_bdstrat_t b_strat; /* pre-write function */
191 struct completion b_iowait; /* queue for I/O waiters */ 189 struct completion b_iowait; /* queue for I/O waiters */
192 void *b_fspriv; 190 void *b_fspriv;
193 void *b_fspriv2; 191 void *b_fspriv2;
@@ -245,11 +243,6 @@ extern int xfs_buf_iowait(xfs_buf_t *);
245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 243extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
246 xfs_buf_rw_t); 244 xfs_buf_rw_t);
247 245
248static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
249{
250 return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
251}
252
253static inline int xfs_buf_geterror(xfs_buf_t *bp) 246static inline int xfs_buf_geterror(xfs_buf_t *bp)
254{ 247{
255 return bp ? bp->b_error : ENOMEM; 248 return bp ? bp->b_error : ENOMEM;
@@ -258,11 +251,6 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
258/* Buffer Utility Routines */ 251/* Buffer Utility Routines */
259extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 252extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
260 253
261/* Pinning Buffer Storage in Memory */
262extern void xfs_buf_pin(xfs_buf_t *);
263extern void xfs_buf_unpin(xfs_buf_t *);
264extern int xfs_buf_ispin(xfs_buf_t *);
265
266/* Delayed Write Buffer Routines */ 254/* Delayed Write Buffer Routines */
267extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 255extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *); 256extern void xfs_buf_delwri_promote(xfs_buf_t *);
@@ -326,8 +314,6 @@ extern void xfs_buf_terminate(void);
326#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) 314#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
327#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) 315#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
328#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) 316#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
329#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func))
330#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL)
331 317
332#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) 318#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv)
333#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 319#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
@@ -351,7 +337,7 @@ extern void xfs_buf_terminate(void);
351#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 337#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
352#define XFS_BUF_SET_REF(bp, ref) do { } while (0) 338#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
353 339
354#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) 340#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
355 341
356#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) 342#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp)
357#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) 343#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
@@ -370,8 +356,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
370 xfs_buf_rele(bp); 356 xfs_buf_rele(bp);
371} 357}
372 358
373#define xfs_bpin(bp) xfs_buf_pin(bp)
374#define xfs_bunpin(bp) xfs_buf_unpin(bp)
375#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) 359#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
376 360
377#define xfs_biomove(bp, off, len, data, rw) \ 361#define xfs_biomove(bp, off, len, data, rw) \
@@ -390,7 +374,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
390/* 374/*
391 * Handling of buftargs. 375 * Handling of buftargs.
392 */ 376 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 377extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 378extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 379extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 380extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
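/*
 * Illustrative note (rationale assumed, not stated in the patch): with
 * the old enum, the result of OR-ing two flag values has plain integer
 * type rather than xfs_buf_flags_t, which tools like sparse flag as a
 * type mismatch. Defines over a typedef'd unsigned int keep flag
 * combinations warning-free:
 */
static inline xfs_buf_flags_t example_read_flags(void)
{
	return XBF_READ | XBF_LOCK | XBF_DONT_BLOCK;
}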
diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h
deleted file mode 100644
index a8b0b1685eed..000000000000
--- a/fs/xfs/linux-2.6/xfs_dmapi_priv.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DMAPI_PRIV_H__
19#define __XFS_DMAPI_PRIV_H__
20
21/*
22 * Based on IO_ISDIRECT, decide which i_ flag is set.
23 */
24#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
25 DM_FLAGS_IMUX : 0)
26#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
27
28#endif /*__XFS_DMAPI_PRIV_H__*/
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 846b75aeb2ab..3764d74790ec 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -23,13 +23,13 @@
23#include "xfs_sb.h" 23#include "xfs_sb.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dir2.h" 25#include "xfs_dir2.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_export.h" 27#include "xfs_export.h"
29#include "xfs_vnodeops.h" 28#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 30#include "xfs_inode.h"
32#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_trace.h"
33 33
34/* 34/*
35 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
@@ -128,13 +128,11 @@ xfs_nfs_get_inode(
128 return ERR_PTR(-ESTALE); 128 return ERR_PTR(-ESTALE);
129 129
130 /* 130 /*
131 * The XFS_IGET_BULKSTAT means that an invalid inode number is just 131 * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
132 * fine and not an indication of a corrupted filesystem. Because 132 * fine and not an indication of a corrupted filesystem as clients can
133 * clients can send any kind of invalid file handle, e.g. after 133 * send invalid file handles and we have to handle it gracefully.
134 * a restore on the server we have to deal with this case gracefully.
135 */ 134 */
136 error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT, 135 error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
137 XFS_ILOCK_SHARED, &ip, 0);
138 if (error) { 136 if (error) {
139 /* 137 /*
140 * EINVAL means the inode cluster doesn't exist anymore. 138 * EINVAL means the inode cluster doesn't exist anymore.
@@ -149,11 +147,10 @@ xfs_nfs_get_inode(
149 } 147 }
150 148
151 if (ip->i_d.di_gen != generation) { 149 if (ip->i_d.di_gen != generation) {
152 xfs_iput_new(ip, XFS_ILOCK_SHARED); 150 IRELE(ip);
153 return ERR_PTR(-ENOENT); 151 return ERR_PTR(-ENOENT);
154 } 152 }
155 153
156 xfs_iunlock(ip, XFS_ILOCK_SHARED);
157 return VFS_I(ip); 154 return VFS_I(ip);
158} 155}
159 156
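The export-op rework above replaces the old seven-argument xfs_iget() call (bulkstat flag, shared ilock, trailing bno) with a single untrusted lookup. A minimal sketch of the resulting pattern, assuming the 2.6.36-era signature shown in the hunk:

	/*
	 * XFS_IGET_UNTRUSTED makes xfs_iget() cross-check the inode number
	 * against the inode allocation btree, so a forged or stale NFS file
	 * handle fails with EINVAL/ENOENT instead of tripping corruption
	 * checks. lock_flags is 0: no ilock is taken, which is why the
	 * xfs_iunlock()/xfs_iput_new() calls above could be dropped.
	 */
	error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);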
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bcfba6b..ba8ad422a165 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -22,23 +22,15 @@
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_sb.h" 23#include "xfs_sb.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dir2.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_bmap_btree.h" 27#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_alloc.h" 28#include "xfs_alloc.h"
33#include "xfs_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_dinode.h" 29#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
39#include "xfs_bmap.h" 32#include "xfs_bmap.h"
40#include "xfs_error.h" 33#include "xfs_error.h"
41#include "xfs_rw.h"
42#include "xfs_vnodeops.h" 34#include "xfs_vnodeops.h"
43#include "xfs_da_btree.h" 35#include "xfs_da_btree.h"
44#include "xfs_ioctl.h" 36#include "xfs_ioctl.h"
@@ -100,21 +92,23 @@ xfs_iozero(
100STATIC int 92STATIC int
101xfs_file_fsync( 93xfs_file_fsync(
102 struct file *file, 94 struct file *file,
103 struct dentry *dentry,
104 int datasync) 95 int datasync)
105{ 96{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode); 97 struct inode *inode = file->f_mapping->host;
98 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_trans *tp; 99 struct xfs_trans *tp;
108 int error = 0; 100 int error = 0;
109 int log_flushed = 0; 101 int log_flushed = 0;
110 102
111 xfs_itrace_entry(ip); 103 trace_xfs_file_fsync(ip);
112 104
113 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 105 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
114 return -XFS_ERROR(EIO); 106 return -XFS_ERROR(EIO);
115 107
116 xfs_iflags_clear(ip, XFS_ITRUNCATED); 108 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117 109
110 xfs_ioend_wait(ip);
111
118 /* 112 /*
119 * We always need to make sure that the required inode state is safe on 113 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the 114 * disk. The inode might be clean but we still might need to force the
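The signature change tracks the 2.6.35 VFS update that dropped the unused dentry argument from ->fsync(); the inode is now reached through the file's mapping. The shape, as a sketch (example_fsync is a hypothetical name):

	static int example_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->f_mapping->host;

		/* flush file data, then force inode metadata to the log;
		 * with datasync set, pure timestamp updates may be skipped */
		return 0;
	}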
@@ -138,8 +132,8 @@ xfs_file_fsync(
138 * might get cleared when the inode gets written out via the AIL 132 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster. 133 * or xfs_iflush_cluster.
140 */ 134 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || 135 if (((inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && 136 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) { 137 ip->i_update_core) {
144 /* 138 /*
145 * Kick off a transaction to log the inode core to get the 139 * Kick off a transaction to log the inode core to get the
@@ -164,8 +158,7 @@ xfs_file_fsync(
164 * transaction. So we play it safe and fire off the 158 * transaction. So we play it safe and fire off the
165 * transaction anyway. 159 * transaction anyway.
166 */ 160 */
167 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 161 xfs_trans_ijoin(tp, ip);
168 xfs_trans_ihold(tp, ip);
169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 162 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
170 xfs_trans_set_sync(tp); 163 xfs_trans_set_sync(tp);
171 error = _xfs_trans_commit(tp, 0, &log_flushed); 164 error = _xfs_trans_commit(tp, 0, &log_flushed);
@@ -273,20 +266,6 @@ xfs_file_aio_read(
273 mutex_lock(&inode->i_mutex); 266 mutex_lock(&inode->i_mutex);
274 xfs_ilock(ip, XFS_IOLOCK_SHARED); 267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
275 268
276 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
277 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
278 int iolock = XFS_IOLOCK_SHARED;
279
280 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
281 dmflags, &iolock);
282 if (ret) {
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284 if (unlikely(ioflags & IO_ISDIRECT))
285 mutex_unlock(&inode->i_mutex);
286 return ret;
287 }
288 }
289
290 if (unlikely(ioflags & IO_ISDIRECT)) { 269 if (unlikely(ioflags & IO_ISDIRECT)) {
291 if (inode->i_mapping->nrpages) { 270 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip, 271 ret = -xfs_flushinval_pages(ip,
@@ -319,7 +298,6 @@ xfs_file_splice_read(
319 unsigned int flags) 298 unsigned int flags)
320{ 299{
321 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); 300 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
322 struct xfs_mount *mp = ip->i_mount;
323 int ioflags = 0; 301 int ioflags = 0;
324 ssize_t ret; 302 ssize_t ret;
325 303
@@ -333,18 +311,6 @@ xfs_file_splice_read(
333 311
334 xfs_ilock(ip, XFS_IOLOCK_SHARED); 312 xfs_ilock(ip, XFS_IOLOCK_SHARED);
335 313
336 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
337 int iolock = XFS_IOLOCK_SHARED;
338 int error;
339
340 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
341 FILP_DELAY_FLAG(infilp), &iolock);
342 if (error) {
343 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
344 return -error;
345 }
346 }
347
348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
349 315
350 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 316 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
@@ -365,7 +331,6 @@ xfs_file_splice_write(
365{ 331{
366 struct inode *inode = outfilp->f_mapping->host; 332 struct inode *inode = outfilp->f_mapping->host;
367 struct xfs_inode *ip = XFS_I(inode); 333 struct xfs_inode *ip = XFS_I(inode);
368 struct xfs_mount *mp = ip->i_mount;
369 xfs_fsize_t isize, new_size; 334 xfs_fsize_t isize, new_size;
370 int ioflags = 0; 335 int ioflags = 0;
371 ssize_t ret; 336 ssize_t ret;
@@ -380,18 +345,6 @@ xfs_file_splice_write(
380 345
381 xfs_ilock(ip, XFS_IOLOCK_EXCL); 346 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382 347
383 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
384 int iolock = XFS_IOLOCK_EXCL;
385 int error;
386
387 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
388 FILP_DELAY_FLAG(outfilp), &iolock);
389 if (error) {
390 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
391 return -error;
392 }
393 }
394
395 new_size = *ppos + count; 348 new_size = *ppos + count;
396 349
397 xfs_ilock(ip, XFS_ILOCK_EXCL); 350 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -461,7 +414,7 @@ xfs_zero_last_block(
461 last_fsb = XFS_B_TO_FSBT(mp, isize); 414 last_fsb = XFS_B_TO_FSBT(mp, isize);
462 nimaps = 1; 415 nimaps = 1;
463 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, 416 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
464 &nimaps, NULL, NULL); 417 &nimaps, NULL);
465 if (error) { 418 if (error) {
466 return error; 419 return error;
467 } 420 }
@@ -556,7 +509,7 @@ xfs_zero_eof(
556 nimaps = 1; 509 nimaps = 1;
557 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 510 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
558 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 511 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
559 0, NULL, 0, &imap, &nimaps, NULL, NULL); 512 0, NULL, 0, &imap, &nimaps, NULL);
560 if (error) { 513 if (error) {
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 514 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 return error; 515 return error;
@@ -625,7 +578,6 @@ xfs_file_aio_write(
625 int ioflags = 0; 578 int ioflags = 0;
626 xfs_fsize_t isize, new_size; 579 xfs_fsize_t isize, new_size;
627 int iolock; 580 int iolock;
628 int eventsent = 0;
629 size_t ocount = 0, count; 581 size_t ocount = 0, count;
630 int need_i_mutex; 582 int need_i_mutex;
631 583
@@ -671,33 +623,6 @@ start:
671 goto out_unlock_mutex; 623 goto out_unlock_mutex;
672 } 624 }
673 625
674 if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
675 !(ioflags & IO_INVIS) && !eventsent)) {
676 int dmflags = FILP_DELAY_FLAG(file);
677
678 if (need_i_mutex)
679 dmflags |= DM_FLAGS_IMUX;
680
681 xfs_iunlock(ip, XFS_ILOCK_EXCL);
682 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
683 pos, count, dmflags, &iolock);
684 if (error) {
685 goto out_unlock_internal;
686 }
687 xfs_ilock(ip, XFS_ILOCK_EXCL);
688 eventsent = 1;
689
690 /*
691 * The iolock was dropped and reacquired in XFS_SEND_DATA
692 * so we have to recheck the size when appending.
693 * We will only "goto start;" once, since having sent the
694 * event prevents another call to XFS_SEND_DATA, which is
695 * what allows the size to change in the first place.
696 */
697 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
698 goto start;
699 }
700
701 if (ioflags & IO_ISDIRECT) { 626 if (ioflags & IO_ISDIRECT) {
702 xfs_buftarg_t *target = 627 xfs_buftarg_t *target =
703 XFS_IS_REALTIME_INODE(ip) ? 628 XFS_IS_REALTIME_INODE(ip) ?
@@ -828,22 +753,6 @@ write_retry:
828 xfs_iunlock(ip, XFS_ILOCK_EXCL); 753 xfs_iunlock(ip, XFS_ILOCK_EXCL);
829 } 754 }
830 755
831 if (ret == -ENOSPC &&
832 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
833 xfs_iunlock(ip, iolock);
834 if (need_i_mutex)
835 mutex_unlock(&inode->i_mutex);
836 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
837 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
838 0, 0, 0); /* Delay flag intentionally unused */
839 if (need_i_mutex)
840 mutex_lock(&inode->i_mutex);
841 xfs_ilock(ip, iolock);
842 if (error)
843 goto out_unlock_internal;
844 goto start;
845 }
846
847 error = -ret; 756 error = -ret;
848 if (ret <= 0) 757 if (ret <= 0)
849 goto out_unlock_internal; 758 goto out_unlock_internal;
@@ -866,7 +775,7 @@ write_retry:
866 mutex_lock(&inode->i_mutex); 775 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock); 776 xfs_ilock(ip, iolock);
868 777
869 error2 = -xfs_file_fsync(file, file->f_path.dentry, 778 error2 = -xfs_file_fsync(file,
870 (file->f_flags & __O_SYNC) ? 0 : 1); 779 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error) 780 if (!error)
872 error = error2; 781 error = error2;
@@ -1012,9 +921,6 @@ const struct file_operations xfs_file_operations = {
1012 .open = xfs_file_open, 921 .open = xfs_file_open,
1013 .release = xfs_file_release, 922 .release = xfs_file_release,
1014 .fsync = xfs_file_fsync, 923 .fsync = xfs_file_fsync,
1015#ifdef HAVE_FOP_OPEN_EXEC
1016 .open_exec = xfs_file_open_exec,
1017#endif
1018}; 924};
1019 925
1020const struct file_operations xfs_dir_file_operations = { 926const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index b6918d76bc7b..1f279b012f94 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -21,10 +21,6 @@
21#include "xfs_inode.h" 21#include "xfs_inode.h"
22#include "xfs_trace.h" 22#include "xfs_trace.h"
23 23
24int fs_noerr(void) { return 0; }
25int fs_nosys(void) { return ENOSYS; }
26void fs_noval(void) { return; }
27
28/* 24/*
29 * note: all filemap functions return negative error codes. These 25 * note: all filemap functions return negative error codes. These
30 * need to be inverted before returning to the xfs core functions. 26 * need to be inverted before returning to the xfs core functions.
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
deleted file mode 100644
index 82bb19b2599e..000000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_FS_SUBR_H__
19#define __XFS_FS_SUBR_H__
20
21extern int fs_noerr(void);
22extern int fs_nosys(void);
23extern void fs_noval(void);
24
25#endif /* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 7b26cc2fd284..3b9e626f7cd1 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,24 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 29#include "xfs_dinode.h"
36#include "xfs_inode.h" 30#include "xfs_inode.h"
37#include "xfs_ioctl.h" 31#include "xfs_ioctl.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
41#include "xfs_itable.h" 33#include "xfs_itable.h"
42#include "xfs_error.h" 34#include "xfs_error.h"
43#include "xfs_rw.h"
44#include "xfs_attr.h" 35#include "xfs_attr.h"
45#include "xfs_bmap.h" 36#include "xfs_bmap.h"
46#include "xfs_buf_item.h" 37#include "xfs_buf_item.h"
@@ -527,6 +518,10 @@ xfs_attrmulti_by_handle(
527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 518 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
528 return -XFS_ERROR(EFAULT); 519 return -XFS_ERROR(EFAULT);
529 520
521 /* overflow check */
522 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
523 return -E2BIG;
524
530 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); 525 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
531 if (IS_ERR(dentry)) 526 if (IS_ERR(dentry))
532 return PTR_ERR(dentry); 527 return PTR_ERR(dentry);
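The new opcount bound exists because the byte count for the per-op array is computed by multiplication; a huge opcount can wrap that arithmetic and under-allocate. A sketch of the failure mode the check prevents (size, ops and do_one_op are illustrative names, not from this patch):

	/* without the INT_MAX / sizeof() guard: */
	size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);	/* may overflow */
	ops = kmalloc(size, GFP_KERNEL);	/* buffer comes up short... */
	for (i = 0; i < am_hreq.opcount; i++)
		do_one_op(&ops[i]);	/* ...and this loop runs past its end */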
@@ -675,10 +670,9 @@ xfs_ioc_bulkstat(
675 error = xfs_bulkstat_single(mp, &inlast, 670 error = xfs_bulkstat_single(mp, &inlast,
676 bulkreq.ubuffer, &done); 671 bulkreq.ubuffer, &done);
677 else /* XFS_IOC_FSBULKSTAT */ 672 else /* XFS_IOC_FSBULKSTAT */
678 error = xfs_bulkstat(mp, &inlast, &count, 673 error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
679 (bulkstat_one_pf)xfs_bulkstat_one, NULL, 674 sizeof(xfs_bstat_t), bulkreq.ubuffer,
680 sizeof(xfs_bstat_t), bulkreq.ubuffer, 675 &done);
681 BULKSTAT_FG_QUICK, &done);
682 676
683 if (error) 677 if (error)
684 return -error; 678 return -error;
@@ -791,6 +785,8 @@ xfs_ioc_fsgetxattr(
791{ 785{
792 struct fsxattr fa; 786 struct fsxattr fa;
793 787
788 memset(&fa, 0, sizeof(struct fsxattr));
789
794 xfs_ilock(ip, XFS_ILOCK_SHARED); 790 xfs_ilock(ip, XFS_ILOCK_SHARED);
795 fa.fsx_xflags = xfs_ip2xflags(ip); 791 fa.fsx_xflags = xfs_ip2xflags(ip);
796 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
@@ -905,7 +901,7 @@ xfs_ioctl_setattr(
905 struct xfs_dquot *olddquot = NULL; 901 struct xfs_dquot *olddquot = NULL;
906 int code; 902 int code;
907 903
908 xfs_itrace_entry(ip); 904 trace_xfs_ioctl_setattr(ip);
909 905
910 if (mp->m_flags & XFS_MOUNT_RDONLY) 906 if (mp->m_flags & XFS_MOUNT_RDONLY)
911 return XFS_ERROR(EROFS); 907 return XFS_ERROR(EROFS);
@@ -913,6 +909,13 @@ xfs_ioctl_setattr(
913 return XFS_ERROR(EIO); 909 return XFS_ERROR(EIO);
914 910
915 /* 911 /*
912 * Disallow 32bit project ids because on-disk structure
913 * is 16bit only.
914 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
916 return XFS_ERROR(EINVAL);
917
918 /*
916 * If disk quotas is on, we make sure that the dquots do exist on disk, 919 * If disk quotas is on, we make sure that the dquots do exist on disk,
917 * before we start any other transactions. Trying to do this later 920 * before we start any other transactions. Trying to do this later
918 * is messy. We don't care to take a readlock to look at the ids 921 * is messy. We don't care to take a readlock to look at the ids
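(__uint16_t)-1 is simply the widest value the 16-bit on-disk project-id field can hold, so the check above reads more plainly as:

	if ((mask & FSX_PROJID) && (fa->fsx_projid > 0xffff))
		return XFS_ERROR(EINVAL);	/* id would truncate on disk */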
@@ -1040,8 +1043,7 @@ xfs_ioctl_setattr(
1040 } 1043 }
1041 } 1044 }
1042 1045
1043 xfs_trans_ijoin(tp, ip, lock_flags); 1046 xfs_trans_ijoin(tp, ip);
1044 xfs_trans_ihold(tp, ip);
1045 1047
1046 /* 1048 /*
1047 * Change file ownership. Must be the owner or privileged. 1049 * Change file ownership. Must be the owner or privileged.
@@ -1113,16 +1115,7 @@ xfs_ioctl_setattr(
1113 xfs_qm_dqrele(udqp); 1115 xfs_qm_dqrele(udqp);
1114 xfs_qm_dqrele(gdqp); 1116 xfs_qm_dqrele(gdqp);
1115 1117
1116 if (code) 1118 return code;
1117 return code;
1118
1119 if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
1120 XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
1121 NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
1122 (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
1123 }
1124
1125 return 0;
1126 1119
1127 error_return: 1120 error_return:
1128 xfs_qm_dqrele(udqp); 1121 xfs_qm_dqrele(udqp);
@@ -1298,7 +1291,7 @@ xfs_file_ioctl(
1298 if (filp->f_mode & FMODE_NOCMTIME) 1291 if (filp->f_mode & FMODE_NOCMTIME)
1299 ioflags |= IO_INVIS; 1292 ioflags |= IO_INVIS;
1300 1293
1301 xfs_itrace_entry(ip); 1294 trace_xfs_file_ioctl(ip);
1302 1295
1303 switch (cmd) { 1296 switch (cmd) {
1304 case XFS_IOC_ALLOCSP: 1297 case XFS_IOC_ALLOCSP:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 593c05b4df8d..6c83f7f62dc9 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -28,12 +28,8 @@
28#include "xfs_trans.h" 28#include "xfs_trans.h"
29#include "xfs_sb.h" 29#include "xfs_sb.h"
30#include "xfs_ag.h" 30#include "xfs_ag.h"
31#include "xfs_dir2.h"
32#include "xfs_dmapi.h"
33#include "xfs_mount.h" 31#include "xfs_mount.h"
34#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dir2_sf.h"
37#include "xfs_vnode.h" 33#include "xfs_vnode.h"
38#include "xfs_dinode.h" 34#include "xfs_dinode.h"
39#include "xfs_inode.h" 35#include "xfs_inode.h"
@@ -237,15 +233,12 @@ xfs_bulkstat_one_compat(
237 xfs_ino_t ino, /* inode number to get data for */ 233 xfs_ino_t ino, /* inode number to get data for */
238 void __user *buffer, /* buffer to place output in */ 234 void __user *buffer, /* buffer to place output in */
239 int ubsize, /* size of buffer */ 235 int ubsize, /* size of buffer */
240 void *private_data, /* my private data */
241 xfs_daddr_t bno, /* starting bno of inode cluster */
242 int *ubused, /* bytes used by me */ 236 int *ubused, /* bytes used by me */
243 void *dibuff, /* on-disk inode buffer */
244 int *stat) /* BULKSTAT_RV_... */ 237 int *stat) /* BULKSTAT_RV_... */
245{ 238{
246 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, 239 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
247 xfs_bulkstat_one_fmt_compat, bno, 240 xfs_bulkstat_one_fmt_compat,
248 ubused, dibuff, stat); 241 ubused, stat);
249} 242}
250 243
251/* copied from xfs_ioctl.c */ 244/* copied from xfs_ioctl.c */
@@ -298,13 +291,11 @@ xfs_compat_ioc_bulkstat(
298 int res; 291 int res;
299 292
300 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, 293 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
301 sizeof(compat_xfs_bstat_t), 294 sizeof(compat_xfs_bstat_t), 0, &res);
302 NULL, 0, NULL, NULL, &res);
303 } else if (cmd == XFS_IOC_FSBULKSTAT_32) { 295 } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
304 error = xfs_bulkstat(mp, &inlast, &count, 296 error = xfs_bulkstat(mp, &inlast, &count,
305 xfs_bulkstat_one_compat, NULL, 297 xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
306 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, 298 bulkreq.ubuffer, &done);
307 BULKSTAT_FG_QUICK, &done);
308 } else 299 } else
309 error = XFS_ERROR(EINVAL); 300 error = XFS_ERROR(EINVAL);
310 if (error) 301 if (error)
@@ -420,6 +411,10 @@ xfs_compat_attrmulti_by_handle(
420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 411 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
421 return -XFS_ERROR(EFAULT); 412 return -XFS_ERROR(EFAULT);
422 413
414 /* overflow check */
415 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
416 return -E2BIG;
417
423 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); 418 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
424 if (IS_ERR(dentry)) 419 if (IS_ERR(dentry))
425 return PTR_ERR(dentry); 420 return PTR_ERR(dentry);
@@ -545,7 +540,7 @@ xfs_file_compat_ioctl(
545 if (filp->f_mode & FMODE_NOCMTIME) 540 if (filp->f_mode & FMODE_NOCMTIME)
546 ioflags |= IO_INVIS; 541 ioflags |= IO_INVIS;
547 542
548 xfs_itrace_entry(ip); 543 trace_xfs_file_compat_ioctl(ip);
549 544
550 switch (cmd) { 545 switch (cmd) {
551 /* No size or alignment issues on any arch */ 546 /* No size or alignment issues on any arch */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e65a7937f3a4..b1fc2a6bfe83 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -24,21 +24,13 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_alloc.h" 27#include "xfs_alloc.h"
29#include "xfs_dmapi.h"
30#include "xfs_quota.h" 28#include "xfs_quota.h"
31#include "xfs_mount.h" 29#include "xfs_mount.h"
32#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 31#include "xfs_dinode.h"
38#include "xfs_inode.h" 32#include "xfs_inode.h"
39#include "xfs_bmap.h" 33#include "xfs_bmap.h"
40#include "xfs_btree.h"
41#include "xfs_ialloc.h"
42#include "xfs_rtalloc.h" 34#include "xfs_rtalloc.h"
43#include "xfs_error.h" 35#include "xfs_error.h"
44#include "xfs_itable.h" 36#include "xfs_itable.h"
@@ -88,7 +80,7 @@ xfs_mark_inode_dirty_sync(
88{ 80{
89 struct inode *inode = VFS_I(ip); 81 struct inode *inode = VFS_I(ip);
90 82
91 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) 83 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
92 mark_inode_dirty_sync(inode); 84 mark_inode_dirty_sync(inode);
93} 85}
94 86
@@ -98,7 +90,7 @@ xfs_mark_inode_dirty(
98{ 90{
99 struct inode *inode = VFS_I(ip); 91 struct inode *inode = VFS_I(ip);
100 92
101 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) 93 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
102 mark_inode_dirty(inode); 94 mark_inode_dirty(inode);
103} 95}
104 96
@@ -496,7 +488,7 @@ xfs_vn_getattr(
496 struct xfs_inode *ip = XFS_I(inode); 488 struct xfs_inode *ip = XFS_I(inode);
497 struct xfs_mount *mp = ip->i_mount; 489 struct xfs_mount *mp = ip->i_mount;
498 490
499 xfs_itrace_entry(ip); 491 trace_xfs_getattr(ip);
500 492
501 if (XFS_FORCED_SHUTDOWN(mp)) 493 if (XFS_FORCED_SHUTDOWN(mp))
502 return XFS_ERROR(EIO); 494 return XFS_ERROR(EIO);
@@ -548,21 +540,6 @@ xfs_vn_setattr(
548 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 540 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
549} 541}
550 542
551/*
552 * block_truncate_page can return an error, but we can't propagate it
553 * at all here. Leave a complaint + stack trace in the syslog because
554 * this could be bad. If it is bad, we need to propagate the error further.
555 */
556STATIC void
557xfs_vn_truncate(
558 struct inode *inode)
559{
560 int error;
561 error = block_truncate_page(inode->i_mapping, inode->i_size,
562 xfs_get_blocks);
563 WARN_ON(error);
564}
565
566STATIC long 543STATIC long
567xfs_vn_fallocate( 544xfs_vn_fallocate(
568 struct inode *inode, 545 struct inode *inode,
@@ -585,11 +562,20 @@ xfs_vn_fallocate(
585 bf.l_len = len; 562 bf.l_len = len;
586 563
587 xfs_ilock(ip, XFS_IOLOCK_EXCL); 564 xfs_ilock(ip, XFS_IOLOCK_EXCL);
565
566 /* check the new inode size is valid before allocating */
567 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
568 offset + len > i_size_read(inode)) {
569 new_size = offset + len;
570 error = inode_newsize_ok(inode, new_size);
571 if (error)
572 goto out_unlock;
573 }
574
588 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 575 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
589 0, XFS_ATTR_NOLOCK); 576 0, XFS_ATTR_NOLOCK);
590 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 577 if (error)
591 offset + len > i_size_read(inode)) 578 goto out_unlock;
592 new_size = offset + len;
593 579
594 /* Change file size if needed */ 580 /* Change file size if needed */
595 if (new_size) { 581 if (new_size) {
@@ -600,6 +586,7 @@ xfs_vn_fallocate(
600 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); 586 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
601 } 587 }
602 588
589out_unlock:
603 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 590 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
604out_error: 591out_error:
605 return error; 592 return error;
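inode_newsize_ok() is the generic VFS helper behind the new up-front check: it validates a prospective size against RLIMIT_FSIZE and sb->s_maxbytes (returning -EFBIG, and raising SIGXFSZ in the rlimit case) before any blocks are reserved. The pattern in isolation:

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    offset + len > i_size_read(inode)) {
		error = inode_newsize_ok(inode, offset + len);
		if (error)
			goto out_unlock;	/* fail before allocating anything */
	}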
@@ -673,8 +660,11 @@ xfs_vn_fiemap(
673 bm.bmv_length = BTOBB(length); 660 bm.bmv_length = BTOBB(length);
674 661
675 /* We add one because in getbmap world count includes the header */ 662 /* We add one because in getbmap world count includes the header */
676 bm.bmv_count = fieinfo->fi_extents_max + 1; 663 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
677 bm.bmv_iflags = BMV_IF_PREALLOC; 664 fieinfo->fi_extents_max + 1;
665 bm.bmv_count = min_t(__s32, bm.bmv_count,
666 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
667 bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
678 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 668 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
679 bm.bmv_iflags |= BMV_IF_ATTRFORK; 669 bm.bmv_iflags |= BMV_IF_ATTRFORK;
680 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) 670 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
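The min_t() clamp bounds the buffer a single xfs_getbmap() pass can require. Assuming 4 KiB pages and the 48-byte struct getbmapx of this era, the worked arithmetic is:

	/* bm.bmv_count <= PAGE_SIZE * 16 / sizeof(struct getbmapx)
	 *              =  4096 * 16 / 48
	 *              =  1365 map entries per call,
	 * so an unbounded FIEMAP request (fi_extents_max == 0, mapped to
	 * MAXEXTNUM above) cannot pin an arbitrarily large allocation. */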
@@ -689,7 +679,6 @@ xfs_vn_fiemap(
689 679
690static const struct inode_operations xfs_inode_operations = { 680static const struct inode_operations xfs_inode_operations = {
691 .check_acl = xfs_check_acl, 681 .check_acl = xfs_check_acl,
692 .truncate = xfs_vn_truncate,
693 .getattr = xfs_vn_getattr, 682 .getattr = xfs_vn_getattr,
694 .setattr = xfs_vn_setattr, 683 .setattr = xfs_vn_setattr,
695 .setxattr = generic_setxattr, 684 .setxattr = generic_setxattr,
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index facfb323a706..2fa0bd9ebc7f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -87,7 +87,6 @@
87#include <xfs_aops.h> 87#include <xfs_aops.h>
88#include <xfs_super.h> 88#include <xfs_super.h>
89#include <xfs_globals.h> 89#include <xfs_globals.h>
90#include <xfs_fs_subr.h>
91#include <xfs_buf.h> 90#include <xfs_buf.h>
92 91
93/* 92/*
@@ -157,8 +156,6 @@
157 */ 156 */
158#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) 157#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
159#define xfs_stack_trace() dump_stack() 158#define xfs_stack_trace() dump_stack()
160#define xfs_itruncate_data(ip, off) \
161 (-vmtruncate(VFS_I(ip), (off)))
162 159
163 160
164/* Move the kernel do_div definition off to one side */ 161/* Move the kernel do_div definition off to one side */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 1947514ce1ad..29b9d642e93d 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -16,13 +16,12 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 19#include "xfs_sb.h"
21#include "xfs_inum.h" 20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h" 22#include "xfs_ag.h"
23#include "xfs_mount.h" 23#include "xfs_mount.h"
24#include "xfs_quota.h" 24#include "xfs_quota.h"
25#include "xfs_log.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
@@ -69,15 +68,15 @@ xfs_fs_set_xstate(
69 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) 68 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
70 return -ENOSYS; 69 return -ENOSYS;
71 70
72 if (uflags & XFS_QUOTA_UDQ_ACCT) 71 if (uflags & FS_QUOTA_UDQ_ACCT)
73 flags |= XFS_UQUOTA_ACCT; 72 flags |= XFS_UQUOTA_ACCT;
74 if (uflags & XFS_QUOTA_PDQ_ACCT) 73 if (uflags & FS_QUOTA_PDQ_ACCT)
75 flags |= XFS_PQUOTA_ACCT; 74 flags |= XFS_PQUOTA_ACCT;
76 if (uflags & XFS_QUOTA_GDQ_ACCT) 75 if (uflags & FS_QUOTA_GDQ_ACCT)
77 flags |= XFS_GQUOTA_ACCT; 76 flags |= XFS_GQUOTA_ACCT;
78 if (uflags & XFS_QUOTA_UDQ_ENFD) 77 if (uflags & FS_QUOTA_UDQ_ENFD)
79 flags |= XFS_UQUOTA_ENFD; 78 flags |= XFS_UQUOTA_ENFD;
80 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) 79 if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
81 flags |= XFS_OQUOTA_ENFD; 80 flags |= XFS_OQUOTA_ENFD;
82 81
83 switch (op) { 82 switch (op) {
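Only the names change in this hunk: the quota-control flag constants moved from the XFS-private XFS_QUOTA_* spellings to the filesystem-neutral FS_QUOTA_* ones exported to userspace; it is a straight rename. For reference:

	/* XFS_QUOTA_UDQ_ACCT -> FS_QUOTA_UDQ_ACCT	user accounting
	 * XFS_QUOTA_UDQ_ENFD -> FS_QUOTA_UDQ_ENFD	user enforcement
	 * XFS_QUOTA_GDQ_ACCT -> FS_QUOTA_GDQ_ACCT	group accounting
	 * XFS_QUOTA_GDQ_ENFD -> FS_QUOTA_GDQ_ENFD	group enforcement
	 * XFS_QUOTA_PDQ_ACCT -> FS_QUOTA_PDQ_ACCT	project accounting
	 * XFS_QUOTA_PDQ_ENFD -> FS_QUOTA_PDQ_ENFD	project enforcement */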
@@ -97,7 +96,7 @@ xfs_fs_set_xstate(
97} 96}
98 97
99STATIC int 98STATIC int
100xfs_fs_get_xquota( 99xfs_fs_get_dqblk(
101 struct super_block *sb, 100 struct super_block *sb,
102 int type, 101 int type,
103 qid_t id, 102 qid_t id,
@@ -114,7 +113,7 @@ xfs_fs_get_xquota(
114} 113}
115 114
116STATIC int 115STATIC int
117xfs_fs_set_xquota( 116xfs_fs_set_dqblk(
118 struct super_block *sb, 117 struct super_block *sb,
119 int type, 118 int type,
120 qid_t id, 119 qid_t id,
@@ -135,6 +134,6 @@ xfs_fs_set_xquota(
135const struct quotactl_ops xfs_quotactl_operations = { 134const struct quotactl_ops xfs_quotactl_operations = {
136 .get_xstate = xfs_fs_get_xstate, 135 .get_xstate = xfs_fs_get_xstate,
137 .set_xstate = xfs_fs_set_xstate, 136 .set_xstate = xfs_fs_set_xstate,
138 .get_xquota = xfs_fs_get_xquota, 137 .get_dqblk = xfs_fs_get_dqblk,
139 .set_xquota = xfs_fs_set_xquota, 138 .set_dqblk = xfs_fs_set_dqblk,
140}; 139};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 29f1edca76de..a4e07974955b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -25,14 +25,11 @@
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 27#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 28#include "xfs_quota.h"
30#include "xfs_mount.h" 29#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 31#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 32#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 33#include "xfs_dinode.h"
37#include "xfs_inode.h" 34#include "xfs_inode.h"
38#include "xfs_btree.h" 35#include "xfs_btree.h"
@@ -43,7 +40,6 @@
43#include "xfs_error.h" 40#include "xfs_error.h"
44#include "xfs_itable.h" 41#include "xfs_itable.h"
45#include "xfs_fsops.h" 42#include "xfs_fsops.h"
46#include "xfs_rw.h"
47#include "xfs_attr.h" 43#include "xfs_attr.h"
48#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
49#include "xfs_utils.h" 45#include "xfs_utils.h"
@@ -94,7 +90,6 @@ mempool_t *xfs_ioend_pool;
94#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and 90#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
95 * unwritten extent conversion */ 91 * unwritten extent conversion */
96#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ 92#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
97#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
98#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ 93#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
99#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ 94#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
100#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ 95#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
@@ -116,9 +111,8 @@ mempool_t *xfs_ioend_pool;
116#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 111#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
117#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 112#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
118#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 113#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 114#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 115#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122 116
123/* 117/*
124 * Table driven mount option parser. 118 * Table driven mount option parser.
@@ -170,15 +164,13 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
170STATIC int 164STATIC int
171xfs_parseargs( 165xfs_parseargs(
172 struct xfs_mount *mp, 166 struct xfs_mount *mp,
173 char *options, 167 char *options)
174 char **mtpt)
175{ 168{
176 struct super_block *sb = mp->m_super; 169 struct super_block *sb = mp->m_super;
177 char *this_char, *value, *eov; 170 char *this_char, *value, *eov;
178 int dsunit = 0; 171 int dsunit = 0;
179 int dswidth = 0; 172 int dswidth = 0;
180 int iosize = 0; 173 int iosize = 0;
181 int dmapi_implies_ikeep = 1;
182 __uint8_t iosizelog = 0; 174 __uint8_t iosizelog = 0;
183 175
184 /* 176 /*
@@ -241,15 +233,10 @@ xfs_parseargs(
241 if (!mp->m_logname) 233 if (!mp->m_logname)
242 return ENOMEM; 234 return ENOMEM;
243 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 235 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
244 if (!value || !*value) { 236 cmn_err(CE_WARN,
245 cmn_err(CE_WARN, 237 "XFS: %s option not allowed on this system",
246 "XFS: %s option requires an argument", 238 this_char);
247 this_char); 239 return EINVAL;
248 return EINVAL;
249 }
250 *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
251 if (!*mtpt)
252 return ENOMEM;
253 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 240 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
254 if (!value || !*value) { 241 if (!value || !*value) {
255 cmn_err(CE_WARN, 242 cmn_err(CE_WARN,
@@ -286,8 +273,6 @@ xfs_parseargs(
286 mp->m_flags &= ~XFS_MOUNT_GRPID; 273 mp->m_flags &= ~XFS_MOUNT_GRPID;
287 } else if (!strcmp(this_char, MNTOPT_WSYNC)) { 274 } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
288 mp->m_flags |= XFS_MOUNT_WSYNC; 275 mp->m_flags |= XFS_MOUNT_WSYNC;
289 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
290 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
291 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 276 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
292 mp->m_flags |= XFS_MOUNT_NORECOVERY; 277 mp->m_flags |= XFS_MOUNT_NORECOVERY;
293 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 278 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
@@ -327,7 +312,6 @@ xfs_parseargs(
327 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 312 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
328 mp->m_flags |= XFS_MOUNT_IKEEP; 313 mp->m_flags |= XFS_MOUNT_IKEEP;
329 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 314 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
330 dmapi_implies_ikeep = 0;
331 mp->m_flags &= ~XFS_MOUNT_IKEEP; 315 mp->m_flags &= ~XFS_MOUNT_IKEEP;
332 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { 316 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
333 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; 317 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
@@ -368,19 +352,22 @@ xfs_parseargs(
368 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 352 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
369 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 353 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
370 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 354 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
371 } else if (!strcmp(this_char, MNTOPT_DMAPI)) { 355 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
372 mp->m_flags |= XFS_MOUNT_DMAPI; 356 mp->m_flags |= XFS_MOUNT_DELAYLOG;
373 } else if (!strcmp(this_char, MNTOPT_XDSM)) { 357 cmn_err(CE_WARN,
374 mp->m_flags |= XFS_MOUNT_DMAPI; 358 "Enabling EXPERIMENTAL delayed logging feature "
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 359 "- use at your own risk.\n");
376 mp->m_flags |= XFS_MOUNT_DMAPI; 360 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
361 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 362 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 363 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 364 "XFS: ihashsize no longer used, option is deprecated.");
380 } else if (!strcmp(this_char, "osyncisdsync")) { 365 } else if (!strcmp(this_char, "osyncisdsync")) {
381 /* no-op, this is now the default */
382 cmn_err(CE_WARN, 366 cmn_err(CE_WARN,
383 "XFS: osyncisdsync is now the default, option is deprecated."); 367 "XFS: osyncisdsync has no effect, option is deprecated.");
368 } else if (!strcmp(this_char, "osyncisosync")) {
369 cmn_err(CE_WARN,
370 "XFS: osyncisosync has no effect, option is deprecated.");
384 } else if (!strcmp(this_char, "irixsgid")) { 371 } else if (!strcmp(this_char, "irixsgid")) {
385 cmn_err(CE_WARN, 372 cmn_err(CE_WARN,
386 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 373 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
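delaylog/nodelaylog toggle the delayed logging mode that was still experimental in this release: metadata changes are aggregated in memory and committed to the on-disk log in larger batches, hence the loud warning at mount time. Shell usage, sketched as a comment (device path hypothetical):

	/* # mount -t xfs -o delaylog /dev/sdb1 /mnt
	 * (prints the EXPERIMENTAL warning added above; "nodelaylog"
	 * restores the traditional logging path) */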
@@ -421,12 +408,6 @@ xfs_parseargs(
421 return EINVAL; 408 return EINVAL;
422 } 409 }
423 410
424 if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
425 printk("XFS: %s option needs the mount point option as well\n",
426 MNTOPT_DMAPI);
427 return EINVAL;
428 }
429
430 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 411 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
431 cmn_err(CE_WARN, 412 cmn_err(CE_WARN,
432 "XFS: sunit and swidth must be specified together"); 413 "XFS: sunit and swidth must be specified together");
@@ -440,18 +421,6 @@ xfs_parseargs(
440 return EINVAL; 421 return EINVAL;
441 } 422 }
442 423
443 /*
444 * Applications using DMI filesystems often expect the
445 * inode generation number to be monotonically increasing.
446 * If we delete inode chunks we break this assumption, so
447 * keep unused inode chunks on disk for DMI filesystems
448 * until we come up with a better solution.
449 * Note that if "ikeep" or "noikeep" mount options are
450 * supplied, then they are honored.
451 */
452 if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
453 mp->m_flags |= XFS_MOUNT_IKEEP;
454
455done: 424done:
456 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { 425 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
457 /* 426 /*
@@ -530,11 +499,10 @@ xfs_showargs(
530 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, 499 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
531 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, 500 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
532 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, 501 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
533 { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC },
534 { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, 502 { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 },
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 503 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 504 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
505 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 506 { 0, NULL }
539 }; 507 };
540 static struct proc_xfs_info xfs_info_unset[] = { 508 static struct proc_xfs_info xfs_info_unset[] = {
@@ -725,7 +693,8 @@ void
725xfs_blkdev_issue_flush( 693xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 694 xfs_buftarg_t *buftarg)
727{ 695{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
697 BLKDEV_IFL_WAIT);
729} 698}
730 699
731STATIC void 700STATIC void
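The extra arguments follow the 2.6.35 block-layer change to blkdev_issue_flush(); BLKDEV_IFL_WAIT keeps the call synchronous, matching the old two-argument behaviour. The signature assumed here:

	/* int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
	 *			  sector_t *error_sector, unsigned long flags); */
	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);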
@@ -789,18 +758,18 @@ xfs_open_devices(
789 * Setup xfs_mount buffer target pointers 758 * Setup xfs_mount buffer target pointers
790 */ 759 */
791 error = ENOMEM; 760 error = ENOMEM;
792 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); 761 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
793 if (!mp->m_ddev_targp) 762 if (!mp->m_ddev_targp)
794 goto out_close_rtdev; 763 goto out_close_rtdev;
795 764
796 if (rtdev) { 765 if (rtdev) {
797 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); 766 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
798 if (!mp->m_rtdev_targp) 767 if (!mp->m_rtdev_targp)
799 goto out_free_ddev_targ; 768 goto out_free_ddev_targ;
800 } 769 }
801 770
802 if (logdev && logdev != ddev) { 771 if (logdev && logdev != ddev) {
803 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); 772 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
804 if (!mp->m_logdev_targp) 773 if (!mp->m_logdev_targp)
805 goto out_free_rtdev_targ; 774 goto out_free_rtdev_targ;
806 } else { 775 } else {
@@ -902,7 +871,8 @@ xfsaild_start(
902 struct xfs_ail *ailp) 871 struct xfs_ail *ailp)
903{ 872{
904 ailp->xa_target = 0; 873 ailp->xa_target = 0;
905 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); 874 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
875 ailp->xa_mount->m_fsname);
906 if (IS_ERR(ailp->xa_task)) 876 if (IS_ERR(ailp->xa_task))
907 return -PTR_ERR(ailp->xa_task); 877 return -PTR_ERR(ailp->xa_task);
908 return 0; 878 return 0;
@@ -935,7 +905,7 @@ xfs_fs_destroy_inode(
935{ 905{
936 struct xfs_inode *ip = XFS_I(inode); 906 struct xfs_inode *ip = XFS_I(inode);
937 907
938 xfs_itrace_entry(ip); 908 trace_xfs_destroy_inode(ip);
939 909
940 XFS_STATS_INC(vn_reclaim); 910 XFS_STATS_INC(vn_reclaim);
941 911
@@ -1051,10 +1021,8 @@ xfs_log_inode(
1051 * an inode in another recent transaction. So we play it safe and 1021 * an inode in another recent transaction. So we play it safe and
1052 * fire off the transaction anyway. 1022 * fire off the transaction anyway.
1053 */ 1023 */
1054 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 1024 xfs_trans_ijoin(tp, ip);
1055 xfs_trans_ihold(tp, ip);
1056 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1025 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1057 xfs_trans_set_sync(tp);
1058 error = xfs_trans_commit(tp, 0); 1026 error = xfs_trans_commit(tp, 0);
1059 xfs_ilock_demote(ip, XFS_ILOCK_EXCL); 1027 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1060 1028
@@ -1070,28 +1038,20 @@ xfs_fs_write_inode(
1070 struct xfs_mount *mp = ip->i_mount; 1038 struct xfs_mount *mp = ip->i_mount;
1071 int error = EAGAIN; 1039 int error = EAGAIN;
1072 1040
1073 xfs_itrace_entry(ip); 1041 trace_xfs_write_inode(ip);
1074 1042
1075 if (XFS_FORCED_SHUTDOWN(mp)) 1043 if (XFS_FORCED_SHUTDOWN(mp))
1076 return XFS_ERROR(EIO); 1044 return XFS_ERROR(EIO);
1077 1045
1078 if (wbc->sync_mode == WB_SYNC_ALL) { 1046 if (wbc->sync_mode == WB_SYNC_ALL) {
1079 /* 1047 /*
1080 * Make sure the inode has hit stable storage. By using the 1048 * Make sure the inode has made it into the log. Instead
1081 * log and the fsync transactions we reduce the IOs we have 1049 * of forcing it all the way to stable storage using a
1082 * to do here from two (log and inode) to just the log. 1050 * synchronous transaction we let the log force inside the
1083 * 1051 * ->sync_fs call do that for us, which reduces the number
1084 * Note: We still need to do a delwri write of the inode after 1052 * of synchronous log forces dramatically.
1085 * this to flush it to the backing buffer so that bulkstat
1086 * works properly if this is the first time the inode has been
1087 * written. Because we hold the ilock atomically over the
1088 * transaction commit and the inode flush we are guaranteed
1089 * that the inode is not pinned when it returns. If the flush
1090 * lock is already held, then the inode has already been
1091 * flushed once and we don't need to flush it again. Hence
1092 * the code will only flush the inode if it isn't already
1093 * being flushed.
1094 */ 1053 */
1054 xfs_ioend_wait(ip);
1095 xfs_ilock(ip, XFS_ILOCK_SHARED); 1055 xfs_ilock(ip, XFS_ILOCK_SHARED);
1096 if (ip->i_update_core) { 1056 if (ip->i_update_core) {
1097 error = xfs_log_inode(ip); 1057 error = xfs_log_inode(ip);
@@ -1103,27 +1063,29 @@ xfs_fs_write_inode(
1103 * We make this non-blocking if the inode is contended, return 1063 * We make this non-blocking if the inode is contended, return
1104 * EAGAIN to indicate to the caller that they did not succeed. 1064 * EAGAIN to indicate to the caller that they did not succeed.
1105 * This prevents the flush path from blocking on inodes inside 1065 * This prevents the flush path from blocking on inodes inside
1106 * another operation right now, they get caught later by xfs_sync. 1066 * another operation right now, they get caught later by
1067 * xfs_sync.
1107 */ 1068 */
1108 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) 1069 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1109 goto out; 1070 goto out;
1110 }
1111 1071
1112 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) 1072 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1113 goto out_unlock; 1073 goto out_unlock;
1114 1074
1115 /* 1075 /*
1116 * Now we have the flush lock and the inode is not pinned, we can check 1076 * Now we have the flush lock and the inode is not pinned, we
1117 * if the inode is really clean as we know that there are no pending 1077 * can check if the inode is really clean as we know that
1118 * transaction completions, it is not waiting on the delayed write 1078 * there are no pending transaction completions, it is not
1119 * queue and there is no IO in progress. 1079 * waiting on the delayed write queue and there is no IO in
1120 */ 1080 * progress.
1121 if (xfs_inode_clean(ip)) { 1081 */
1122 xfs_ifunlock(ip); 1082 if (xfs_inode_clean(ip)) {
1123 error = 0; 1083 xfs_ifunlock(ip);
1124 goto out_unlock; 1084 error = 0;
1085 goto out_unlock;
1086 }
1087 error = xfs_iflush(ip, 0);
1125 } 1088 }
1126 error = xfs_iflush(ip, 0);
1127 1089
1128 out_unlock: 1090 out_unlock:
1129 xfs_iunlock(ip, XFS_ILOCK_SHARED); 1091 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1138,12 +1100,15 @@ xfs_fs_write_inode(
1138} 1100}
1139 1101
1140STATIC void 1102STATIC void
1141xfs_fs_clear_inode( 1103xfs_fs_evict_inode(
1142 struct inode *inode) 1104 struct inode *inode)
1143{ 1105{
1144 xfs_inode_t *ip = XFS_I(inode); 1106 xfs_inode_t *ip = XFS_I(inode);
1145 1107
1146 xfs_itrace_entry(ip); 1108 trace_xfs_evict_inode(ip);
1109
1110 truncate_inode_pages(&inode->i_data, 0);
1111 end_writeback(inode);
1147 XFS_STATS_INC(vn_rele); 1112 XFS_STATS_INC(vn_rele);
1148 XFS_STATS_INC(vn_remove); 1113 XFS_STATS_INC(vn_remove);
1149 XFS_STATS_DEC(vn_active); 1114 XFS_STATS_DEC(vn_active);
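The rename tracks the 2.6.36 VFS change that folded ->clear_inode()/->delete_inode() into a single ->evict_inode(), with the filesystem now responsible for dropping the page cache and completing writeback itself. The minimal skeleton such an implementation follows (example_evict_inode is a hypothetical name):

	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);	/* drop page cache */
		end_writeback(inode);		/* replaces clear_inode() */
		/* filesystem-specific teardown follows */
	}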
@@ -1180,22 +1145,13 @@ xfs_fs_put_super(
1180{ 1145{
1181 struct xfs_mount *mp = XFS_M(sb); 1146 struct xfs_mount *mp = XFS_M(sb);
1182 1147
1148 /*
1149 * Unregister the memory shrinker before we tear down the mount
1150 * structure so we don't have memory reclaim racing with us here.
1151 */
1152 xfs_inode_shrinker_unregister(mp);
1183 xfs_syncd_stop(mp); 1153 xfs_syncd_stop(mp);
1184 1154
1185 if (!(sb->s_flags & MS_RDONLY)) {
1186 /*
1187 * XXX(hch): this should be SYNC_WAIT.
1188 *
1189 * Or more likely not needed at all because the VFS is already
1190 * calling ->sync_fs after shutting down all filestem
1191 * operations and just before calling ->put_super.
1192 */
1193 xfs_sync_data(mp, 0);
1194 xfs_sync_attr(mp, 0);
1195 }
1196
1197 XFS_SEND_PREUNMOUNT(mp);
1198
1199 /* 1155 /*
1200 * Blow away any referenced inode in the filestreams cache. 1156 * Blow away any referenced inode in the filestreams cache.
1201 * This can and will cause log traffic as inodes go inactive 1157 * This can and will cause log traffic as inodes go inactive
@@ -1205,14 +1161,10 @@ xfs_fs_put_super(
1205 1161
1206 XFS_bflush(mp->m_ddev_targp); 1162 XFS_bflush(mp->m_ddev_targp);
1207 1163
1208 XFS_SEND_UNMOUNT(mp);
1209
1210 xfs_unmountfs(mp); 1164 xfs_unmountfs(mp);
1211 xfs_freesb(mp); 1165 xfs_freesb(mp);
1212 xfs_inode_shrinker_unregister(mp);
1213 xfs_icsb_destroy_counters(mp); 1166 xfs_icsb_destroy_counters(mp);
1214 xfs_close_devices(mp); 1167 xfs_close_devices(mp);
1215 xfs_dmops_put(mp);
1216 xfs_free_fsname(mp); 1168 xfs_free_fsname(mp);
1217 kfree(mp); 1169 kfree(mp);
1218} 1170}
@@ -1274,6 +1226,7 @@ xfs_fs_statfs(
1274 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1226 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1275 __uint64_t fakeinos, id; 1227 __uint64_t fakeinos, id;
1276 xfs_extlen_t lsize; 1228 xfs_extlen_t lsize;
1229 __int64_t ffree;
1277 1230
1278 statp->f_type = XFS_SB_MAGIC; 1231 statp->f_type = XFS_SB_MAGIC;
1279 statp->f_namelen = MAXNAMELEN - 1; 1232 statp->f_namelen = MAXNAMELEN - 1;
@@ -1297,7 +1250,11 @@ xfs_fs_statfs(
1297 statp->f_files = min_t(typeof(statp->f_files), 1250 statp->f_files = min_t(typeof(statp->f_files),
1298 statp->f_files, 1251 statp->f_files,
1299 mp->m_maxicount); 1252 mp->m_maxicount);
1300 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1253
1254 /* make sure statp->f_ffree does not underflow */
1255 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1256 statp->f_ffree = max_t(__int64_t, ffree, 0);
1257
1301 spin_unlock(&mp->m_sb_lock); 1258 spin_unlock(&mp->m_sb_lock);
1302 1259
1303 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || 1260 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
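The wrap is real: f_files is clamped to m_maxicount just above, while the used-inode count (sb_icount - sb_ifree) is not, so lowering imaxpct on a filesystem that has already allocated more inodes makes the unsigned subtraction go negative. A worked example:

	/* Say f_files is clamped to 1000 while 1200 inodes are in use:
	 *   before: statp->f_ffree = 1000 - 1200	wraps to ~1.8e19
	 *   after:  ffree = -200 in a signed 64-bit temp, clamped to 0 */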
@@ -1450,7 +1407,7 @@ xfs_fs_freeze(
1450 1407
1451 xfs_save_resvblks(mp); 1408 xfs_save_resvblks(mp);
1452 xfs_quiesce_attr(mp); 1409 xfs_quiesce_attr(mp);
1453 return -xfs_fs_log_dummy(mp); 1410 return -xfs_fs_log_dummy(mp, SYNC_WAIT);
1454} 1411}
1455 1412
1456STATIC int 1413STATIC int
@@ -1530,7 +1487,6 @@ xfs_fs_fill_super(
1530 struct inode *root; 1487 struct inode *root;
1531 struct xfs_mount *mp = NULL; 1488 struct xfs_mount *mp = NULL;
1532 int flags = 0, error = ENOMEM; 1489 int flags = 0, error = ENOMEM;
1533 char *mtpt = NULL;
1534 1490
1535 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 1491 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
1536 if (!mp) 1492 if (!mp)
@@ -1546,7 +1502,7 @@ xfs_fs_fill_super(
1546 mp->m_super = sb; 1502 mp->m_super = sb;
1547 sb->s_fs_info = mp; 1503 sb->s_fs_info = mp;
1548 1504
1549 error = xfs_parseargs(mp, (char *)data, &mtpt); 1505 error = xfs_parseargs(mp, (char *)data);
1550 if (error) 1506 if (error)
1551 goto out_free_fsname; 1507 goto out_free_fsname;
1552 1508
@@ -1558,16 +1514,12 @@ xfs_fs_fill_super(
1558#endif 1514#endif
1559 sb->s_op = &xfs_super_operations; 1515 sb->s_op = &xfs_super_operations;
1560 1516
1561 error = xfs_dmops_get(mp);
1562 if (error)
1563 goto out_free_fsname;
1564
1565 if (silent) 1517 if (silent)
1566 flags |= XFS_MFSI_QUIET; 1518 flags |= XFS_MFSI_QUIET;
1567 1519
1568 error = xfs_open_devices(mp); 1520 error = xfs_open_devices(mp);
1569 if (error) 1521 if (error)
1570 goto out_put_dmops; 1522 goto out_free_fsname;
1571 1523
1572 if (xfs_icsb_init_counters(mp)) 1524 if (xfs_icsb_init_counters(mp))
1573 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1525 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
@@ -1595,8 +1547,6 @@ xfs_fs_fill_super(
1595 if (error) 1547 if (error)
1596 goto out_filestream_unmount; 1548 goto out_filestream_unmount;
1597 1549
-	XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
-
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
 	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1625,7 +1575,6 @@ xfs_fs_fill_super(
 
 	xfs_inode_shrinker_register(mp);
 
-	kfree(mtpt);
 	return 0;
 
 out_filestream_unmount:
@@ -1635,11 +1584,8 @@ xfs_fs_fill_super(
 out_destroy_counters:
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
-out_put_dmops:
-	xfs_dmops_put(mp);
 out_free_fsname:
 	xfs_free_fsname(mp);
-	kfree(mtpt);
 	kfree(mp);
 out:
 	return -error;
@@ -1683,7 +1629,7 @@ static const struct super_operations xfs_super_operations = {
 	.destroy_inode		= xfs_fs_destroy_inode,
 	.dirty_inode		= xfs_fs_dirty_inode,
 	.write_inode		= xfs_fs_write_inode,
-	.clear_inode		= xfs_fs_clear_inode,
+	.evict_inode		= xfs_fs_evict_inode,
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
@@ -1746,16 +1692,22 @@ xfs_init_zones(void)
 	if (!xfs_trans_zone)
 		goto out_destroy_ifork_zone;
 
+	xfs_log_item_desc_zone =
+		kmem_zone_init(sizeof(struct xfs_log_item_desc),
+			       "xfs_log_item_desc");
+	if (!xfs_log_item_desc_zone)
+		goto out_destroy_trans_zone;
+
 	/*
 	 * The size of the zone allocated buf log item is the maximum
 	 * size possible under XFS. This wastes a little bit of memory,
 	 * but it is much faster.
 	 */
 	xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
-				(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+				(((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
 				  NBWORD) * sizeof(int))), "xfs_buf_item");
 	if (!xfs_buf_item_zone)
-		goto out_destroy_trans_zone;
+		goto out_destroy_log_item_desc_zone;
 
 	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
 			((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1792,6 +1744,8 @@ xfs_init_zones(void)
 	kmem_zone_destroy(xfs_efd_zone);
 out_destroy_buf_item_zone:
 	kmem_zone_destroy(xfs_buf_item_zone);
+out_destroy_log_item_desc_zone:
+	kmem_zone_destroy(xfs_log_item_desc_zone);
 out_destroy_trans_zone:
 	kmem_zone_destroy(xfs_trans_zone);
 out_destroy_ifork_zone:
@@ -1822,6 +1776,7 @@ xfs_destroy_zones(void)
 	kmem_zone_destroy(xfs_efi_zone);
 	kmem_zone_destroy(xfs_efd_zone);
 	kmem_zone_destroy(xfs_buf_item_zone);
+	kmem_zone_destroy(xfs_log_item_desc_zone);
 	kmem_zone_destroy(xfs_trans_zone);
 	kmem_zone_destroy(xfs_ifork_zone);
 	kmem_zone_destroy(xfs_dabuf_zone);
@@ -1870,7 +1825,6 @@ init_xfs_fs(void)
 		goto out_cleanup_procfs;
 
 	vfs_initquota();
-	xfs_inode_shrinker_init();
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
@@ -1898,7 +1852,6 @@ exit_xfs_fs(void)
 {
 	vfs_exitquota();
 	unregister_filesystem(&xfs_fs_type);
-	xfs_inode_shrinker_destroy();
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
 	xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..1ef4a4d2d997 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,12 +56,6 @@ extern void xfs_qm_exit(void);
 # define XFS_BIGFS_STRING
 #endif
 
-#ifdef CONFIG_XFS_DMAPI
-# define XFS_DMAPI_STRING	"dmapi support, "
-#else
-# define XFS_DMAPI_STRING
-#endif
-
 #ifdef DEBUG
 # define XFS_DBG_STRING		"debug"
 #else
@@ -72,7 +66,6 @@ extern void xfs_qm_exit(void);
 		XFS_SECURITY_STRING \
 		XFS_REALTIME_STRING \
 		XFS_BIGFS_STRING \
-		XFS_DMAPI_STRING \
 		XFS_DBG_STRING /* DBG must be last */
 
 struct xfs_inode;
@@ -85,7 +78,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
-extern struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler *xfs_xattr_handlers[];
 extern const struct quotactl_ops xfs_quotactl_operations;
 
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a427c638d909..81976ffed7d6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -24,27 +24,17 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_inode.h" 29#include "xfs_inode.h"
37#include "xfs_dinode.h" 30#include "xfs_dinode.h"
38#include "xfs_error.h" 31#include "xfs_error.h"
39#include "xfs_mru_cache.h"
40#include "xfs_filestream.h" 32#include "xfs_filestream.h"
41#include "xfs_vnodeops.h" 33#include "xfs_vnodeops.h"
42#include "xfs_utils.h"
43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
45#include "xfs_rw.h"
46#include "xfs_quota.h" 35#include "xfs_quota.h"
47#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
48 38
49#include <linux/kthread.h> 39#include <linux/kthread.h>
50#include <linux/freezer.h> 40#include <linux/freezer.h>
@@ -144,6 +134,41 @@ restart:
144 return last_error; 134 return last_error;
145} 135}
146 136
137/*
138 * Select the next per-ag structure to iterate during the walk. The reclaim
139 * walk is optimised only to walk AGs with reclaimable inodes in them.
140 */
141static struct xfs_perag *
142xfs_inode_ag_iter_next_pag(
143 struct xfs_mount *mp,
144 xfs_agnumber_t *first,
145 int tag)
146{
147 struct xfs_perag *pag = NULL;
148
149 if (tag == XFS_ICI_RECLAIM_TAG) {
150 int found;
151 int ref;
152
153 spin_lock(&mp->m_perag_lock);
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, *first, 1, tag);
156 if (found <= 0) {
157 spin_unlock(&mp->m_perag_lock);
158 return NULL;
159 }
160 *first = pag->pag_agno + 1;
161 /* open coded pag reference increment */
162 ref = atomic_inc_return(&pag->pag_ref);
163 spin_unlock(&mp->m_perag_lock);
164 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
165 } else {
166 pag = xfs_perag_get(mp, *first);
167 (*first)++;
168 }
169 return pag;
170}
171
147int 172int
148xfs_inode_ag_iterator( 173xfs_inode_ag_iterator(
149 struct xfs_mount *mp, 174 struct xfs_mount *mp,
@@ -154,20 +179,15 @@ xfs_inode_ag_iterator(
154 int exclusive, 179 int exclusive,
155 int *nr_to_scan) 180 int *nr_to_scan)
156{ 181{
182 struct xfs_perag *pag;
157 int error = 0; 183 int error = 0;
158 int last_error = 0; 184 int last_error = 0;
159 xfs_agnumber_t ag; 185 xfs_agnumber_t ag;
160 int nr; 186 int nr;
161 187
162 nr = nr_to_scan ? *nr_to_scan : INT_MAX; 188 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
163 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 189 ag = 0;
164 struct xfs_perag *pag; 190 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
165
166 pag = xfs_perag_get(mp, ag);
167 if (!pag->pag_ici_init) {
168 xfs_perag_put(pag);
169 continue;
170 }
171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 191 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
172 exclusive, &nr); 192 exclusive, &nr);
173 xfs_perag_put(pag); 193 xfs_perag_put(pag);
@@ -289,7 +309,7 @@ xfs_sync_inode_attr(
289/* 309/*
290 * Write out pagecache data for the whole filesystem. 310 * Write out pagecache data for the whole filesystem.
291 */ 311 */
292int 312STATIC int
293xfs_sync_data( 313xfs_sync_data(
294 struct xfs_mount *mp, 314 struct xfs_mount *mp,
295 int flags) 315 int flags)
@@ -310,7 +330,7 @@ xfs_sync_data(
310/* 330/*
311 * Write out inode metadata (attributes) for the whole filesystem. 331 * Write out inode metadata (attributes) for the whole filesystem.
312 */ 332 */
313int 333STATIC int
314xfs_sync_attr( 334xfs_sync_attr(
315 struct xfs_mount *mp, 335 struct xfs_mount *mp,
316 int flags) 336 int flags)
@@ -322,102 +342,24 @@ xfs_sync_attr(
322} 342}
323 343
324STATIC int 344STATIC int
325xfs_commit_dummy_trans(
326 struct xfs_mount *mp,
327 uint flags)
328{
329 struct xfs_inode *ip = mp->m_rootip;
330 struct xfs_trans *tp;
331 int error;
332
333 /*
334 * Put a dummy transaction in the log to tell recovery
335 * that all others are OK.
336 */
337 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
338 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
339 if (error) {
340 xfs_trans_cancel(tp, 0);
341 return error;
342 }
343
344 xfs_ilock(ip, XFS_ILOCK_EXCL);
345
346 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
347 xfs_trans_ihold(tp, ip);
348 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
349 error = xfs_trans_commit(tp, 0);
350 xfs_iunlock(ip, XFS_ILOCK_EXCL);
351
352 /* the log force ensures this transaction is pushed to disk */
353 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
354 return error;
355}
356
357STATIC int
358xfs_sync_fsdata( 345xfs_sync_fsdata(
359 struct xfs_mount *mp, 346 struct xfs_mount *mp)
360 int flags)
361{ 347{
362 struct xfs_buf *bp; 348 struct xfs_buf *bp;
363 struct xfs_buf_log_item *bip;
364 int error = 0;
365
366 /*
367 * If this is xfssyncd() then only sync the superblock if we can
368 * lock it without sleeping and it is not pinned.
369 */
370 if (flags & SYNC_TRYLOCK) {
371 ASSERT(!(flags & SYNC_WAIT));
372
373 bp = xfs_getsb(mp, XBF_TRYLOCK);
374 if (!bp)
375 goto out;
376
377 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
378 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
379 goto out_brelse;
380 } else {
381 bp = xfs_getsb(mp, 0);
382
383 /*
384 * If the buffer is pinned then push on the log so we won't
385 * get stuck waiting in the write for someone, maybe
386 * ourselves, to flush the log.
387 *
388 * Even though we just pushed the log above, we did not have
389 * the superblock buffer locked at that point so it can
390 * become pinned in between there and here.
391 */
392 if (XFS_BUF_ISPINNED(bp))
393 xfs_log_force(mp, 0);
394 }
395
396
397 if (flags & SYNC_WAIT)
398 XFS_BUF_UNASYNC(bp);
399 else
400 XFS_BUF_ASYNC(bp);
401
402 error = xfs_bwrite(mp, bp);
403 if (error)
404 return error;
405 349
406 /* 350 /*
407 * If this is a data integrity sync make sure all pending buffers 351 * If the buffer is pinned then push on the log so we won't get stuck
408 * are flushed out for the log coverage check below. 352 * waiting in the write for someone, maybe ourselves, to flush the log.
353 *
354 * Even though we just pushed the log above, we did not have the
355 * superblock buffer locked at that point so it can become pinned in
356 * between there and here.
409 */ 357 */
410 if (flags & SYNC_WAIT) 358 bp = xfs_getsb(mp, 0);
411 xfs_flush_buftarg(mp->m_ddev_targp, 1); 359 if (XFS_BUF_ISPINNED(bp))
412 360 xfs_log_force(mp, 0);
413 if (xfs_log_need_covered(mp))
414 error = xfs_commit_dummy_trans(mp, flags);
415 return error;
416 361
417 out_brelse: 362 return xfs_bwrite(mp, bp);
418 xfs_buf_relse(bp);
419 out:
420 return error;
421} 363}
422 364
423/* 365/*
@@ -441,7 +383,7 @@ int
441xfs_quiesce_data( 383xfs_quiesce_data(
442 struct xfs_mount *mp) 384 struct xfs_mount *mp)
443{ 385{
444 int error; 386 int error, error2 = 0;
445 387
446 /* push non-blocking */ 388 /* push non-blocking */
447 xfs_sync_data(mp, 0); 389 xfs_sync_data(mp, 0);
@@ -452,13 +394,20 @@ xfs_quiesce_data(
452 xfs_qm_sync(mp, SYNC_WAIT); 394 xfs_qm_sync(mp, SYNC_WAIT);
453 395
454 /* write superblock and hoover up shutdown errors */ 396 /* write superblock and hoover up shutdown errors */
455 error = xfs_sync_fsdata(mp, SYNC_WAIT); 397 error = xfs_sync_fsdata(mp);
398
399 /* make sure all delwri buffers are written out */
400 xfs_flush_buftarg(mp->m_ddev_targp, 1);
401
402 /* mark the log as covered if needed */
403 if (xfs_log_need_covered(mp))
404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
456 405
457 /* flush data-only devices */ 406 /* flush data-only devices */
458 if (mp->m_rtdev_targp) 407 if (mp->m_rtdev_targp)
459 XFS_bflush(mp->m_rtdev_targp); 408 XFS_bflush(mp->m_rtdev_targp);
460 409
461 return error; 410 return error ? error : error2;
462} 411}
463 412
464STATIC void 413STATIC void
@@ -581,9 +530,9 @@ xfs_flush_inodes(
581} 530}
582 531
583/* 532/*
584 * Every sync period we need to unpin all items, reclaim inodes, sync 533 * Every sync period we need to unpin all items, reclaim inodes and sync
585 * quota and write out the superblock. We might need to cover the log 534 * disk quotas. We might need to cover the log to indicate that the
586 * to indicate it is idle. 535 * filesystem is idle and not frozen.
587 */ 536 */
588STATIC void 537STATIC void
589xfs_sync_worker( 538xfs_sync_worker(
@@ -597,7 +546,9 @@ xfs_sync_worker(
597 xfs_reclaim_inodes(mp, 0); 546 xfs_reclaim_inodes(mp, 0);
598 /* dgc: errors ignored here */ 547 /* dgc: errors ignored here */
599 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
600 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 549 if (mp->m_super->s_frozen == SB_UNFROZEN &&
550 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0);
601 } 552 }
602 mp->m_sync_seq++; 553 mp->m_sync_seq++;
603 wake_up(&mp->m_wait_single_sync_task); 554 wake_up(&mp->m_wait_single_sync_task);
@@ -660,7 +611,7 @@ xfs_syncd_init(
660 mp->m_sync_work.w_syncer = xfs_sync_worker; 611 mp->m_sync_work.w_syncer = xfs_sync_worker;
661 mp->m_sync_work.w_mount = mp; 612 mp->m_sync_work.w_mount = mp;
662 mp->m_sync_work.w_completion = NULL; 613 mp->m_sync_work.w_completion = NULL;
663 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); 614 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
664 if (IS_ERR(mp->m_sync_task)) 615 if (IS_ERR(mp->m_sync_task))
665 return -PTR_ERR(mp->m_sync_task); 616 return -PTR_ERR(mp->m_sync_task);
666 return 0; 617 return 0;
@@ -681,6 +632,17 @@ __xfs_inode_set_reclaim_tag(
681 radix_tree_tag_set(&pag->pag_ici_root, 632 radix_tree_tag_set(&pag->pag_ici_root,
682 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 633 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
683 XFS_ICI_RECLAIM_TAG); 634 XFS_ICI_RECLAIM_TAG);
635
636 if (!pag->pag_ici_reclaimable) {
637 /* propagate the reclaim tag up into the perag radix tree */
638 spin_lock(&ip->i_mount->m_perag_lock);
639 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
640 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
641 XFS_ICI_RECLAIM_TAG);
642 spin_unlock(&ip->i_mount->m_perag_lock);
643 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
644 -1, _RET_IP_);
645 }
684 pag->pag_ici_reclaimable++; 646 pag->pag_ici_reclaimable++;
685} 647}
686 648
@@ -706,6 +668,24 @@ xfs_inode_set_reclaim_tag(
706 xfs_perag_put(pag); 668 xfs_perag_put(pag);
707} 669}
708 670
671STATIC void
672__xfs_inode_clear_reclaim(
673 xfs_perag_t *pag,
674 xfs_inode_t *ip)
675{
676 pag->pag_ici_reclaimable--;
677 if (!pag->pag_ici_reclaimable) {
678 /* clear the reclaim tag from the perag radix tree */
679 spin_lock(&ip->i_mount->m_perag_lock);
680 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
681 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
682 XFS_ICI_RECLAIM_TAG);
683 spin_unlock(&ip->i_mount->m_perag_lock);
684 trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
685 -1, _RET_IP_);
686 }
687}
688
709void 689void
710__xfs_inode_clear_reclaim_tag( 690__xfs_inode_clear_reclaim_tag(
711 xfs_mount_t *mp, 691 xfs_mount_t *mp,
@@ -714,7 +694,7 @@ __xfs_inode_clear_reclaim_tag(
714{ 694{
715 radix_tree_tag_clear(&pag->pag_ici_root, 695 radix_tree_tag_clear(&pag->pag_ici_root,
716 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 696 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
717 pag->pag_ici_reclaimable--; 697 __xfs_inode_clear_reclaim(pag, ip);
718} 698}
719 699
720/* 700/*
@@ -853,7 +833,37 @@ out:
853reclaim: 833reclaim:
854 xfs_ifunlock(ip); 834 xfs_ifunlock(ip);
855 xfs_iunlock(ip, XFS_ILOCK_EXCL); 835 xfs_iunlock(ip, XFS_ILOCK_EXCL);
856 xfs_ireclaim(ip); 836
837 XFS_STATS_INC(xs_ig_reclaims);
838 /*
839 * Remove the inode from the per-AG radix tree.
840 *
841 * Because radix_tree_delete won't complain even if the item was never
842 * added to the tree assert that it's been there before to catch
843 * problems with the inode life time early on.
844 */
845 write_lock(&pag->pag_ici_lock);
846 if (!radix_tree_delete(&pag->pag_ici_root,
847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
848 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip);
850 write_unlock(&pag->pag_ici_lock);
851
852 /*
853 * Here we do an (almost) spurious inode lock in order to coordinate
854 * with inode cache radix tree lookups. This is because the lookup
855 * can reference the inodes in the cache without taking references.
856 *
857 * We make that OK here by ensuring that we wait until the inode is
858 * unlocked after the lookup before we go ahead and free it. We get
859 * both the ilock and the iolock because the code may need to drop the
860 * ilock one but will still hold the iolock.
861 */
862 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
863 xfs_qm_dqdetach(ip);
864 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
865
866 xfs_inode_free(ip);
857 return error; 867 return error;
858 868
859} 869}
@@ -869,88 +879,52 @@ xfs_reclaim_inodes(
869 879
870/* 880/*
871 * Shrinker infrastructure. 881 * Shrinker infrastructure.
872 *
873 * This is all far more complex than it needs to be. It adds a global list of
874 * mounts because the shrinkers can only call a global context. We need to make
875 * the shrinkers pass a context to avoid the need for global state.
876 */ 882 */
877static LIST_HEAD(xfs_mount_list);
878static struct rw_semaphore xfs_mount_list_lock;
879
880static int 883static int
881xfs_reclaim_inode_shrink( 884xfs_reclaim_inode_shrink(
885 struct shrinker *shrink,
882 int nr_to_scan, 886 int nr_to_scan,
883 gfp_t gfp_mask) 887 gfp_t gfp_mask)
884{ 888{
885 struct xfs_mount *mp; 889 struct xfs_mount *mp;
886 struct xfs_perag *pag; 890 struct xfs_perag *pag;
887 xfs_agnumber_t ag; 891 xfs_agnumber_t ag;
888 int reclaimable = 0; 892 int reclaimable;
889 893
894 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
890 if (nr_to_scan) { 895 if (nr_to_scan) {
891 if (!(gfp_mask & __GFP_FS)) 896 if (!(gfp_mask & __GFP_FS))
892 return -1; 897 return -1;
893 898
894 down_read(&xfs_mount_list_lock); 899 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
895 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
896 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
897 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 900 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
898 if (nr_to_scan <= 0) 901 /* if we don't exhaust the scan, don't bother coming back */
899 break; 902 if (nr_to_scan > 0)
900 } 903 return -1;
901 up_read(&xfs_mount_list_lock); 904 }
902 }
903
904 down_read(&xfs_mount_list_lock);
905 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
906 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
907 905
908 pag = xfs_perag_get(mp, ag); 906 reclaimable = 0;
909 if (!pag->pag_ici_init) { 907 ag = 0;
910 xfs_perag_put(pag); 908 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
911 continue; 909 XFS_ICI_RECLAIM_TAG))) {
912 } 910 reclaimable += pag->pag_ici_reclaimable;
913 reclaimable += pag->pag_ici_reclaimable; 911 xfs_perag_put(pag);
914 xfs_perag_put(pag);
915 }
916 } 912 }
917 up_read(&xfs_mount_list_lock);
918 return reclaimable; 913 return reclaimable;
919} 914}
920 915
921static struct shrinker xfs_inode_shrinker = {
922 .shrink = xfs_reclaim_inode_shrink,
923 .seeks = DEFAULT_SEEKS,
924};
925
926void __init
927xfs_inode_shrinker_init(void)
928{
929 init_rwsem(&xfs_mount_list_lock);
930 register_shrinker(&xfs_inode_shrinker);
931}
932
933void
934xfs_inode_shrinker_destroy(void)
935{
936 ASSERT(list_empty(&xfs_mount_list));
937 unregister_shrinker(&xfs_inode_shrinker);
938}
939
940void 916void
941xfs_inode_shrinker_register( 917xfs_inode_shrinker_register(
942 struct xfs_mount *mp) 918 struct xfs_mount *mp)
943{ 919{
944 down_write(&xfs_mount_list_lock); 920 mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
945 list_add_tail(&mp->m_mplist, &xfs_mount_list); 921 mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
946 up_write(&xfs_mount_list_lock); 922 register_shrinker(&mp->m_inode_shrink);
947} 923}
948 924
949void 925void
950xfs_inode_shrinker_unregister( 926xfs_inode_shrinker_unregister(
951 struct xfs_mount *mp) 927 struct xfs_mount *mp)
952{ 928{
953 down_write(&xfs_mount_list_lock); 929 unregister_shrinker(&mp->m_inode_shrink);
954 list_del(&mp->m_mplist);
955 up_write(&xfs_mount_list_lock);
956} 930}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index cdcbaaca9880..fe78726196f8 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -35,9 +35,6 @@ typedef struct xfs_sync_work {
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
 
-int xfs_sync_attr(struct xfs_mount *mp, int flags);
-int xfs_sync_data(struct xfs_mount *mp, int flags);
-
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
@@ -55,8 +52,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
 	int flags, int tag, int write_lock, int *nr_to_scan);
 
-void xfs_inode_shrinker_init(void);
-void xfs_inode_shrinker_destroy(void);
 void xfs_inode_shrinker_register(struct xfs_mount *mp);
 void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
 
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a107601e969..88d25d4aa56e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -24,24 +24,19 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
32#include "xfs_dir2_sf.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dinode.h" 31#include "xfs_dinode.h"
35#include "xfs_inode.h" 32#include "xfs_inode.h"
36#include "xfs_btree.h" 33#include "xfs_btree.h"
37#include "xfs_dmapi.h"
38#include "xfs_mount.h" 34#include "xfs_mount.h"
39#include "xfs_ialloc.h" 35#include "xfs_ialloc.h"
40#include "xfs_itable.h" 36#include "xfs_itable.h"
41#include "xfs_alloc.h" 37#include "xfs_alloc.h"
42#include "xfs_bmap.h" 38#include "xfs_bmap.h"
43#include "xfs_attr.h" 39#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h" 40#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h" 41#include "xfs_log_priv.h"
47#include "xfs_buf_item.h" 42#include "xfs_buf_item.h"
@@ -50,6 +45,8 @@
50#include "xfs_aops.h" 45#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h" 46#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h" 47#include "quota/xfs_dquot.h"
48#include "xfs_log_recover.h"
49#include "xfs_inode_item.h"
53 50
54/* 51/*
55 * We include this last to have the helpers above available for the trace 52 * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f0799e..be5dffd282a1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
 struct xfs_dquot;
 struct xlog_ticket;
 struct log;
+struct xlog_recover;
+struct xlog_recover_item;
+struct xfs_buf_log_format;
+struct xfs_inode_log_format;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -78,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	)
 )
 
-#define DEFINE_PERAG_REF_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
-		 unsigned long caller_ip), \
-	TP_ARGS(mp, agno, refcount, caller_ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_agnumber_t, agno) \
-		__field(int, refcount) \
-		__field(unsigned long, caller_ip) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = mp->m_super->s_dev; \
-		__entry->agno = agno; \
-		__entry->refcount = refcount; \
-		__entry->caller_ip = caller_ip; \
-	), \
-	TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->agno, \
-		  __entry->refcount, \
-		  (char *)__entry->caller_ip) \
-);
-
-DEFINE_PERAG_REF_EVENT(xfs_perag_get)
-DEFINE_PERAG_REF_EVENT(xfs_perag_put)
-
 #define DEFINE_ATTR_LIST_EVENT(name) \
 DEFINE_EVENT(xfs_attr_list_class, name, \
 	TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -118,6 +95,40 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
 
+DECLARE_EVENT_CLASS(xfs_perag_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, agno, refcount, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, refcount)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->refcount = refcount;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->refcount,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name) \
+DEFINE_EVENT(xfs_perag_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+		 unsigned long caller_ip), \
+	TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+
 TRACE_EVENT(xfs_attr_list_node_descend,
 	TP_PROTO(struct xfs_attr_list_context *ctx,
 		 struct xfs_da_node_entry *btree),
@@ -306,8 +317,6 @@ DEFINE_BUF_EVENT(xfs_buf_init);
 DEFINE_BUF_EVENT(xfs_buf_free);
 DEFINE_BUF_EVENT(xfs_buf_hold);
 DEFINE_BUF_EVENT(xfs_buf_rele);
-DEFINE_BUF_EVENT(xfs_buf_pin);
-DEFINE_BUF_EVENT(xfs_buf_unpin);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_iorequest);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
@@ -530,7 +539,7 @@ DEFINE_LOCK_EVENT(xfs_ilock_nowait);
 DEFINE_LOCK_EVENT(xfs_ilock_demote);
 DEFINE_LOCK_EVENT(xfs_iunlock);
 
-DECLARE_EVENT_CLASS(xfs_iget_class,
+DECLARE_EVENT_CLASS(xfs_inode_class,
 	TP_PROTO(struct xfs_inode *ip),
 	TP_ARGS(ip),
 	TP_STRUCT__entry(
@@ -546,47 +555,127 @@ DECLARE_EVENT_CLASS(xfs_iget_class,
 		  __entry->ino)
 )
 
-#define DEFINE_IGET_EVENT(name) \
-DEFINE_EVENT(xfs_iget_class, name, \
+#define DEFINE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_inode_class, name, \
 	TP_PROTO(struct xfs_inode *ip), \
 	TP_ARGS(ip))
-DEFINE_IGET_EVENT(xfs_iget_skip);
-DEFINE_IGET_EVENT(xfs_iget_reclaim);
-DEFINE_IGET_EVENT(xfs_iget_found);
-DEFINE_IGET_EVENT(xfs_iget_alloc);
-
-DECLARE_EVENT_CLASS(xfs_inode_class,
+DEFINE_INODE_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_reclaim);
+DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
+
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_check_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_evict_inode);
+
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+
+DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
 	TP_ARGS(ip, caller_ip),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(int, count)
+		__field(int, pincount)
 		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
 		__entry->count = atomic_read(&VFS_I(ip)->i_count);
+		__entry->pincount = atomic_read(&ip->i_pincount);
 		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx count %d caller %pf",
+	TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->count,
+		  __entry->pincount,
 		  (char *)__entry->caller_ip)
 )
 
-#define DEFINE_INODE_EVENT(name) \
-DEFINE_EVENT(xfs_inode_class, name, \
+#define DEFINE_IREF_EVENT(name) \
+DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
 	TP_ARGS(ip, caller_ip))
-DEFINE_INODE_EVENT(xfs_ihold);
-DEFINE_INODE_EVENT(xfs_irele);
-/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
-DEFINE_INODE_EVENT(xfs_inode);
-#define xfs_itrace_entry(ip) \
-	trace_xfs_inode(ip, _THIS_IP_)
+DEFINE_IREF_EVENT(xfs_ihold);
+DEFINE_IREF_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+	TP_ARGS(dp, name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dp_ino)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(dp)->i_sb->s_dev;
+		__entry->dp_ino = dp->i_ino;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d dp ino 0x%llx name %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dp_ino,
+		  __get_str(name))
+)
+
+#define DEFINE_NAMESPACE_EVENT(name) \
+DEFINE_EVENT(xfs_namespace_class, name, \
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+	TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+
+TRACE_EVENT(xfs_rename,
+	TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+		 struct xfs_name *src_name, struct xfs_name *target_name),
+	TP_ARGS(src_dp, target_dp, src_name, target_name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_dp_ino)
+		__field(xfs_ino_t, target_dp_ino)
+		__dynamic_array(char, src_name, src_name->len)
+		__dynamic_array(char, target_name, target_name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+		__entry->src_dp_ino = src_dp->i_ino;
+		__entry->target_dp_ino = target_dp->i_ino;
+		memcpy(__get_str(src_name), src_name->name, src_name->len);
+		memcpy(__get_str(target_name), target_name->name, target_name->len);
+	),
+	TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+		  " src name %s target name %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->src_dp_ino,
+		  __entry->target_dp_ino,
+		  __get_str(src_name),
+		  __get_str(target_name))
+)
 
 DECLARE_EVENT_CLASS(xfs_dquot_class,
 	TP_PROTO(struct xfs_dquot *dqp),
@@ -642,8 +731,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
 	TP_PROTO(struct xfs_dquot *dqp), \
 	TP_ARGS(dqp))
 DEFINE_DQUOT_EVENT(xfs_dqadjust);
-DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +745,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
 DEFINE_DQUOT_EVENT(xfs_dqget_hit);
 DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -669,9 +755,6 @@ DEFINE_DQUOT_EVENT(xfs_dqrele);
 DEFINE_DQUOT_EVENT(xfs_dqflush);
 DEFINE_DQUOT_EVENT(xfs_dqflush_force);
 DEFINE_DQUOT_EVENT(xfs_dqflush_done);
-/* not really iget events, but we re-use the format */
-DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
-DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
 
 DECLARE_EVENT_CLASS(xfs_loggrant_class,
 	TP_PROTO(struct log *log, struct xlog_ticket *tic),
@@ -767,165 +850,177 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
 
-#define DEFINE_RW_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
-	TP_ARGS(ip, count, offset, flags), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(xfs_fsize_t, size) \
-		__field(xfs_fsize_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-		__field(int, flags) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-		__entry->flags = flags; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count 0x%zx ioflags %s", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count, \
-		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
+DECLARE_EVENT_CLASS(xfs_file_class,
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+	TP_ARGS(ip, count, offset, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fsize_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count 0x%zx ioflags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
 )
+
+#define DEFINE_RW_EVENT(name) \
+DEFINE_EVENT(xfs_file_class, name, \
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+	TP_ARGS(ip, count, offset, flags))
 DEFINE_RW_EVENT(xfs_file_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 DEFINE_RW_EVENT(xfs_file_splice_write);
 
-
-#define DEFINE_PAGE_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
-	TP_ARGS(inode, page, off), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(pgoff_t, pgoff) \
-		__field(loff_t, size) \
-		__field(unsigned long, offset) \
-		__field(int, delalloc) \
-		__field(int, unmapped) \
-		__field(int, unwritten) \
-	), \
-	TP_fast_assign( \
-		int delalloc = -1, unmapped = -1, unwritten = -1; \
-		\
-		if (page_has_buffers(page)) \
-			xfs_count_page_state(page, &delalloc, \
-					     &unmapped, &unwritten); \
-		__entry->dev = inode->i_sb->s_dev; \
-		__entry->ino = XFS_I(inode)->i_ino; \
-		__entry->pgoff = page_offset(page); \
-		__entry->size = i_size_read(inode); \
-		__entry->offset = off; \
-		__entry->delalloc = delalloc; \
-		__entry->unmapped = unmapped; \
-		__entry->unwritten = unwritten; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
-		  "delalloc %d unmapped %d unwritten %d", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->pgoff, \
-		  __entry->size, \
-		  __entry->offset, \
-		  __entry->delalloc, \
-		  __entry->unmapped, \
-		  __entry->unwritten) \
+DECLARE_EVENT_CLASS(xfs_page_class,
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
+	TP_ARGS(inode, page, off),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(pgoff_t, pgoff)
+		__field(loff_t, size)
+		__field(unsigned long, offset)
+		__field(int, delalloc)
+		__field(int, unwritten)
+	),
+	TP_fast_assign(
+		int delalloc = -1, unwritten = -1;
+
+		if (page_has_buffers(page))
+			xfs_count_page_state(page, &delalloc, &unwritten);
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = XFS_I(inode)->i_ino;
+		__entry->pgoff = page_offset(page);
+		__entry->size = i_size_read(inode);
+		__entry->offset = off;
+		__entry->delalloc = delalloc;
+		__entry->unwritten = unwritten;
+	),
+	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+		  "delalloc %d unwritten %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->pgoff,
+		  __entry->size,
+		  __entry->offset,
+		  __entry->delalloc,
+		  __entry->unwritten)
 )
+
+#define DEFINE_PAGE_EVENT(name) \
+DEFINE_EVENT(xfs_page_class, name, \
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
+	TP_ARGS(inode, page, off))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
 
-#define DEFINE_IOMAP_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-		 int flags, struct xfs_bmbt_irec *irec), \
-	TP_ARGS(ip, offset, count, flags, irec), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(loff_t, size) \
-		__field(loff_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-		__field(int, flags) \
-		__field(xfs_fileoff_t, startoff) \
-		__field(xfs_fsblock_t, startblock) \
-		__field(xfs_filblks_t, blockcount) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-		__entry->flags = flags; \
-		__entry->startoff = irec ? irec->br_startoff : 0; \
-		__entry->startblock = irec ? irec->br_startblock : 0; \
-		__entry->blockcount = irec ? irec->br_blockcount : 0; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count %zd flags %s " \
-		  "startoff 0x%llx startblock %lld blockcount 0x%llx", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count, \
-		  __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
-		  __entry->startoff, \
-		  (__int64_t)__entry->startblock, \
-		  __entry->blockcount) \
+DECLARE_EVENT_CLASS(xfs_iomap_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+		 int flags, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, offset, count, flags, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+		__entry->startoff = irec ? irec->br_startoff : 0;
+		__entry->startblock = irec ? irec->br_startblock : 0;
+		__entry->blockcount = irec ? irec->br_blockcount : 0;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count %zd flags %s "
+		  "startoff 0x%llx startblock %lld blockcount 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+		  __entry->startoff,
+		  (__int64_t)__entry->startblock,
+		  __entry->blockcount)
 )
+
+#define DEFINE_IOMAP_EVENT(name) \
+DEFINE_EVENT(xfs_iomap_class, name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+		 int flags, struct xfs_bmbt_irec *irec), \
+	TP_ARGS(ip, offset, count, flags, irec))
 DEFINE_IOMAP_EVENT(xfs_iomap_enter);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
 
-#define DEFINE_SIMPLE_IO_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
-	TP_ARGS(ip, offset, count), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(loff_t, size) \
-		__field(loff_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count %zd", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count) \
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+	TP_ARGS(ip, offset, count),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count %zd",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count)
 );
+
+#define DEFINE_SIMPLE_IO_EVENT(name) \
+DEFINE_EVENT(xfs_simple_io_class, name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+	TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
 
@@ -1051,83 +1146,112 @@ TRACE_EVENT(xfs_bunmap,
 
 );
 
+#define XFS_BUSY_SYNC \
+	{ 0,	"async" }, \
+	{ 1,	"sync" }
+
 TRACE_EVENT(xfs_alloc_busy,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, int slot),
-	TP_ARGS(mp, agno, agbno, len, slot),
+	TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
+	TP_ARGS(trans, agno, agbno, len, sync),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(int, tid)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(int, slot)
+		__field(int, sync)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->tid = trans->t_ticket->t_tid;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->slot = slot;
+		__entry->sync = sync;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
+	TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
+		  __entry->tid,
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
-		  __entry->slot)
+		  __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
 
 );
 
-#define XFS_BUSY_STATES \
-	{ 0,	"found" }, \
-	{ 1,	"missing" }
-
 TRACE_EVENT(xfs_alloc_unbusy,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int slot, int found),
-	TP_ARGS(mp, agno, slot, found),
+		 xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, agbno, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
-		__field(int, slot)
-		__field(int, found)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
-		__entry->slot = slot;
-		__entry->found = found;
+		__entry->agbno = agbno;
+		__entry->len = len;
 	),
-	TP_printk("dev %d:%d agno %u slot %d %s",
+	TP_printk("dev %d:%d agno %u agbno %u len %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
-		  __entry->slot,
-		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+		  __entry->agbno,
+		  __entry->len)
 );
 
+#define XFS_BUSY_STATES \
+	{ 0,	"missing" }, \
+	{ 1,	"found" }
+
 TRACE_EVENT(xfs_alloc_busysearch,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, xfs_lsn_t lsn),
-	TP_ARGS(mp, agno, agbno, len, lsn),
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len, int found),
+	TP_ARGS(mp, agno, agbno, len, found),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(xfs_lsn_t, lsn)
+		__field(int, found)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->lsn = lsn;
+		__entry->found = found;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
+	TP_printk("dev %d:%d agno %u agbno %u len %u %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
+		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+	TP_PROTO(struct xfs_trans *trans),
+	TP_ARGS(trans),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->lsn = trans->t_commit_lsn;
+	),
+	TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
 		  __entry->lsn)
 );
 
@@ -1495,6 +1619,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
 
+DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
+	TP_PROTO(struct log *log, struct xlog_recover *trans,
+		struct xlog_recover_item *item, int pass),
+	TP_ARGS(log, trans, item, pass),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, item)
+		__field(xlog_tid_t, tid)
+		__field(int, type)
+		__field(int, pass)
+		__field(int, count)
+		__field(int, total)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->item = (unsigned long)item;
+		__entry->tid = trans->r_log_tid;
+		__entry->type = ITEM_TYPE(item);
+		__entry->pass = pass;
+		__entry->count = item->ri_cnt;
+		__entry->total = item->ri_total;
+	),
+	TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
+		  "item region count/total %d/%d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tid,
+		  __entry->pass,
+		  (void *)__entry->item,
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __entry->count,
+		  __entry->total)
+)
+
+#define DEFINE_LOG_RECOVER_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_item_class, name, \
+	TP_PROTO(struct log *log, struct xlog_recover *trans, \
+		struct xlog_recover_item *item, int pass), \
+	TP_ARGS(log, trans, item, pass))
+
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
+	TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
+	TP_ARGS(log, buf_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(__int64_t, blkno)
+		__field(unsigned short, len)
+		__field(unsigned short, flags)
+		__field(unsigned short, size)
+		__field(unsigned int, map_size)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->blkno = buf_f->blf_blkno;
+		__entry->len = buf_f->blf_len;
+		__entry->flags = buf_f->blf_flags;
+		__entry->size = buf_f->blf_size;
+		__entry->map_size = buf_f->blf_map_size;
+	),
+	TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+			"map_size %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->blkno,
+		  __entry->len,
+		  __entry->flags,
+		  __entry->size,
+		  __entry->map_size)
+)
+
+#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
+	TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
+	TP_ARGS(log, buf_f))
+
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
+	TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
+	TP_ARGS(log, in_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned short, size)
+		__field(int, fields)
+		__field(unsigned short, asize)
+		__field(unsigned short, dsize)
+		__field(__int64_t, blkno)
+		__field(int, len)
+		__field(int, boffset)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->ino = in_f->ilf_ino;
+		__entry->size = in_f->ilf_size;
+		__entry->fields = in_f->ilf_fields;
+		__entry->asize = in_f->ilf_asize;
+		__entry->dsize = in_f->ilf_dsize;
+		__entry->blkno = in_f->ilf_blkno;
+		__entry->len = in_f->ilf_len;
+		__entry->boffset = in_f->ilf_boffset;
+	),
+	TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
+			"dsize %d, blkno 0x%llx, len %d, boffset %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->fields,
+		  __entry->asize,
+		  __entry->dsize,
+		  __entry->blkno,
+		  __entry->len,
+		  __entry->boffset)
+)
+#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
+	TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
+	TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6b..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
72 (void *)value, size, xflags); 72 (void *)value, size, xflags);
73} 73}
74 74
75static struct xattr_handler xfs_xattr_user_handler = { 75static const struct xattr_handler xfs_xattr_user_handler = {
76 .prefix = XATTR_USER_PREFIX, 76 .prefix = XATTR_USER_PREFIX,
77 .flags = 0, /* no flags implies user namespace */ 77 .flags = 0, /* no flags implies user namespace */
78 .get = xfs_xattr_get, 78 .get = xfs_xattr_get,
79 .set = xfs_xattr_set, 79 .set = xfs_xattr_set,
80}; 80};
81 81
82static struct xattr_handler xfs_xattr_trusted_handler = { 82static const struct xattr_handler xfs_xattr_trusted_handler = {
83 .prefix = XATTR_TRUSTED_PREFIX, 83 .prefix = XATTR_TRUSTED_PREFIX,
84 .flags = ATTR_ROOT, 84 .flags = ATTR_ROOT,
85 .get = xfs_xattr_get, 85 .get = xfs_xattr_get,
86 .set = xfs_xattr_set, 86 .set = xfs_xattr_set,
87}; 87};
88 88
89static struct xattr_handler xfs_xattr_security_handler = { 89static const struct xattr_handler xfs_xattr_security_handler = {
90 .prefix = XATTR_SECURITY_PREFIX, 90 .prefix = XATTR_SECURITY_PREFIX,
91 .flags = ATTR_SECURE, 91 .flags = ATTR_SECURE,
92 .get = xfs_xattr_get, 92 .get = xfs_xattr_get,
93 .set = xfs_xattr_set, 93 .set = xfs_xattr_set,
94}; 94};
95 95
96struct xattr_handler *xfs_xattr_handlers[] = { 96const struct xattr_handler *xfs_xattr_handlers[] = {
97 &xfs_xattr_user_handler, 97 &xfs_xattr_user_handler,
98 &xfs_xattr_trusted_handler, 98 &xfs_xattr_trusted_handler,
99 &xfs_xattr_security_handler, 99 &xfs_xattr_security_handler,
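The change above is pure const-ification: the handlers and the table that points at them are never modified after initialization, so declaring them const lets them live in read-only data and turns any accidental store into a compile error. A sketch of the pattern with a hypothetical handler type:

#include <stdio.h>
#include <string.h>

/* Hypothetical handler, mirroring the shape of struct xattr_handler. */
struct handler {
	const char *prefix;
	int (*get)(const char *name);
};

static int demo_get(const char *name)
{
	printf("get %s\n", name);
	return 0;
}

/* const handlers and a const table: both can be placed in .rodata. */
static const struct handler user_handler = {
	.prefix = "user.",
	.get = demo_get,
};

static const struct handler *const handlers[] = {
	&user_handler,
	NULL,
};

int main(void)
{
	const char *name = "user.comment";

	for (const struct handler *const *h = handlers; *h; h++) {
		if (!strncmp(name, (*h)->prefix, strlen((*h)->prefix)))
			(*h)->get(name);
	}
	return 0;
}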
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd78626b..e1a2f6800e01 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_bmap.h" 31#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
42#include "xfs_error.h" 33#include "xfs_error.h"
43#include "xfs_itable.h" 34#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_space.h" 37#include "xfs_trans_space.h"
@@ -64,8 +54,6 @@
64 flush lock - ditto. 54 flush lock - ditto.
65*/ 55*/
66 56
67STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
68
69#ifdef DEBUG 57#ifdef DEBUG
70xfs_buftarg_t *xfs_dqerror_target; 58xfs_buftarg_t *xfs_dqerror_target;
71int xfs_do_dqerror; 59int xfs_do_dqerror;
@@ -101,7 +89,7 @@ xfs_qm_dqinit(
101 * No need to re-initialize these if this is a reclaimed dquot. 89 * No need to re-initialize these if this is a reclaimed dquot.
102 */ 90 */
103 if (brandnewdquot) { 91 if (brandnewdquot) {
104 dqp->dq_flnext = dqp->dq_flprev = dqp; 92 INIT_LIST_HEAD(&dqp->q_freelist);
105 mutex_init(&dqp->q_qlock); 93 mutex_init(&dqp->q_qlock);
106 init_waitqueue_head(&dqp->q_pinwait); 94 init_waitqueue_head(&dqp->q_pinwait);
107 95
@@ -119,20 +107,20 @@ xfs_qm_dqinit(
119 * Only the q_core portion was zeroed in dqreclaim_one(). 107 * Only the q_core portion was zeroed in dqreclaim_one().
120 * So, we need to reset others. 108 * So, we need to reset others.
121 */ 109 */
122 dqp->q_nrefs = 0; 110 dqp->q_nrefs = 0;
123 dqp->q_blkno = 0; 111 dqp->q_blkno = 0;
124 dqp->MPL_NEXT = dqp->HL_NEXT = NULL; 112 INIT_LIST_HEAD(&dqp->q_mplist);
125 dqp->HL_PREVP = dqp->MPL_PREVP = NULL; 113 INIT_LIST_HEAD(&dqp->q_hashlist);
126 dqp->q_bufoffset = 0; 114 dqp->q_bufoffset = 0;
127 dqp->q_fileoffset = 0; 115 dqp->q_fileoffset = 0;
128 dqp->q_transp = NULL; 116 dqp->q_transp = NULL;
129 dqp->q_gdquot = NULL; 117 dqp->q_gdquot = NULL;
130 dqp->q_res_bcount = 0; 118 dqp->q_res_bcount = 0;
131 dqp->q_res_icount = 0; 119 dqp->q_res_icount = 0;
132 dqp->q_res_rtbcount = 0; 120 dqp->q_res_rtbcount = 0;
133 atomic_set(&dqp->q_pincount, 0); 121 atomic_set(&dqp->q_pincount, 0);
134 dqp->q_hash = NULL; 122 dqp->q_hash = NULL;
135 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 123 ASSERT(list_empty(&dqp->q_freelist));
136 124
137 trace_xfs_dqreuse(dqp); 125 trace_xfs_dqreuse(dqp);
138 } 126 }
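This hunk swaps the hand-rolled circular freelist pointers (dq_flnext/dq_flprev pointing at the dquot itself when off-list) for a standard struct list_head, so the emptiness test becomes list_empty() rather than a pointer comparison. A self-contained sketch of the primitives involved (a simplified re-implementation, not <linux/list.h> itself):

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h->prev = h;		/* an empty node points at itself */
}

static int list_empty(const struct list_head *h)
{
	return h->next == h;
}

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);		/* self-linked again, i.e. off-list */
}

struct dquot {
	int id;
	struct list_head freelist;	/* replaces dq_flnext/dq_flprev */
};

int main(void)
{
	struct list_head frlist;
	struct dquot dq = { .id = 1 };

	INIT_LIST_HEAD(&frlist);
	INIT_LIST_HEAD(&dq.freelist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));	/* 0 */

	list_add_tail(&dq.freelist, &frlist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));	/* 1 */

	list_del_init(&dq.freelist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));	/* 0 */
	return 0;
}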
@@ -158,7 +146,7 @@ void
158xfs_qm_dqdestroy( 146xfs_qm_dqdestroy(
159 xfs_dquot_t *dqp) 147 xfs_dquot_t *dqp)
160{ 148{
161 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 149 ASSERT(list_empty(&dqp->q_freelist));
162 150
163 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
164 sv_destroy(&dqp->q_pinwait); 152 sv_destroy(&dqp->q_pinwait);
@@ -252,7 +240,7 @@ xfs_qm_adjust_dqtimers(
252 (be64_to_cpu(d->d_bcount) >= 240 (be64_to_cpu(d->d_bcount) >=
253 be64_to_cpu(d->d_blk_hardlimit)))) { 241 be64_to_cpu(d->d_blk_hardlimit)))) {
254 d->d_btimer = cpu_to_be32(get_seconds() + 242 d->d_btimer = cpu_to_be32(get_seconds() +
255 XFS_QI_BTIMELIMIT(mp)); 243 mp->m_quotainfo->qi_btimelimit);
256 } else { 244 } else {
257 d->d_bwarns = 0; 245 d->d_bwarns = 0;
258 } 246 }
@@ -275,7 +263,7 @@ xfs_qm_adjust_dqtimers(
275 (be64_to_cpu(d->d_icount) >= 263 (be64_to_cpu(d->d_icount) >=
276 be64_to_cpu(d->d_ino_hardlimit)))) { 264 be64_to_cpu(d->d_ino_hardlimit)))) {
277 d->d_itimer = cpu_to_be32(get_seconds() + 265 d->d_itimer = cpu_to_be32(get_seconds() +
278 XFS_QI_ITIMELIMIT(mp)); 266 mp->m_quotainfo->qi_itimelimit);
279 } else { 267 } else {
280 d->d_iwarns = 0; 268 d->d_iwarns = 0;
281 } 269 }
@@ -298,7 +286,7 @@ xfs_qm_adjust_dqtimers(
298 (be64_to_cpu(d->d_rtbcount) >= 286 (be64_to_cpu(d->d_rtbcount) >=
299 be64_to_cpu(d->d_rtb_hardlimit)))) { 287 be64_to_cpu(d->d_rtb_hardlimit)))) {
300 d->d_rtbtimer = cpu_to_be32(get_seconds() + 288 d->d_rtbtimer = cpu_to_be32(get_seconds() +
301 XFS_QI_RTBTIMELIMIT(mp)); 289 mp->m_quotainfo->qi_rtbtimelimit);
302 } else { 290 } else {
303 d->d_rtbwarns = 0; 291 d->d_rtbwarns = 0;
304 } 292 }
@@ -325,6 +313,7 @@ xfs_qm_init_dquot_blk(
325 uint type, 313 uint type,
326 xfs_buf_t *bp) 314 xfs_buf_t *bp)
327{ 315{
316 struct xfs_quotainfo *q = mp->m_quotainfo;
328 xfs_dqblk_t *d; 317 xfs_dqblk_t *d;
329 int curid, i; 318 int curid, i;
330 319
@@ -337,16 +326,16 @@ xfs_qm_init_dquot_blk(
337 /* 326 /*
338 * ID of the first dquot in the block - id's are zero based. 327 * ID of the first dquot in the block - id's are zero based.
339 */ 328 */
340 curid = id - (id % XFS_QM_DQPERBLK(mp)); 329 curid = id - (id % q->qi_dqperchunk);
341 ASSERT(curid >= 0); 330 ASSERT(curid >= 0);
342 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); 331 memset(d, 0, BBTOB(q->qi_dqchunklen));
343 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) 332 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
344 xfs_qm_dqinit_core(curid, type, d); 333 xfs_qm_dqinit_core(curid, type, d);
345 xfs_trans_dquot_buf(tp, bp, 334 xfs_trans_dquot_buf(tp, bp,
346 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 335 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
347 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 336 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
348 XFS_BLI_GDQUOT_BUF))); 337 XFS_BLF_GDQUOT_BUF)));
349 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); 338 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
350} 339}
351 340
352 341
@@ -389,21 +378,14 @@ xfs_qm_dqalloc(
389 return (ESRCH); 378 return (ESRCH);
390 } 379 }
391 380
392 /* 381 xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
393 * xfs_trans_commit normally decrements the vnode ref count
394 * when it unlocks the inode. Since we want to keep the quota
395 * inode around, we bump the vnode ref count now.
396 */
397 IHOLD(quotip);
398
399 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
400 nmaps = 1; 382 nmaps = 1;
401 if ((error = xfs_bmapi(tp, quotip, 383 if ((error = xfs_bmapi(tp, quotip,
402 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, 384 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
403 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, 385 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
404 &firstblock, 386 &firstblock,
405 XFS_QM_DQALLOC_SPACE_RES(mp), 387 XFS_QM_DQALLOC_SPACE_RES(mp),
406 &map, &nmaps, &flist, NULL))) { 388 &map, &nmaps, &flist))) {
407 goto error0; 389 goto error0;
408 } 390 }
409 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); 391 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -419,7 +401,7 @@ xfs_qm_dqalloc(
419 /* now we can just get the buffer (there's nothing to read yet) */ 401 /* now we can just get the buffer (there's nothing to read yet) */
420 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 402 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
421 dqp->q_blkno, 403 dqp->q_blkno,
422 XFS_QI_DQCHUNKLEN(mp), 404 mp->m_quotainfo->qi_dqchunklen,
423 0); 405 0);
424 if (!bp || (error = XFS_BUF_GETERROR(bp))) 406 if (!bp || (error = XFS_BUF_GETERROR(bp)))
425 goto error1; 407 goto error1;
@@ -500,7 +482,8 @@ xfs_qm_dqtobp(
500 */ 482 */
501 if (dqp->q_blkno == (xfs_daddr_t) 0) { 483 if (dqp->q_blkno == (xfs_daddr_t) 0) {
502 /* We use the id as an index */ 484 /* We use the id as an index */
503 dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); 485 dqp->q_fileoffset = (xfs_fileoff_t)id /
486 mp->m_quotainfo->qi_dqperchunk;
504 nmaps = 1; 487 nmaps = 1;
505 quotip = XFS_DQ_TO_QIP(dqp); 488 quotip = XFS_DQ_TO_QIP(dqp);
506 xfs_ilock(quotip, XFS_ILOCK_SHARED); 489 xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -518,7 +501,7 @@ xfs_qm_dqtobp(
518 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 501 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
519 XFS_DQUOT_CLUSTER_SIZE_FSB, 502 XFS_DQUOT_CLUSTER_SIZE_FSB,
520 XFS_BMAPI_METADATA, 503 XFS_BMAPI_METADATA,
521 NULL, 0, &map, &nmaps, NULL, NULL); 504 NULL, 0, &map, &nmaps, NULL);
522 505
523 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 506 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
524 if (error) 507 if (error)
@@ -529,7 +512,7 @@ xfs_qm_dqtobp(
529 /* 512 /*
530 * offset of dquot in the (fixed sized) dquot chunk. 513 * offset of dquot in the (fixed sized) dquot chunk.
531 */ 514 */
532 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * 515 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
533 sizeof(xfs_dqblk_t); 516 sizeof(xfs_dqblk_t);
534 if (map.br_startblock == HOLESTARTBLOCK) { 517 if (map.br_startblock == HOLESTARTBLOCK) {
535 /* 518 /*
@@ -559,15 +542,13 @@ xfs_qm_dqtobp(
559 * Read in the buffer, unless we've just done the allocation 542 * Read in the buffer, unless we've just done the allocation
560 * (in which case we already have the buf). 543 * (in which case we already have the buf).
561 */ 544 */
562 if (! newdquot) { 545 if (!newdquot) {
563 trace_xfs_dqtobp_read(dqp); 546 trace_xfs_dqtobp_read(dqp);
564 547
565 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 548 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
566 dqp->q_blkno, 549 dqp->q_blkno,
567 XFS_QI_DQCHUNKLEN(mp), 550 mp->m_quotainfo->qi_dqchunklen,
568 0, &bp))) { 551 0, &bp);
569 return (error);
570 }
571 if (error || !bp) 552 if (error || !bp)
572 return XFS_ERROR(error); 553 return XFS_ERROR(error);
573 } 554 }
@@ -689,14 +670,14 @@ xfs_qm_idtodq(
689 tp = NULL; 670 tp = NULL;
690 if (flags & XFS_QMOPT_DQALLOC) { 671 if (flags & XFS_QMOPT_DQALLOC) {
691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 672 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
692 if ((error = xfs_trans_reserve(tp, 673 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
693 XFS_QM_DQALLOC_SPACE_RES(mp), 674 XFS_WRITE_LOG_RES(mp) +
694 XFS_WRITE_LOG_RES(mp) + 675 BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
695 BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + 676 128,
696 128, 677 0,
697 0, 678 XFS_TRANS_PERM_LOG_RES,
698 XFS_TRANS_PERM_LOG_RES, 679 XFS_WRITE_LOG_COUNT);
699 XFS_WRITE_LOG_COUNT))) { 680 if (error) {
700 cancelflags = 0; 681 cancelflags = 0;
701 goto error0; 682 goto error0;
702 } 683 }
@@ -751,7 +732,6 @@ xfs_qm_dqlookup(
751{ 732{
752 xfs_dquot_t *dqp; 733 xfs_dquot_t *dqp;
753 uint flist_locked; 734 uint flist_locked;
754 xfs_dquot_t *d;
755 735
756 ASSERT(mutex_is_locked(&qh->qh_lock)); 736 ASSERT(mutex_is_locked(&qh->qh_lock));
757 737
@@ -760,7 +740,7 @@ xfs_qm_dqlookup(
760 /* 740 /*
761 * Traverse the hashchain looking for a match 741 * Traverse the hashchain looking for a match
762 */ 742 */
763 for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { 743 list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
764 /* 744 /*
765 * We already have the hashlock. We don't need the 745 * We already have the hashlock. We don't need the
766 * dqlock to look at the id field of the dquot, since the 746 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +752,12 @@ xfs_qm_dqlookup(
772 /* 752 /*
773 * All in core dquots must be on the dqlist of mp 753 * All in core dquots must be on the dqlist of mp
774 */ 754 */
775 ASSERT(dqp->MPL_PREVP != NULL); 755 ASSERT(!list_empty(&dqp->q_mplist));
776 756
777 xfs_dqlock(dqp); 757 xfs_dqlock(dqp);
778 if (dqp->q_nrefs == 0) { 758 if (dqp->q_nrefs == 0) {
779 ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); 759 ASSERT(!list_empty(&dqp->q_freelist));
780 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 760 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
781 trace_xfs_dqlookup_want(dqp); 761 trace_xfs_dqlookup_want(dqp);
782 762
783 /* 763 /*
@@ -787,7 +767,7 @@ xfs_qm_dqlookup(
787 */ 767 */
788 dqp->dq_flags |= XFS_DQ_WANT; 768 dqp->dq_flags |= XFS_DQ_WANT;
789 xfs_dqunlock(dqp); 769 xfs_dqunlock(dqp);
790 xfs_qm_freelist_lock(xfs_Gqm); 770 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
791 xfs_dqlock(dqp); 771 xfs_dqlock(dqp);
792 dqp->dq_flags &= ~(XFS_DQ_WANT); 772 dqp->dq_flags &= ~(XFS_DQ_WANT);
793 } 773 }
@@ -802,46 +782,28 @@ xfs_qm_dqlookup(
802 782
803 if (flist_locked) { 783 if (flist_locked) {
804 if (dqp->q_nrefs != 0) { 784 if (dqp->q_nrefs != 0) {
805 xfs_qm_freelist_unlock(xfs_Gqm); 785 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
806 flist_locked = B_FALSE; 786 flist_locked = B_FALSE;
807 } else { 787 } else {
808 /* 788 /* take it off the freelist */
809 * take it off the freelist
810 */
811 trace_xfs_dqlookup_freelist(dqp); 789 trace_xfs_dqlookup_freelist(dqp);
812 XQM_FREELIST_REMOVE(dqp); 790 list_del_init(&dqp->q_freelist);
813 /* xfs_qm_freelist_print(&(xfs_Gqm-> 791 xfs_Gqm->qm_dqfrlist_cnt--;
814 qm_dqfreelist),
815 "after removal"); */
816 } 792 }
817 } 793 }
818 794
819 /*
820 * grab a reference
821 */
822 XFS_DQHOLD(dqp); 795 XFS_DQHOLD(dqp);
823 796
824 if (flist_locked) 797 if (flist_locked)
825 xfs_qm_freelist_unlock(xfs_Gqm); 798 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
826 /* 799 /*
827 * move the dquot to the front of the hashchain 800 * move the dquot to the front of the hashchain
828 */ 801 */
829 ASSERT(mutex_is_locked(&qh->qh_lock)); 802 ASSERT(mutex_is_locked(&qh->qh_lock));
830 if (dqp->HL_PREVP != &qh->qh_next) { 803 list_move(&dqp->q_hashlist, &qh->qh_list);
831 trace_xfs_dqlookup_move(dqp);
832 if ((d = dqp->HL_NEXT))
833 d->HL_PREVP = dqp->HL_PREVP;
834 *(dqp->HL_PREVP) = d;
835 d = qh->qh_next;
836 d->HL_PREVP = &dqp->HL_NEXT;
837 dqp->HL_NEXT = d;
838 dqp->HL_PREVP = &qh->qh_next;
839 qh->qh_next = dqp;
840 }
841 trace_xfs_dqlookup_done(dqp); 804 trace_xfs_dqlookup_done(dqp);
842 *O_dqpp = dqp; 805 *O_dqpp = dqp;
843 ASSERT(mutex_is_locked(&qh->qh_lock)); 806 return 0;
844 return (0);
845 } 807 }
846 } 808 }
847 809
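The lookup rewrite above replaces the open-coded HL_NEXT/HL_PREVP hash-chain walk and the eight-line move-to-front splice with list_for_each_entry() and a single list_move(). A compilable sketch of both (uses the GCC typeof extension, as the kernel macros do; simplified, hypothetical types):

#include <stdio.h>
#include <stddef.h>

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Walk a chain of containing structures through their embedded node. */
#define list_for_each_entry(pos, head, member)				\
	for (pos = container_of((head)->next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = container_of(pos->member.next, typeof(*pos), member))

static void list_del(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

/* The one-liner that replaces the pointer surgery: unlink, then
 * reinsert at the front. */
static void list_move(struct list_head *n, struct list_head *h)
{
	list_del(n);
	list_add(n, h);
}

struct dquot {
	int id;
	struct list_head hashlist;
};

int main(void)
{
	struct list_head chain = LIST_HEAD_INIT(chain);
	struct dquot a = { .id = 1 }, b = { .id = 2 };
	struct dquot *dqp;

	list_add(&a.hashlist, &chain);
	list_add(&b.hashlist, &chain);		/* chain: 2, 1 */

	list_for_each_entry(dqp, &chain, hashlist)
		if (dqp->id == 1)
			break;
	list_move(&dqp->hashlist, &chain);	/* chain: 1, 2 */

	list_for_each_entry(dqp, &chain, hashlist)
		printf("id %d\n", dqp->id);	/* 1, then 2 */
	return 0;
}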
@@ -975,16 +937,17 @@ xfs_qm_dqget(
975 */ 937 */
976 if (ip) { 938 if (ip) {
977 xfs_ilock(ip, XFS_ILOCK_EXCL); 939 xfs_ilock(ip, XFS_ILOCK_EXCL);
978 if (! XFS_IS_DQTYPE_ON(mp, type)) { 940
979 /* inode stays locked on return */
980 xfs_qm_dqdestroy(dqp);
981 return XFS_ERROR(ESRCH);
982 }
983 /* 941 /*
984 * A dquot could be attached to this inode by now, since 942 * A dquot could be attached to this inode by now, since
985 * we had dropped the ilock. 943 * we had dropped the ilock.
986 */ 944 */
987 if (type == XFS_DQ_USER) { 945 if (type == XFS_DQ_USER) {
946 if (!XFS_IS_UQUOTA_ON(mp)) {
947 /* inode stays locked on return */
948 xfs_qm_dqdestroy(dqp);
949 return XFS_ERROR(ESRCH);
950 }
988 if (ip->i_udquot) { 951 if (ip->i_udquot) {
989 xfs_qm_dqdestroy(dqp); 952 xfs_qm_dqdestroy(dqp);
990 dqp = ip->i_udquot; 953 dqp = ip->i_udquot;
@@ -992,6 +955,11 @@ xfs_qm_dqget(
992 goto dqret; 955 goto dqret;
993 } 956 }
994 } else { 957 } else {
958 if (!XFS_IS_OQUOTA_ON(mp)) {
959 /* inode stays locked on return */
960 xfs_qm_dqdestroy(dqp);
961 return XFS_ERROR(ESRCH);
962 }
995 if (ip->i_gdquot) { 963 if (ip->i_gdquot) {
996 xfs_qm_dqdestroy(dqp); 964 xfs_qm_dqdestroy(dqp);
997 dqp = ip->i_gdquot; 965 dqp = ip->i_gdquot;
@@ -1033,13 +1001,14 @@ xfs_qm_dqget(
1033 */ 1001 */
1034 ASSERT(mutex_is_locked(&h->qh_lock)); 1002 ASSERT(mutex_is_locked(&h->qh_lock));
1035 dqp->q_hash = h; 1003 dqp->q_hash = h;
1036 XQM_HASHLIST_INSERT(h, dqp); 1004 list_add(&dqp->q_hashlist, &h->qh_list);
1005 h->qh_version++;
1037 1006
1038 /* 1007 /*
1039 * Attach this dquot to this filesystem's list of all dquots, 1008 * Attach this dquot to this filesystem's list of all dquots,
1040 * kept inside the mount structure in m_quotainfo field 1009 * kept inside the mount structure in m_quotainfo field
1041 */ 1010 */
1042 xfs_qm_mplist_lock(mp); 1011 mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
1043 1012
1044 /* 1013 /*
1045 * We return a locked dquot to the caller, with a reference taken 1014 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1016,9 @@ xfs_qm_dqget(
1047 xfs_dqlock(dqp); 1016 xfs_dqlock(dqp);
1048 dqp->q_nrefs = 1; 1017 dqp->q_nrefs = 1;
1049 1018
1050 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); 1019 list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
1051 1020 mp->m_quotainfo->qi_dquots++;
1052 xfs_qm_mplist_unlock(mp); 1021 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1053 mutex_unlock(&h->qh_lock); 1022 mutex_unlock(&h->qh_lock);
1054 dqret: 1023 dqret:
1055 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1024 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
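Note the h->qh_version++ that now accompanies the list_add(): the hash chain carries a version stamp so a scanner that had to drop the bucket lock can detect that the chain changed underneath it and restart, instead of trusting a stale cursor. The idea in miniature (hypothetical names, locking elided):

#include <stdio.h>

struct hash_bucket {
	unsigned int version;	/* bumped on every insert/remove */
};

int main(void)
{
	struct hash_bucket b = { .version = 0 };

	/* Scanner: sample the version, then drop the bucket lock to do
	 * blocking work. */
	unsigned int seen = b.version;

	b.version++;		/* meanwhile, another thread inserts */

	/* Back under the lock: a mismatch means any remembered chain
	 * position is stale and the walk must restart from the head. */
	if (b.version != seen)
		printf("chain changed, restart scan\n");
	return 0;
}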
@@ -1086,10 +1055,10 @@ xfs_qm_dqput(
1086 * drop the dqlock and acquire the freelist and dqlock 1055 * drop the dqlock and acquire the freelist and dqlock
1087 * in the right order; but try to get it out-of-order first 1056 * in the right order; but try to get it out-of-order first
1088 */ 1057 */
1089 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 1058 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
1090 trace_xfs_dqput_wait(dqp); 1059 trace_xfs_dqput_wait(dqp);
1091 xfs_dqunlock(dqp); 1060 xfs_dqunlock(dqp);
1092 xfs_qm_freelist_lock(xfs_Gqm); 1061 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1093 xfs_dqlock(dqp); 1062 xfs_dqlock(dqp);
1094 } 1063 }
1095 1064
@@ -1100,10 +1069,8 @@ xfs_qm_dqput(
1100 if (--dqp->q_nrefs == 0) { 1069 if (--dqp->q_nrefs == 0) {
1101 trace_xfs_dqput_free(dqp); 1070 trace_xfs_dqput_free(dqp);
1102 1071
1103 /* 1072 list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
1104 * insert at end of the freelist. 1073 xfs_Gqm->qm_dqfrlist_cnt++;
1105 */
1106 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1107 1074
1108 /* 1075 /*
1109 * If we just added a udquot to the freelist, then 1076 * If we just added a udquot to the freelist, then
@@ -1118,10 +1085,6 @@ xfs_qm_dqput(
1118 xfs_dqlock(gdqp); 1085 xfs_dqlock(gdqp);
1119 dqp->q_gdquot = NULL; 1086 dqp->q_gdquot = NULL;
1120 } 1087 }
1121
1122 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1123 "@@@@@++ Free list (after append) @@@@@+");
1124 */
1125 } 1088 }
1126 xfs_dqunlock(dqp); 1089 xfs_dqunlock(dqp);
1127 1090
@@ -1133,7 +1096,7 @@ xfs_qm_dqput(
1133 break; 1096 break;
1134 dqp = gdqp; 1097 dqp = gdqp;
1135 } 1098 }
1136 xfs_qm_freelist_unlock(xfs_Gqm); 1099 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1137} 1100}
1138 1101
1139/* 1102/*
@@ -1159,6 +1122,46 @@ xfs_qm_dqrele(
1159 xfs_qm_dqput(dqp); 1122 xfs_qm_dqput(dqp);
1160} 1123}
1161 1124
1125/*
1126 * This is the dquot flushing I/O completion routine. It is called
1127 * from interrupt level when the buffer containing the dquot is
1128 * flushed to disk. It is responsible for removing the dquot logitem
1129 * from the AIL if it has not been re-logged, and unlocking the dquot's
1130 * flush lock. This behavior is very similar to that of inodes..
 1130 * flush lock. This behavior is very similar to that of inodes.
1131 */
1132STATIC void
1133xfs_qm_dqflush_done(
1134 struct xfs_buf *bp,
1135 struct xfs_log_item *lip)
1136{
1137 xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
1138 xfs_dquot_t *dqp = qip->qli_dquot;
1139 struct xfs_ail *ailp = lip->li_ailp;
1140
1141 /*
1142 * We only want to pull the item from the AIL if its
1143 * location in the log has not changed since we started the flush.
1144 * Thus, we only bother if the dquot's lsn has
1145 * not changed. First we check the lsn outside the lock
1146 * since it's cheaper, and then we recheck while
1147 * holding the lock before removing the dquot from the AIL.
1148 */
1149 if ((lip->li_flags & XFS_LI_IN_AIL) &&
1150 lip->li_lsn == qip->qli_flush_lsn) {
1151
1152 /* xfs_trans_ail_delete() drops the AIL lock. */
1153 spin_lock(&ailp->xa_lock);
1154 if (lip->li_lsn == qip->qli_flush_lsn)
1155 xfs_trans_ail_delete(ailp, lip);
1156 else
1157 spin_unlock(&ailp->xa_lock);
1158 }
1159
1160 /*
1161 * Release the dq's flush lock since we're done with it.
1162 */
1163 xfs_dqfunlock(dqp);
1164}
1162 1165
1163/* 1166/*
1164 * Write a modified dquot to disk. 1167 * Write a modified dquot to disk.
@@ -1240,8 +1243,9 @@ xfs_qm_dqflush(
1240 * Attach an iodone routine so that we can remove this dquot from the 1243 * Attach an iodone routine so that we can remove this dquot from the
1241 * AIL and release the flush lock once the dquot is synced to disk. 1244 * AIL and release the flush lock once the dquot is synced to disk.
1242 */ 1245 */
1243 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *)) 1246 xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
1244 xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item)); 1247 &dqp->q_logitem.qli_item);
1248
1245 /* 1249 /*
1246 * If the buffer is pinned then push on the log so we won't 1250 * If the buffer is pinned then push on the log so we won't
1247 * get stuck waiting in the write for too long. 1251 * get stuck waiting in the write for too long.
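The relocated xfs_qm_dqflush_done() above keeps the classic two-step AIL removal: peek at the lsn without the AIL lock, since that is cheap and usually disqualifying, then retest under the lock before deleting. A minimal sketch of that check/recheck shape using pthreads (illustrative only, not the XFS locking rules):

#include <stdio.h>
#include <pthread.h>

struct item {
	pthread_mutex_t lock;
	long lsn;
	int in_ail;
};

/* Test the condition outside the lock first, then re-test under the
 * lock before acting on it. */
static void maybe_remove(struct item *it, long flush_lsn)
{
	if (it->in_ail && it->lsn == flush_lsn) {	/* unlocked peek */
		pthread_mutex_lock(&it->lock);
		if (it->lsn == flush_lsn) {		/* locked recheck */
			it->in_ail = 0;
			printf("removed at lsn %ld\n", it->lsn);
		}
		pthread_mutex_unlock(&it->lock);
	}
}

int main(void)
{
	struct item it = { PTHREAD_MUTEX_INITIALIZER, 42, 1 };

	maybe_remove(&it, 42);
	return 0;
}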
@@ -1265,50 +1269,6 @@ xfs_qm_dqflush(
1265 1269
1266} 1270}
1267 1271
1268/*
1269 * This is the dquot flushing I/O completion routine. It is called
1270 * from interrupt level when the buffer containing the dquot is
1271 * flushed to disk. It is responsible for removing the dquot logitem
1272 * from the AIL if it has not been re-logged, and unlocking the dquot's
1273 * flush lock. This behavior is very similar to that of inodes..
1274 */
1275/*ARGSUSED*/
1276STATIC void
1277xfs_qm_dqflush_done(
1278 xfs_buf_t *bp,
1279 xfs_dq_logitem_t *qip)
1280{
1281 xfs_dquot_t *dqp;
1282 struct xfs_ail *ailp;
1283
1284 dqp = qip->qli_dquot;
1285 ailp = qip->qli_item.li_ailp;
1286
1287 /*
1288 * We only want to pull the item from the AIL if its
1289 * location in the log has not changed since we started the flush.
1290 * Thus, we only bother if the dquot's lsn has
1291 * not changed. First we check the lsn outside the lock
1292 * since it's cheaper, and then we recheck while
1293 * holding the lock before removing the dquot from the AIL.
1294 */
1295 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1296 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1297
1298 /* xfs_trans_ail_delete() drops the AIL lock. */
1299 spin_lock(&ailp->xa_lock);
1300 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1301 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
1302 else
1303 spin_unlock(&ailp->xa_lock);
1304 }
1305
1306 /*
1307 * Release the dq's flush lock since we're done with it.
1308 */
1309 xfs_dqfunlock(dqp);
1310}
1311
1312int 1272int
1313xfs_qm_dqlock_nowait( 1273xfs_qm_dqlock_nowait(
1314 xfs_dquot_t *dqp) 1274 xfs_dquot_t *dqp)
@@ -1386,10 +1346,10 @@ int
1386xfs_qm_dqpurge( 1346xfs_qm_dqpurge(
1387 xfs_dquot_t *dqp) 1347 xfs_dquot_t *dqp)
1388{ 1348{
1389 xfs_dqhash_t *thishash; 1349 xfs_dqhash_t *qh = dqp->q_hash;
1390 xfs_mount_t *mp = dqp->q_mount; 1350 xfs_mount_t *mp = dqp->q_mount;
1391 1351
1392 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1352 ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
1393 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); 1353 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
1394 1354
1395 xfs_dqlock(dqp); 1355 xfs_dqlock(dqp);
@@ -1407,7 +1367,7 @@ xfs_qm_dqpurge(
1407 return (1); 1367 return (1);
1408 } 1368 }
1409 1369
1410 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1370 ASSERT(!list_empty(&dqp->q_freelist));
1411 1371
1412 /* 1372 /*
1413 * If we're turning off quotas, we have to make sure that, for 1373 * If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1412,16 @@ xfs_qm_dqpurge(
1452 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1412 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1453 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1413 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1454 1414
1455 thishash = dqp->q_hash; 1415 list_del_init(&dqp->q_hashlist);
1456 XQM_HASHLIST_REMOVE(thishash, dqp); 1416 qh->qh_version++;
1457 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); 1417 list_del_init(&dqp->q_mplist);
1418 mp->m_quotainfo->qi_dqreclaims++;
1419 mp->m_quotainfo->qi_dquots--;
1458 /* 1420 /*
1459 * XXX Move this to the front of the freelist, if we can get the 1421 * XXX Move this to the front of the freelist, if we can get the
1460 * freelist lock. 1422 * freelist lock.
1461 */ 1423 */
1462 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1424 ASSERT(!list_empty(&dqp->q_freelist));
1463 1425
1464 dqp->q_mount = NULL; 1426 dqp->q_mount = NULL;
1465 dqp->q_hash = NULL; 1427 dqp->q_hash = NULL;
@@ -1467,7 +1429,7 @@ xfs_qm_dqpurge(
1467 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 1429 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1468 xfs_dqfunlock(dqp); 1430 xfs_dqfunlock(dqp);
1469 xfs_dqunlock(dqp); 1431 xfs_dqunlock(dqp);
1470 mutex_unlock(&thishash->qh_lock); 1432 mutex_unlock(&qh->qh_lock);
1471 return (0); 1433 return (0);
1472} 1434}
1473 1435
@@ -1517,6 +1479,7 @@ void
1517xfs_qm_dqflock_pushbuf_wait( 1479xfs_qm_dqflock_pushbuf_wait(
1518 xfs_dquot_t *dqp) 1480 xfs_dquot_t *dqp)
1519{ 1481{
1482 xfs_mount_t *mp = dqp->q_mount;
1520 xfs_buf_t *bp; 1483 xfs_buf_t *bp;
1521 1484
1522 /* 1485 /*
@@ -1525,14 +1488,14 @@ xfs_qm_dqflock_pushbuf_wait(
1525 * out immediately. We'll be able to acquire 1488 * out immediately. We'll be able to acquire
1526 * the flush lock when the I/O completes. 1489 * the flush lock when the I/O completes.
1527 */ 1490 */
1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1491 bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); 1492 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
1530 if (!bp) 1493 if (!bp)
1531 goto out_lock; 1494 goto out_lock;
1532 1495
1533 if (XFS_BUF_ISDELAYWRITE(bp)) { 1496 if (XFS_BUF_ISDELAYWRITE(bp)) {
1534 if (XFS_BUF_ISPINNED(bp)) 1497 if (XFS_BUF_ISPINNED(bp))
1535 xfs_log_force(dqp->q_mount, 0); 1498 xfs_log_force(mp, 0);
1536 xfs_buf_delwri_promote(bp); 1499 xfs_buf_delwri_promote(bp);
1537 wake_up_process(bp->b_target->bt_task); 1500 wake_up_process(bp->b_target->bt_task);
1538 } 1501 }
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
33 * The hash chain headers (hash buckets) 33 * The hash chain headers (hash buckets)
34 */ 34 */
35typedef struct xfs_dqhash { 35typedef struct xfs_dqhash {
36 struct xfs_dquot *qh_next; 36 struct list_head qh_list;
37 struct mutex qh_lock; 37 struct mutex qh_lock;
38 uint qh_version; /* ever increasing version */ 38 uint qh_version; /* ever increasing version */
39 uint qh_nelems; /* number of dquots on the list */ 39 uint qh_nelems; /* number of dquots on the list */
40} xfs_dqhash_t; 40} xfs_dqhash_t;
41 41
42typedef struct xfs_dqlink {
43 struct xfs_dquot *ql_next; /* forward link */
44 struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
45} xfs_dqlink_t;
46
47struct xfs_mount; 42struct xfs_mount;
48struct xfs_trans; 43struct xfs_trans;
49 44
50/* 45/*
51 * This is the marker which is designed to occupy the first few
52 * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
53 * must come first.
54 * This serves as the marker ("sentinel") when we have to restart list
55 * iterations because of locking considerations.
56 */
57typedef struct xfs_dqmarker {
58 struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
59 struct xfs_dquot*dqm_flprev;
60 xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
61 xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
62 uint dqm_flags; /* various flags (XFS_DQ_*) */
63} xfs_dqmarker_t;
64
65/*
66 * The incore dquot structure 46 * The incore dquot structure
67 */ 47 */
68typedef struct xfs_dquot { 48typedef struct xfs_dquot {
69 xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ 49 uint dq_flags; /* various flags (XFS_DQ_*) */
50 struct list_head q_freelist; /* global free list of dquots */
51 struct list_head q_mplist; /* mount's list of dquots */
 52 struct list_head q_hashlist; /* global hash list of dquots */
70 xfs_dqhash_t *q_hash; /* the hashchain header */ 53 xfs_dqhash_t *q_hash; /* the hashchain header */
71 struct xfs_mount*q_mount; /* filesystem this relates to */ 54 struct xfs_mount*q_mount; /* filesystem this relates to */
72 struct xfs_trans*q_transp; /* trans this belongs to currently */ 55 struct xfs_trans*q_transp; /* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 70 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88} xfs_dquot_t; 71} xfs_dquot_t;
89 72
90
91#define dq_flnext q_lists.dqm_flnext
92#define dq_flprev q_lists.dqm_flprev
93#define dq_mplist q_lists.dqm_mplist
94#define dq_hashlist q_lists.dqm_hashlist
95#define dq_flags q_lists.dqm_flags
96
97/* 73/*
98 * Lock hierarchy for q_qlock: 74 * Lock hierarchy for q_qlock:
99 * XFS_QLOCK_NORMAL is the implicit default, 75 * XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
127} 103}
128 104
129#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) 105#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
130#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
131#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 106#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
132#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 107#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
133#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 108#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
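Besides embedding the three list_heads directly, the header change deletes the #define aliases (dq_flnext, dq_flags, ...) that used to paper over the xfs_dqmarker indirection. Such aliases are preprocessor-wide: they rewrite every occurrence of the identifier in every file that includes the header, not just accesses through an xfs_dquot. A small demonstration of the hazard:

#include <stdio.h>

struct marker {
	unsigned int flags;
};

struct dquot {
	struct marker q_lists;
};

/* The old header's approach: alias a nested field via the preprocessor. */
#define dq_flags q_lists.flags

int main(void)
{
	struct dquot dq = { .q_lists = { .flags = 3 } };

	/* Reads like a direct member, is really dq.q_lists.flags: */
	printf("%u\n", dq.dq_flags);

	/* The hazard: the macro rewrites every occurrence of the
	 * identifier in every translation unit that includes the
	 * header, so a local variable or an unrelated struct member
	 * named dq_flags would be silently mangled too.  Promoting
	 * the fields to real members removes the trap. */
	return 0;
}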
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a57194..2a1f3dc10a02 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,42 +23,36 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_bmap.h" 31#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
42#include "xfs_error.h" 33#include "xfs_error.h"
43#include "xfs_itable.h" 34#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_priv.h" 37#include "xfs_trans_priv.h"
48#include "xfs_qm.h" 38#include "xfs_qm.h"
49 39
40static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
41{
42 return container_of(lip, struct xfs_dq_logitem, qli_item);
43}
44
50/* 45/*
51 * returns the number of iovecs needed to log the given dquot item. 46 * returns the number of iovecs needed to log the given dquot item.
52 */ 47 */
53/* ARGSUSED */
54STATIC uint 48STATIC uint
55xfs_qm_dquot_logitem_size( 49xfs_qm_dquot_logitem_size(
56 xfs_dq_logitem_t *logitem) 50 struct xfs_log_item *lip)
57{ 51{
58 /* 52 /*
59 * we need only two iovecs, one for the format, one for the real thing 53 * we need only two iovecs, one for the format, one for the real thing
60 */ 54 */
61 return (2); 55 return 2;
62} 56}
63 57
64/* 58/*
@@ -66,22 +60,21 @@ xfs_qm_dquot_logitem_size(
66 */ 60 */
67STATIC void 61STATIC void
68xfs_qm_dquot_logitem_format( 62xfs_qm_dquot_logitem_format(
69 xfs_dq_logitem_t *logitem, 63 struct xfs_log_item *lip,
70 xfs_log_iovec_t *logvec) 64 struct xfs_log_iovec *logvec)
71{ 65{
72 ASSERT(logitem); 66 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
73 ASSERT(logitem->qli_dquot);
74 67
75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; 68 logvec->i_addr = &qlip->qli_format;
76 logvec->i_len = sizeof(xfs_dq_logformat_t); 69 logvec->i_len = sizeof(xfs_dq_logformat_t);
77 logvec->i_type = XLOG_REG_TYPE_QFORMAT; 70 logvec->i_type = XLOG_REG_TYPE_QFORMAT;
78 logvec++; 71 logvec++;
79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; 72 logvec->i_addr = &qlip->qli_dquot->q_core;
80 logvec->i_len = sizeof(xfs_disk_dquot_t); 73 logvec->i_len = sizeof(xfs_disk_dquot_t);
81 logvec->i_type = XLOG_REG_TYPE_DQUOT; 74 logvec->i_type = XLOG_REG_TYPE_DQUOT;
82 75
83 ASSERT(2 == logitem->qli_item.li_desc->lid_size); 76 ASSERT(2 == lip->li_desc->lid_size);
84 logitem->qli_format.qlf_size = 2; 77 qlip->qli_format.qlf_size = 2;
85 78
86} 79}
87 80
@@ -90,9 +83,9 @@ xfs_qm_dquot_logitem_format(
90 */ 83 */
91STATIC void 84STATIC void
92xfs_qm_dquot_logitem_pin( 85xfs_qm_dquot_logitem_pin(
93 xfs_dq_logitem_t *logitem) 86 struct xfs_log_item *lip)
94{ 87{
95 xfs_dquot_t *dqp = logitem->qli_dquot; 88 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
96 89
97 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 90 ASSERT(XFS_DQ_IS_LOCKED(dqp));
98 atomic_inc(&dqp->q_pincount); 91 atomic_inc(&dqp->q_pincount);
@@ -104,28 +97,18 @@ xfs_qm_dquot_logitem_pin(
104 * dquot must have been previously pinned with a call to 97 * dquot must have been previously pinned with a call to
105 * xfs_qm_dquot_logitem_pin(). 98 * xfs_qm_dquot_logitem_pin().
106 */ 99 */
107/* ARGSUSED */
108STATIC void 100STATIC void
109xfs_qm_dquot_logitem_unpin( 101xfs_qm_dquot_logitem_unpin(
110 xfs_dq_logitem_t *logitem, 102 struct xfs_log_item *lip,
111 int stale) 103 int remove)
112{ 104{
113 xfs_dquot_t *dqp = logitem->qli_dquot; 105 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
114 106
115 ASSERT(atomic_read(&dqp->q_pincount) > 0); 107 ASSERT(atomic_read(&dqp->q_pincount) > 0);
116 if (atomic_dec_and_test(&dqp->q_pincount)) 108 if (atomic_dec_and_test(&dqp->q_pincount))
117 wake_up(&dqp->q_pinwait); 109 wake_up(&dqp->q_pinwait);
118} 110}
119 111
120/* ARGSUSED */
121STATIC void
122xfs_qm_dquot_logitem_unpin_remove(
123 xfs_dq_logitem_t *logitem,
124 xfs_trans_t *tp)
125{
126 xfs_qm_dquot_logitem_unpin(logitem, 0);
127}
128
129/* 112/*
130 * Given the logitem, this writes the corresponding dquot entry to disk 113 * Given the logitem, this writes the corresponding dquot entry to disk
131 * asynchronously. This is called with the dquot entry securely locked; 114 * asynchronously. This is called with the dquot entry securely locked;
@@ -134,12 +117,10 @@ xfs_qm_dquot_logitem_unpin_remove(
134 */ 117 */
135STATIC void 118STATIC void
136xfs_qm_dquot_logitem_push( 119xfs_qm_dquot_logitem_push(
137 xfs_dq_logitem_t *logitem) 120 struct xfs_log_item *lip)
138{ 121{
139 xfs_dquot_t *dqp; 122 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
140 int error; 123 int error;
141
142 dqp = logitem->qli_dquot;
143 124
144 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 125 ASSERT(XFS_DQ_IS_LOCKED(dqp));
145 ASSERT(!completion_done(&dqp->q_flush)); 126 ASSERT(!completion_done(&dqp->q_flush));
@@ -161,27 +142,25 @@ xfs_qm_dquot_logitem_push(
161 xfs_dqunlock(dqp); 142 xfs_dqunlock(dqp);
162} 143}
163 144
164/*ARGSUSED*/
165STATIC xfs_lsn_t 145STATIC xfs_lsn_t
166xfs_qm_dquot_logitem_committed( 146xfs_qm_dquot_logitem_committed(
167 xfs_dq_logitem_t *l, 147 struct xfs_log_item *lip,
168 xfs_lsn_t lsn) 148 xfs_lsn_t lsn)
169{ 149{
170 /* 150 /*
171 * We always re-log the entire dquot when it becomes dirty, 151 * We always re-log the entire dquot when it becomes dirty,
172 * so, the latest copy _is_ the only one that matters. 152 * so, the latest copy _is_ the only one that matters.
173 */ 153 */
174 return (lsn); 154 return lsn;
175} 155}
176 156
177
178/* 157/*
179 * This is called to wait for the given dquot to be unpinned. 158 * This is called to wait for the given dquot to be unpinned.
180 * Most of these pin/unpin routines are plagiarized from inode code. 159 * Most of these pin/unpin routines are plagiarized from inode code.
181 */ 160 */
182void 161void
183xfs_qm_dqunpin_wait( 162xfs_qm_dqunpin_wait(
184 xfs_dquot_t *dqp) 163 struct xfs_dquot *dqp)
185{ 164{
186 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 165 ASSERT(XFS_DQ_IS_LOCKED(dqp));
187 if (atomic_read(&dqp->q_pincount) == 0) 166 if (atomic_read(&dqp->q_pincount) == 0)
@@ -207,13 +186,12 @@ xfs_qm_dqunpin_wait(
207 */ 186 */
208STATIC void 187STATIC void
209xfs_qm_dquot_logitem_pushbuf( 188xfs_qm_dquot_logitem_pushbuf(
210 xfs_dq_logitem_t *qip) 189 struct xfs_log_item *lip)
211{ 190{
212 xfs_dquot_t *dqp; 191 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
213 xfs_mount_t *mp; 192 struct xfs_dquot *dqp = qlip->qli_dquot;
214 xfs_buf_t *bp; 193 struct xfs_buf *bp;
215 194
216 dqp = qip->qli_dquot;
217 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 195 ASSERT(XFS_DQ_IS_LOCKED(dqp));
218 196
219 /* 197 /*
@@ -221,22 +199,20 @@ xfs_qm_dquot_logitem_pushbuf(
221 * inode flush completed and the inode was taken off the AIL. 199 * inode flush completed and the inode was taken off the AIL.
222 * So, just get out. 200 * So, just get out.
223 */ 201 */
224 if (completion_done(&dqp->q_flush) || 202 if (completion_done(&dqp->q_flush) ||
225 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 203 !(lip->li_flags & XFS_LI_IN_AIL)) {
226 xfs_dqunlock(dqp); 204 xfs_dqunlock(dqp);
227 return; 205 return;
228 } 206 }
229 mp = dqp->q_mount; 207
230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 208 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); 209 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
232 xfs_dqunlock(dqp); 210 xfs_dqunlock(dqp);
233 if (!bp) 211 if (!bp)
234 return; 212 return;
235 if (XFS_BUF_ISDELAYWRITE(bp)) 213 if (XFS_BUF_ISDELAYWRITE(bp))
236 xfs_buf_delwri_promote(bp); 214 xfs_buf_delwri_promote(bp);
237 xfs_buf_relse(bp); 215 xfs_buf_relse(bp);
238 return;
239
240} 216}
241 217
242/* 218/*
@@ -251,15 +227,14 @@ xfs_qm_dquot_logitem_pushbuf(
251 */ 227 */
252STATIC uint 228STATIC uint
253xfs_qm_dquot_logitem_trylock( 229xfs_qm_dquot_logitem_trylock(
254 xfs_dq_logitem_t *qip) 230 struct xfs_log_item *lip)
255{ 231{
256 xfs_dquot_t *dqp; 232 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
257 233
258 dqp = qip->qli_dquot;
259 if (atomic_read(&dqp->q_pincount) > 0) 234 if (atomic_read(&dqp->q_pincount) > 0)
260 return XFS_ITEM_PINNED; 235 return XFS_ITEM_PINNED;
261 236
262 if (! xfs_qm_dqlock_nowait(dqp)) 237 if (!xfs_qm_dqlock_nowait(dqp))
263 return XFS_ITEM_LOCKED; 238 return XFS_ITEM_LOCKED;
264 239
265 if (!xfs_dqflock_nowait(dqp)) { 240 if (!xfs_dqflock_nowait(dqp)) {
@@ -270,11 +245,10 @@ xfs_qm_dquot_logitem_trylock(
270 return XFS_ITEM_PUSHBUF; 245 return XFS_ITEM_PUSHBUF;
271 } 246 }
272 247
273 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); 248 ASSERT(lip->li_flags & XFS_LI_IN_AIL);
274 return XFS_ITEM_SUCCESS; 249 return XFS_ITEM_SUCCESS;
275} 250}
276 251
277
278/* 252/*
279 * Unlock the dquot associated with the log item. 253 * Unlock the dquot associated with the log item.
280 * Clear the fields of the dquot and dquot log item that 254 * Clear the fields of the dquot and dquot log item that
@@ -283,12 +257,10 @@ xfs_qm_dquot_logitem_trylock(
283 */ 257 */
284STATIC void 258STATIC void
285xfs_qm_dquot_logitem_unlock( 259xfs_qm_dquot_logitem_unlock(
286 xfs_dq_logitem_t *ql) 260 struct xfs_log_item *lip)
287{ 261{
288 xfs_dquot_t *dqp; 262 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
289 263
290 ASSERT(ql != NULL);
291 dqp = ql->qli_dquot;
292 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 264 ASSERT(XFS_DQ_IS_LOCKED(dqp));
293 265
294 /* 266 /*
@@ -305,44 +277,32 @@ xfs_qm_dquot_logitem_unlock(
305 xfs_dqunlock(dqp); 277 xfs_dqunlock(dqp);
306} 278}
307 279
308
309/* 280/*
310 * this needs to stamp an lsn into the dquot, I think. 281 * this needs to stamp an lsn into the dquot, I think.
311 * rpc's that look at user dquot's would then have to 282 * rpc's that look at user dquot's would then have to
312 * push on the dependency recorded in the dquot 283 * push on the dependency recorded in the dquot
313 */ 284 */
314/* ARGSUSED */
315STATIC void 285STATIC void
316xfs_qm_dquot_logitem_committing( 286xfs_qm_dquot_logitem_committing(
317 xfs_dq_logitem_t *l, 287 struct xfs_log_item *lip,
318 xfs_lsn_t lsn) 288 xfs_lsn_t lsn)
319{ 289{
320 return;
321} 290}
322 291
323
324/* 292/*
325 * This is the ops vector for dquots 293 * This is the ops vector for dquots
326 */ 294 */
327static struct xfs_item_ops xfs_dquot_item_ops = { 295static struct xfs_item_ops xfs_dquot_item_ops = {
328 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, 296 .iop_size = xfs_qm_dquot_logitem_size,
329 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 297 .iop_format = xfs_qm_dquot_logitem_format,
330 xfs_qm_dquot_logitem_format, 298 .iop_pin = xfs_qm_dquot_logitem_pin,
331 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, 299 .iop_unpin = xfs_qm_dquot_logitem_unpin,
332 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 300 .iop_trylock = xfs_qm_dquot_logitem_trylock,
333 xfs_qm_dquot_logitem_unpin, 301 .iop_unlock = xfs_qm_dquot_logitem_unlock,
334 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 302 .iop_committed = xfs_qm_dquot_logitem_committed,
335 xfs_qm_dquot_logitem_unpin_remove, 303 .iop_push = xfs_qm_dquot_logitem_push,
336 .iop_trylock = (uint(*)(xfs_log_item_t*)) 304 .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
337 xfs_qm_dquot_logitem_trylock, 305 .iop_committing = xfs_qm_dquot_logitem_committing
338 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
339 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
340 xfs_qm_dquot_logitem_committed,
341 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
342 .iop_pushbuf = (void(*)(xfs_log_item_t*))
343 xfs_qm_dquot_logitem_pushbuf,
344 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
345 xfs_qm_dquot_logitem_committing
346}; 306};
347 307
348/* 308/*
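The ops-vector rewrite above is the payoff of the new signatures: every iop_* method now takes struct xfs_log_item *, so the table no longer needs the function-pointer casts (calling a function through an incompatibly-typed pointer is undefined behavior in C), and each method recovers its container via DQUOT_ITEM()/container_of(). A minimal sketch of the pattern (hypothetical types):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct item;

/* Every method takes the generic item; no casts needed in the table. */
struct item_ops {
	unsigned int (*size)(struct item *);
	void (*push)(struct item *);
};

struct item {
	const struct item_ops *ops;
};

struct dquot_item {
	struct item base;
	int id;
};

static unsigned int dquot_item_size(struct item *it)
{
	return 2;	/* one iovec for the format, one for the core */
}

static void dquot_item_push(struct item *it)
{
	/* Recover the container, as DQUOT_ITEM() does. */
	struct dquot_item *dqi = container_of(it, struct dquot_item, base);

	printf("pushing dquot item %d\n", dqi->id);
}

static const struct item_ops dquot_item_ops = {
	.size = dquot_item_size,
	.push = dquot_item_push,
};

int main(void)
{
	struct dquot_item dqi = { .base.ops = &dquot_item_ops, .id = 5 };
	struct item *it = &dqi.base;

	printf("iovecs: %u\n", it->ops->size(it));
	it->ops->push(it);
	return 0;
}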
@@ -352,14 +312,12 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
352 */ 312 */
353void 313void
354xfs_qm_dquot_logitem_init( 314xfs_qm_dquot_logitem_init(
355 struct xfs_dquot *dqp) 315 struct xfs_dquot *dqp)
356{ 316{
357 xfs_dq_logitem_t *lp; 317 struct xfs_dq_logitem *lp = &dqp->q_logitem;
358 lp = &dqp->q_logitem;
359 318
360 lp->qli_item.li_type = XFS_LI_DQUOT; 319 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
361 lp->qli_item.li_ops = &xfs_dquot_item_ops; 320 &xfs_dquot_item_ops);
362 lp->qli_item.li_mountp = dqp->q_mount;
363 lp->qli_dquot = dqp; 321 lp->qli_dquot = dqp;
364 lp->qli_format.qlf_type = XFS_LI_DQUOT; 322 lp->qli_format.qlf_type = XFS_LI_DQUOT;
365 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); 323 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -377,16 +335,22 @@ xfs_qm_dquot_logitem_init(
377 335
378/*------------------ QUOTAOFF LOG ITEMS -------------------*/ 336/*------------------ QUOTAOFF LOG ITEMS -------------------*/
379 337
338static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
339{
340 return container_of(lip, struct xfs_qoff_logitem, qql_item);
341}
342
343
380/* 344/*
381 * This returns the number of iovecs needed to log the given quotaoff item. 345 * This returns the number of iovecs needed to log the given quotaoff item.
382 * We only need 1 iovec for an quotaoff item. It just logs the 346 * We only need 1 iovec for an quotaoff item. It just logs the
383 * quotaoff_log_format structure. 347 * quotaoff_log_format structure.
384 */ 348 */
385/*ARGSUSED*/
386STATIC uint 349STATIC uint
387xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) 350xfs_qm_qoff_logitem_size(
351 struct xfs_log_item *lip)
388{ 352{
389 return (1); 353 return 1;
390} 354}
391 355
392/* 356/*
@@ -397,53 +361,46 @@ xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
397 * slots in the quotaoff item have been filled. 361 * slots in the quotaoff item have been filled.
398 */ 362 */
399STATIC void 363STATIC void
400xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, 364xfs_qm_qoff_logitem_format(
401 xfs_log_iovec_t *log_vector) 365 struct xfs_log_item *lip,
366 struct xfs_log_iovec *log_vector)
402{ 367{
403 ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF); 368 struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
369
370 ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
404 371
405 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); 372 log_vector->i_addr = &qflip->qql_format;
406 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 373 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
407 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; 374 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
408 qf->qql_format.qf_size = 1; 375 qflip->qql_format.qf_size = 1;
409} 376}
410 377
411
412/* 378/*
413 * Pinning has no meaning for an quotaoff item, so just return. 379 * Pinning has no meaning for an quotaoff item, so just return.
414 */ 380 */
415/*ARGSUSED*/
416STATIC void 381STATIC void
417xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) 382xfs_qm_qoff_logitem_pin(
383 struct xfs_log_item *lip)
418{ 384{
419 return;
420} 385}
421 386
422
423/* 387/*
424 * Since pinning has no meaning for an quotaoff item, unpinning does 388 * Since pinning has no meaning for an quotaoff item, unpinning does
425 * not either. 389 * not either.
426 */ 390 */
427/*ARGSUSED*/
428STATIC void 391STATIC void
429xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) 392xfs_qm_qoff_logitem_unpin(
393 struct xfs_log_item *lip,
394 int remove)
430{ 395{
431 return;
432}
433
434/*ARGSUSED*/
435STATIC void
436xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
437{
438 return;
439} 396}
440 397
441/* 398/*
442 * Quotaoff items have no locking, so just return success. 399 * Quotaoff items have no locking, so just return success.
443 */ 400 */
444/*ARGSUSED*/
445STATIC uint 401STATIC uint
446xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) 402xfs_qm_qoff_logitem_trylock(
403 struct xfs_log_item *lip)
447{ 404{
448 return XFS_ITEM_LOCKED; 405 return XFS_ITEM_LOCKED;
449} 406}
@@ -452,53 +409,51 @@ xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
452 * Quotaoff items have no locking or pushing, so return failure 409 * Quotaoff items have no locking or pushing, so return failure
453 * so that the caller doesn't bother with us. 410 * so that the caller doesn't bother with us.
454 */ 411 */
455/*ARGSUSED*/
456STATIC void 412STATIC void
457xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf) 413xfs_qm_qoff_logitem_unlock(
414 struct xfs_log_item *lip)
458{ 415{
459 return;
460} 416}
461 417
462/* 418/*
463 * The quotaoff-start-item is logged only once and cannot be moved in the log, 419 * The quotaoff-start-item is logged only once and cannot be moved in the log,
464 * so simply return the lsn at which it's been logged. 420 * so simply return the lsn at which it's been logged.
465 */ 421 */
466/*ARGSUSED*/
467STATIC xfs_lsn_t 422STATIC xfs_lsn_t
468xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) 423xfs_qm_qoff_logitem_committed(
424 struct xfs_log_item *lip,
425 xfs_lsn_t lsn)
469{ 426{
470 return (lsn); 427 return lsn;
471} 428}
472 429
473/* 430/*
474 * There isn't much you can do to push on an quotaoff item. It is simply 431 * There isn't much you can do to push on an quotaoff item. It is simply
475 * stuck waiting for the log to be flushed to disk. 432 * stuck waiting for the log to be flushed to disk.
476 */ 433 */
477/*ARGSUSED*/
478STATIC void 434STATIC void
479xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf) 435xfs_qm_qoff_logitem_push(
436 struct xfs_log_item *lip)
480{ 437{
481 return;
482} 438}
483 439
484 440
485/*ARGSUSED*/
486STATIC xfs_lsn_t 441STATIC xfs_lsn_t
487xfs_qm_qoffend_logitem_committed( 442xfs_qm_qoffend_logitem_committed(
488 xfs_qoff_logitem_t *qfe, 443 struct xfs_log_item *lip,
489 xfs_lsn_t lsn) 444 xfs_lsn_t lsn)
490{ 445{
491 xfs_qoff_logitem_t *qfs; 446 struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
492 struct xfs_ail *ailp; 447 struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
448 struct xfs_ail *ailp = qfs->qql_item.li_ailp;
493 449
494 qfs = qfe->qql_start_lip;
495 ailp = qfs->qql_item.li_ailp;
496 spin_lock(&ailp->xa_lock);
497 /* 450 /*
498 * Delete the qoff-start logitem from the AIL. 451 * Delete the qoff-start logitem from the AIL.
499 * xfs_trans_ail_delete() drops the AIL lock. 452 * xfs_trans_ail_delete() drops the AIL lock.
500 */ 453 */
454 spin_lock(&ailp->xa_lock);
501 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); 455 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
456
502 kmem_free(qfs); 457 kmem_free(qfs);
503 kmem_free(qfe); 458 kmem_free(qfe);
504 return (xfs_lsn_t)-1; 459 return (xfs_lsn_t)-1;
@@ -518,82 +473,58 @@ xfs_qm_qoffend_logitem_committed(
518 * (truly makes the quotaoff irrevocable). If we do something else, 473 * (truly makes the quotaoff irrevocable). If we do something else,
519 * then maybe we don't need two. 474 * then maybe we don't need two.
520 */ 475 */
521/* ARGSUSED */
522STATIC void
523xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
524{
525 return;
526}
527
528/* ARGSUSED */
529STATIC void 476STATIC void
530xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) 477xfs_qm_qoff_logitem_committing(
478 struct xfs_log_item *lip,
479 xfs_lsn_t commit_lsn)
531{ 480{
532 return;
533} 481}
534 482
535static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { 483static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
536 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, 484 .iop_size = xfs_qm_qoff_logitem_size,
537 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 485 .iop_format = xfs_qm_qoff_logitem_format,
538 xfs_qm_qoff_logitem_format, 486 .iop_pin = xfs_qm_qoff_logitem_pin,
539 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 487 .iop_unpin = xfs_qm_qoff_logitem_unpin,
540 .iop_unpin = (void(*)(xfs_log_item_t* ,int)) 488 .iop_trylock = xfs_qm_qoff_logitem_trylock,
541 xfs_qm_qoff_logitem_unpin, 489 .iop_unlock = xfs_qm_qoff_logitem_unlock,
542 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 490 .iop_committed = xfs_qm_qoffend_logitem_committed,
543 xfs_qm_qoff_logitem_unpin_remove, 491 .iop_push = xfs_qm_qoff_logitem_push,
544 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 492 .iop_committing = xfs_qm_qoff_logitem_committing
545 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
546 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
547 xfs_qm_qoffend_logitem_committed,
548 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
549 .iop_pushbuf = NULL,
550 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
551 xfs_qm_qoffend_logitem_committing
552}; 493};
553 494
554/* 495/*
555 * This is the ops vector shared by all quotaoff-start log items. 496 * This is the ops vector shared by all quotaoff-start log items.
556 */ 497 */
557static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { 498static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
558 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, 499 .iop_size = xfs_qm_qoff_logitem_size,
559 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 500 .iop_format = xfs_qm_qoff_logitem_format,
560 xfs_qm_qoff_logitem_format, 501 .iop_pin = xfs_qm_qoff_logitem_pin,
561 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 502 .iop_unpin = xfs_qm_qoff_logitem_unpin,
562 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 503 .iop_trylock = xfs_qm_qoff_logitem_trylock,
563 xfs_qm_qoff_logitem_unpin, 504 .iop_unlock = xfs_qm_qoff_logitem_unlock,
564 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 505 .iop_committed = xfs_qm_qoff_logitem_committed,
565 xfs_qm_qoff_logitem_unpin_remove, 506 .iop_push = xfs_qm_qoff_logitem_push,
566 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 507 .iop_committing = xfs_qm_qoff_logitem_committing
567 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
568 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
569 xfs_qm_qoff_logitem_committed,
570 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
571 .iop_pushbuf = NULL,
572 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
573 xfs_qm_qoff_logitem_committing
574}; 508};
575 509
576/* 510/*
577 * Allocate and initialize an quotaoff item of the correct quota type(s). 511 * Allocate and initialize an quotaoff item of the correct quota type(s).
578 */ 512 */
579xfs_qoff_logitem_t * 513struct xfs_qoff_logitem *
580xfs_qm_qoff_logitem_init( 514xfs_qm_qoff_logitem_init(
581 struct xfs_mount *mp, 515 struct xfs_mount *mp,
582 xfs_qoff_logitem_t *start, 516 struct xfs_qoff_logitem *start,
583 uint flags) 517 uint flags)
584{ 518{
585 xfs_qoff_logitem_t *qf; 519 struct xfs_qoff_logitem *qf;
586 520
587 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); 521 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
588 522
589 qf->qql_item.li_type = XFS_LI_QUOTAOFF; 523 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
590 if (start) 524 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
591 qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
592 else
593 qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
594 qf->qql_item.li_mountp = mp; 525 qf->qql_item.li_mountp = mp;
595 qf->qql_format.qf_type = XFS_LI_QUOTAOFF; 526 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
596 qf->qql_format.qf_flags = flags; 527 qf->qql_format.qf_flags = flags;
597 qf->qql_start_lip = start; 528 qf->qql_start_lip = start;
598 return (qf); 529 return qf;
599} 530}
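xfs_qm_qoff_logitem_init() now picks the ops vector with a single conditional, and the qoffend committed handler above shows why the pairing exists: the start item stays in the AIL until the matching end item commits, at which point both are deleted and freed together. A sketch of that lifetime (plain C, hypothetical names, error handling elided):

#include <stdio.h>
#include <stdlib.h>

struct qoff_item {
	struct qoff_item *start;	/* end item points back at start */
	const char *name;
};

static struct qoff_item *qoff_item_init(struct qoff_item *start,
					const char *name)
{
	struct qoff_item *qf = calloc(1, sizeof(*qf));

	qf->start = start;		/* NULL when this *is* the start */
	qf->name = name;
	return qf;
}

/* Committing the end item retires the whole pair, mirroring
 * xfs_qm_qoffend_logitem_committed() freeing qfs and qfe. */
static void qoff_end_committed(struct qoff_item *end)
{
	printf("retiring %s and %s\n", end->start->name, end->name);
	free(end->start);
	free(end);
}

int main(void)
{
	struct qoff_item *start = qoff_item_init(NULL, "quotaoff-start");
	struct qoff_item *end = qoff_item_init(start, "quotaoff-end");

	qoff_end_committed(end);
	return 0;
}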
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e3d9dd..9a92407109a1 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -23,25 +23,18 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 31#include "xfs_dinode.h"
37#include "xfs_inode.h" 32#include "xfs_inode.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h" 33#include "xfs_ialloc.h"
40#include "xfs_itable.h" 34#include "xfs_itable.h"
41#include "xfs_rtalloc.h" 35#include "xfs_rtalloc.h"
42#include "xfs_error.h" 36#include "xfs_error.h"
43#include "xfs_bmap.h" 37#include "xfs_bmap.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 38#include "xfs_attr.h"
46#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
47#include "xfs_trans_space.h" 40#include "xfs_trans_space.h"
@@ -67,12 +60,9 @@ static cred_t xfs_zerocr;
67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 60STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 61STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 62
70STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
72
73STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 63STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
74STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 64STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
75STATIC int xfs_qm_shake(int, gfp_t); 65STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t);
76 66
77static struct shrinker xfs_qm_shaker = { 67static struct shrinker xfs_qm_shaker = {
78 .shrink = xfs_qm_shake, 68 .shrink = xfs_qm_shake,
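
The extra struct shrinker * argument added to the callback signature lets a shrinker recover per-instance state from its registration struct; xfs_qm_shake() itself keeps using the global xfs_Gqm, so the parameter goes unused there. A user-space sketch of the container_of() idiom the new signature enables (mock types, hypothetical names):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct shrinker {
        int (*shrink)(struct shrinker *, int nr_to_scan);
};

struct qm_state {
        int free_dquots;
        struct shrinker shaker;         /* embedded registration */
};

static int qm_shake(struct shrinker *shrink, int nr_to_scan)
{
        struct qm_state *qm = container_of(shrink, struct qm_state, shaker);

        if (nr_to_scan > qm->free_dquots)
                nr_to_scan = qm->free_dquots;
        qm->free_dquots -= nr_to_scan;
        return qm->free_dquots;         /* remaining reclaimable objects */
}

int main(void)
{
        struct qm_state qm = { .free_dquots = 32,
                               .shaker = { .shrink = qm_shake } };

        printf("left after shake: %d\n", qm.shaker.shrink(&qm.shaker, 10));
        return 0;
}
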
@@ -84,21 +74,25 @@ extern struct mutex qcheck_lock;
84#endif 74#endif
85 75
86#ifdef QUOTADEBUG 76#ifdef QUOTADEBUG
87#define XQM_LIST_PRINT(l, NXT, title) \ 77static void
88{ \ 78xfs_qm_dquot_list_print(
89 xfs_dquot_t *dqp; int i = 0; \ 79 struct xfs_mount *mp)
90 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 80{
91 for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ 81 xfs_dquot_t *dqp;
92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ 82 int i = 0;
93 "bcnt = %d, icnt = %d, refs = %d", \ 83
94 ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ 84 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) {
95 DQFLAGTO_TYPESTR(dqp), \ 85 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" "
96 (int) be64_to_cpu(dqp->q_core.d_bcount), \ 86 "bcnt = %lld, icnt = %lld, refs = %d",
97 (int) be64_to_cpu(dqp->q_core.d_icount), \ 87 i++, be32_to_cpu(dqp->q_core.d_id),
98 (int) dqp->q_nrefs); } \ 88 DQFLAGTO_TYPESTR(dqp),
89 (long long)be64_to_cpu(dqp->q_core.d_bcount),
90 (long long)be64_to_cpu(dqp->q_core.d_icount),
91 dqp->q_nrefs);
92 }
99} 93}
100#else 94#else
101#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) 95static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
102#endif 96#endif
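
Turning the XQM_LIST_PRINT macro into a real function also turns the no-debug fallback from an empty do/while macro into an empty static function, so call sites stay unconditional and keep being type-checked when QUOTADEBUG is off. The same pattern in miniature (compile with -DQUOTADEBUG to get output):

#include <stdio.h>

struct mount { int ndquots; };

#ifdef QUOTADEBUG
static void dquot_list_print(struct mount *mp)
{
        printf("%d dquots on the mount list\n", mp->ndquots);
}
#else
static void dquot_list_print(struct mount *mp) { }
#endif

int main(void)
{
        struct mount m = { .ndquots = 3 };

        dquot_list_print(&m);   /* compiles either way */
        return 0;
}
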
103 97
104/* 98/*
@@ -144,7 +138,9 @@ xfs_Gqm_init(void)
144 /* 138 /*
145 * Freelist of all dquots of all file systems 139 * Freelist of all dquots of all file systems
146 */ 140 */
147 xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); 141 INIT_LIST_HEAD(&xqm->qm_dqfrlist);
142 xqm->qm_dqfrlist_cnt = 0;
143 mutex_init(&xqm->qm_dqfrlist_lock);
148 144
149 /* 145 /*
150 * dquot zone. we register our own low-memory callback. 146 * dquot zone. we register our own low-memory callback.
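
This INIT_LIST_HEAD/mutex/counter triple is the template for the whole patch: the hand-rolled xfs_frlist_t (qh_next/qh_prev plus a version and element count) gives way to a standard struct list_head guarded by a separate mutex, with the count kept by hand. A compact user-space mock of the list primitives the converted code relies on, simplified from the include/linux/list.h behavior they assume:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

static void list_del_init(struct list_head *e)
{
        e->prev->next = e->next;
        e->next->prev = e->prev;
        INIT_LIST_HEAD(e);      /* node now looks "not on any list" */
}

static int list_empty(const struct list_head *h) { return h->next == h; }

struct dquot { int id; struct list_head q_freelist; };

int main(void)
{
        struct list_head frlist;
        struct dquot a = { .id = 1 }, b = { .id = 2 };
        struct list_head *pos, *n;
        int cnt = 0;

        INIT_LIST_HEAD(&frlist);
        list_add(&a.q_freelist, &frlist); cnt++;
        list_add(&b.q_freelist, &frlist); cnt++;

        /* safe iteration: 'n' caches ->next so the entry may be unlinked */
        for (pos = frlist.next, n = pos->next; pos != &frlist;
             pos = n, n = pos->next) {
                struct dquot *dq = list_entry(pos, struct dquot, q_freelist);

                printf("reclaim dquot %d\n", dq->id);
                list_del_init(&dq->q_freelist); cnt--;
        }
        printf("cnt=%d empty=%d\n", cnt, list_empty(&frlist));
        return 0;
}

The cached-next form is exactly what list_for_each_entry_safe expands to, which is why the destroy loop below can free entries while walking.
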
@@ -189,6 +185,7 @@ STATIC void
189xfs_qm_destroy( 185xfs_qm_destroy(
190 struct xfs_qm *xqm) 186 struct xfs_qm *xqm)
191{ 187{
188 struct xfs_dquot *dqp, *n;
192 int hsize, i; 189 int hsize, i;
193 190
194 ASSERT(xqm != NULL); 191 ASSERT(xqm != NULL);
@@ -204,7 +201,21 @@ xfs_qm_destroy(
204 xqm->qm_usr_dqhtable = NULL; 201 xqm->qm_usr_dqhtable = NULL;
205 xqm->qm_grp_dqhtable = NULL; 202 xqm->qm_grp_dqhtable = NULL;
206 xqm->qm_dqhashmask = 0; 203 xqm->qm_dqhashmask = 0;
207 xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); 204
205 /* frlist cleanup */
206 mutex_lock(&xqm->qm_dqfrlist_lock);
207 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
208 xfs_dqlock(dqp);
209#ifdef QUOTADEBUG
210 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
211#endif
212 list_del_init(&dqp->q_freelist);
213 xfs_Gqm->qm_dqfrlist_cnt--;
214 xfs_dqunlock(dqp);
215 xfs_qm_dqdestroy(dqp);
216 }
217 mutex_unlock(&xqm->qm_dqfrlist_lock);
218 mutex_destroy(&xqm->qm_dqfrlist_lock);
208#ifdef DEBUG 219#ifdef DEBUG
209 mutex_destroy(&qcheck_lock); 220 mutex_destroy(&qcheck_lock);
210#endif 221#endif
@@ -231,8 +242,10 @@ xfs_qm_hold_quotafs_ref(
231 242
232 if (!xfs_Gqm) { 243 if (!xfs_Gqm) {
233 xfs_Gqm = xfs_Gqm_init(); 244 xfs_Gqm = xfs_Gqm_init();
234 if (!xfs_Gqm) 245 if (!xfs_Gqm) {
246 mutex_unlock(&xfs_Gqm_lock);
235 return ENOMEM; 247 return ENOMEM;
248 }
236 } 249 }
237 250
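
The two lines added above close a lock leak in xfs_qm_hold_quotafs_ref(): on the ENOMEM path, xfs_Gqm_lock was previously left held. The shape of the fix, sketched with a pthread mutex standing in for the kernel mutex (all names hypothetical):

#include <pthread.h>
#include <stdio.h>
#include <errno.h>

static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
static void *g_state;

static void *init_state(void) { return NULL; }  /* pretend allocation fails */

static int hold_ref(void)
{
        pthread_mutex_lock(&g_lock);
        if (!g_state) {
                g_state = init_state();
                if (!g_state) {
                        pthread_mutex_unlock(&g_lock);  /* the added unlock */
                        return ENOMEM;
                }
        }
        /* ... take the reference ... */
        pthread_mutex_unlock(&g_lock);
        return 0;
}

int main(void)
{
        printf("hold_ref() = %d\n", hold_ref());
        return 0;
}
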
238 /* 251 /*
@@ -256,7 +269,7 @@ STATIC void
256xfs_qm_rele_quotafs_ref( 269xfs_qm_rele_quotafs_ref(
257 struct xfs_mount *mp) 270 struct xfs_mount *mp)
258{ 271{
259 xfs_dquot_t *dqp, *nextdqp; 272 xfs_dquot_t *dqp, *n;
260 273
261 ASSERT(xfs_Gqm); 274 ASSERT(xfs_Gqm);
262 ASSERT(xfs_Gqm->qm_nrefs > 0); 275 ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +277,24 @@ xfs_qm_rele_quotafs_ref(
264 /* 277 /*
265 * Go thru the freelist and destroy all inactive dquots. 278 * Go thru the freelist and destroy all inactive dquots.
266 */ 279 */
267 xfs_qm_freelist_lock(xfs_Gqm); 280 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
268 281
269 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 282 list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
270 dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
271 xfs_dqlock(dqp); 283 xfs_dqlock(dqp);
272 nextdqp = dqp->dq_flnext;
273 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 284 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
274 ASSERT(dqp->q_mount == NULL); 285 ASSERT(dqp->q_mount == NULL);
275 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 286 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
276 ASSERT(dqp->HL_PREVP == NULL); 287 ASSERT(list_empty(&dqp->q_hashlist));
277 ASSERT(dqp->MPL_PREVP == NULL); 288 ASSERT(list_empty(&dqp->q_mplist));
278 XQM_FREELIST_REMOVE(dqp); 289 list_del_init(&dqp->q_freelist);
290 xfs_Gqm->qm_dqfrlist_cnt--;
279 xfs_dqunlock(dqp); 291 xfs_dqunlock(dqp);
280 xfs_qm_dqdestroy(dqp); 292 xfs_qm_dqdestroy(dqp);
281 } else { 293 } else {
282 xfs_dqunlock(dqp); 294 xfs_dqunlock(dqp);
283 } 295 }
284 dqp = nextdqp;
285 } 296 }
286 xfs_qm_freelist_unlock(xfs_Gqm); 297 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
287 298
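
Note how the membership checks translate in this hunk: "not on a list" used to be HL_PREVP == NULL or MPL_PREVP == NULL, and becomes list_empty(&dqp->q_hashlist) / list_empty(&dqp->q_mplist). That only works because removals use list_del_init(), which points the node back at itself; a plain unlink would leave list_empty() false. A small sketch of the distinction (list ops simplified from include/linux/list.h):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *e, struct list_head *head)
{
        e->next = head->next;
        e->prev = head;
        head->next->prev = e;
        head->next = e;
}

static void list_del_init(struct list_head *e)
{
        e->prev->next = e->next;
        e->next->prev = e->prev;
        INIT_LIST_HEAD(e);      /* list_empty(e) is now true */
}

static int list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
        struct list_head head, node;

        INIT_LIST_HEAD(&head);
        list_add(&node, &head);
        printf("on list:  empty(&node)=%d\n", list_empty(&node));  /* 0 */
        list_del_init(&node);
        printf("off list: empty(&node)=%d\n", list_empty(&node));  /* 1 */
        return 0;
}
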
288 /* 299 /*
289 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 300 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +316,7 @@ xfs_qm_unmount(
305 struct xfs_mount *mp) 316 struct xfs_mount *mp)
306{ 317{
307 if (mp->m_quotainfo) { 318 if (mp->m_quotainfo) {
308 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 319 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
309 xfs_qm_destroy_quotainfo(mp); 320 xfs_qm_destroy_quotainfo(mp);
310 } 321 }
311} 322}
@@ -449,20 +460,21 @@ xfs_qm_unmount_quotas(
449 */ 460 */
450STATIC int 461STATIC int
451xfs_qm_dqflush_all( 462xfs_qm_dqflush_all(
452 xfs_mount_t *mp, 463 struct xfs_mount *mp,
453 int sync_mode) 464 int sync_mode)
454{ 465{
455 int recl; 466 struct xfs_quotainfo *q = mp->m_quotainfo;
456 xfs_dquot_t *dqp; 467 int recl;
457 int niters; 468 struct xfs_dquot *dqp;
458 int error; 469 int niters;
470 int error;
459 471
460 if (mp->m_quotainfo == NULL) 472 if (!q)
461 return 0; 473 return 0;
462 niters = 0; 474 niters = 0;
463again: 475again:
464 xfs_qm_mplist_lock(mp); 476 mutex_lock(&q->qi_dqlist_lock);
465 FOREACH_DQUOT_IN_MP(dqp, mp) { 477 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
466 xfs_dqlock(dqp); 478 xfs_dqlock(dqp);
467 if (! XFS_DQ_IS_DIRTY(dqp)) { 479 if (! XFS_DQ_IS_DIRTY(dqp)) {
468 xfs_dqunlock(dqp); 480 xfs_dqunlock(dqp);
@@ -470,7 +482,7 @@ again:
470 } 482 }
471 483
472 /* XXX a sentinel would be better */ 484 /* XXX a sentinel would be better */
473 recl = XFS_QI_MPLRECLAIMS(mp); 485 recl = q->qi_dqreclaims;
474 if (!xfs_dqflock_nowait(dqp)) { 486 if (!xfs_dqflock_nowait(dqp)) {
475 /* 487 /*
476 * If we can't grab the flush lock then check 488 * If we can't grab the flush lock then check
@@ -485,21 +497,21 @@ again:
485 * Let go of the mplist lock. We don't want to hold it 497 * Let go of the mplist lock. We don't want to hold it
486 * across a disk write. 498 * across a disk write.
487 */ 499 */
488 xfs_qm_mplist_unlock(mp); 500 mutex_unlock(&q->qi_dqlist_lock);
489 error = xfs_qm_dqflush(dqp, sync_mode); 501 error = xfs_qm_dqflush(dqp, sync_mode);
490 xfs_dqunlock(dqp); 502 xfs_dqunlock(dqp);
491 if (error) 503 if (error)
492 return error; 504 return error;
493 505
494 xfs_qm_mplist_lock(mp); 506 mutex_lock(&q->qi_dqlist_lock);
495 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 507 if (recl != q->qi_dqreclaims) {
496 xfs_qm_mplist_unlock(mp); 508 mutex_unlock(&q->qi_dqlist_lock);
497 /* XXX restart limit */ 509 /* XXX restart limit */
498 goto again; 510 goto again;
499 } 511 }
500 } 512 }
501 513
502 xfs_qm_mplist_unlock(mp); 514 mutex_unlock(&q->qi_dqlist_lock);
503 /* return ! busy */ 515 /* return ! busy */
504 return 0; 516 return 0;
505} 517}
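
The control flow of xfs_qm_dqflush_all() survives the lock conversion: qi_dqreclaims is a generation counter ("a change here indicates a removal in the dqlist", per the header), sampled into recl before the list lock is dropped around the disk write and compared afterwards, restarting the walk if the list may have changed under us. The restart discipline in a single-threaded user-space sketch (no real locking or I/O):

#include <stdio.h>

#define NITEMS 4

static int dirty[NITEMS] = { 1, 0, 1, 1 };
static int generation;          /* bumped on any list removal */

/* Pretend something raced with us and removed an entry mid-flush. */
static void flush_to_disk(int i)
{
        if (i == 2)
                generation++;
}

static int flush_all(void)
{
        int i, recl, restarts = 0;

again:
        /* "lock" held from here ... */
        for (i = 0; i < NITEMS; i++) {
                if (!dirty[i])
                        continue;
                recl = generation;      /* sample before dropping the lock */
                /* ... "unlock", do I/O, "relock" ... */
                flush_to_disk(i);
                dirty[i] = 0;
                if (recl != generation) {
                        if (++restarts > 8)
                                return -1;      /* XXX restart limit */
                        goto again;             /* iterator may be stale */
                }
        }
        return restarts;
}

int main(void)
{
        printf("restarts = %d\n", flush_all());
        return 0;
}

Restarting from the head is the simple, conservative answer to iterating without a stable cursor; the "XXX a sentinel would be better" comments above acknowledge the cost.
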
@@ -509,15 +521,15 @@ again:
509 */ 521 */
510STATIC void 522STATIC void
511xfs_qm_detach_gdquots( 523xfs_qm_detach_gdquots(
512 xfs_mount_t *mp) 524 struct xfs_mount *mp)
513{ 525{
514 xfs_dquot_t *dqp, *gdqp; 526 struct xfs_quotainfo *q = mp->m_quotainfo;
515 int nrecl; 527 struct xfs_dquot *dqp, *gdqp;
528 int nrecl;
516 529
517 again: 530 again:
518 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 531 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
519 dqp = XFS_QI_MPLNEXT(mp); 532 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
520 while (dqp) {
521 xfs_dqlock(dqp); 533 xfs_dqlock(dqp);
522 if ((gdqp = dqp->q_gdquot)) { 534 if ((gdqp = dqp->q_gdquot)) {
523 xfs_dqlock(gdqp); 535 xfs_dqlock(gdqp);
@@ -530,15 +542,14 @@ xfs_qm_detach_gdquots(
530 * Can't hold the mplist lock across a dqput. 542 * Can't hold the mplist lock across a dqput.
531 * XXX must convert to marker-based iterations here. 543 * XXX must convert to marker-based iterations here.
532 */ 544 */
533 nrecl = XFS_QI_MPLRECLAIMS(mp); 545 nrecl = q->qi_dqreclaims;
534 xfs_qm_mplist_unlock(mp); 546 mutex_unlock(&q->qi_dqlist_lock);
535 xfs_qm_dqput(gdqp); 547 xfs_qm_dqput(gdqp);
536 548
537 xfs_qm_mplist_lock(mp); 549 mutex_lock(&q->qi_dqlist_lock);
538 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) 550 if (nrecl != q->qi_dqreclaims)
539 goto again; 551 goto again;
540 } 552 }
541 dqp = dqp->MPL_NEXT;
542 } 553 }
543} 554}
544 555
@@ -550,23 +561,23 @@ xfs_qm_detach_gdquots(
550 */ 561 */
551STATIC int 562STATIC int
552xfs_qm_dqpurge_int( 563xfs_qm_dqpurge_int(
553 xfs_mount_t *mp, 564 struct xfs_mount *mp,
554 uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ 565 uint flags)
555{ 566{
556 xfs_dquot_t *dqp; 567 struct xfs_quotainfo *q = mp->m_quotainfo;
557 uint dqtype; 568 struct xfs_dquot *dqp, *n;
558 int nrecl; 569 uint dqtype;
559 xfs_dquot_t *nextdqp; 570 int nrecl;
560 int nmisses; 571 int nmisses;
561 572
562 if (mp->m_quotainfo == NULL) 573 if (!q)
563 return 0; 574 return 0;
564 575
565 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; 576 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
566 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; 577 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
567 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; 578 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
568 579
569 xfs_qm_mplist_lock(mp); 580 mutex_lock(&q->qi_dqlist_lock);
570 581
571 /* 582 /*
572 * In the first pass through all incore dquots of this filesystem, 583 * In the first pass through all incore dquots of this filesystem,
@@ -578,28 +589,25 @@ xfs_qm_dqpurge_int(
578 589
579 again: 590 again:
580 nmisses = 0; 591 nmisses = 0;
581 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 592 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
582 /* 593 /*
583 * Try to get rid of all of the unwanted dquots. The idea is to 594 * Try to get rid of all of the unwanted dquots. The idea is to
584 * get them off mplist and hashlist, but leave them on freelist. 595 * get them off mplist and hashlist, but leave them on freelist.
585 */ 596 */
586 dqp = XFS_QI_MPLNEXT(mp); 597 list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
587 while (dqp) {
588 /* 598 /*
589 * It's OK to look at the type without taking dqlock here. 599 * It's OK to look at the type without taking dqlock here.
590 * We're holding the mplist lock here, and that's needed for 600 * We're holding the mplist lock here, and that's needed for
591 * a dqreclaim. 601 * a dqreclaim.
592 */ 602 */
593 if ((dqp->dq_flags & dqtype) == 0) { 603 if ((dqp->dq_flags & dqtype) == 0)
594 dqp = dqp->MPL_NEXT;
595 continue; 604 continue;
596 }
597 605
598 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 606 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
599 nrecl = XFS_QI_MPLRECLAIMS(mp); 607 nrecl = q->qi_dqreclaims;
600 xfs_qm_mplist_unlock(mp); 608 mutex_unlock(&q->qi_dqlist_lock);
601 mutex_lock(&dqp->q_hash->qh_lock); 609 mutex_lock(&dqp->q_hash->qh_lock);
602 xfs_qm_mplist_lock(mp); 610 mutex_lock(&q->qi_dqlist_lock);
603 611
604 /* 612 /*
605 * XXXTheoretically, we can get into a very long 613 * XXXTheoretically, we can get into a very long
@@ -607,7 +615,7 @@ xfs_qm_dqpurge_int(
607 * No one can be adding dquots to the mplist at 615 * No one can be adding dquots to the mplist at
608 * this point, but somebody might be taking things off. 616 * this point, but somebody might be taking things off.
609 */ 617 */
610 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { 618 if (nrecl != q->qi_dqreclaims) {
611 mutex_unlock(&dqp->q_hash->qh_lock); 619 mutex_unlock(&dqp->q_hash->qh_lock);
612 goto again; 620 goto again;
613 } 621 }
@@ -617,11 +625,9 @@ xfs_qm_dqpurge_int(
617 * Take the dquot off the mplist and hashlist. It may remain on 625 * Take the dquot off the mplist and hashlist. It may remain on
618 * freelist in INACTIVE state. 626 * freelist in INACTIVE state.
619 */ 627 */
620 nextdqp = dqp->MPL_NEXT;
621 nmisses += xfs_qm_dqpurge(dqp); 628 nmisses += xfs_qm_dqpurge(dqp);
622 dqp = nextdqp;
623 } 629 }
624 xfs_qm_mplist_unlock(mp); 630 mutex_unlock(&q->qi_dqlist_lock);
625 return nmisses; 631 return nmisses;
626} 632}
627 633
@@ -921,12 +927,13 @@ xfs_qm_dqdetach(
921 927
922int 928int
923xfs_qm_sync( 929xfs_qm_sync(
924 xfs_mount_t *mp, 930 struct xfs_mount *mp,
925 int flags) 931 int flags)
926{ 932{
927 int recl, restarts; 933 struct xfs_quotainfo *q = mp->m_quotainfo;
928 xfs_dquot_t *dqp; 934 int recl, restarts;
929 int error; 935 struct xfs_dquot *dqp;
936 int error;
930 937
931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 938 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
932 return 0; 939 return 0;
@@ -934,18 +941,19 @@ xfs_qm_sync(
934 restarts = 0; 941 restarts = 0;
935 942
936 again: 943 again:
937 xfs_qm_mplist_lock(mp); 944 mutex_lock(&q->qi_dqlist_lock);
938 /* 945 /*
939 * dqpurge_all() also takes the mplist lock and iterates through all dquots 946 * dqpurge_all() also takes the mplist lock and iterates through all dquots
940 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared 947 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
941 * when we have the mplist lock, we know that dquots will be consistent 948 * when we have the mplist lock, we know that dquots will be consistent
942 * as long as we have it locked. 949 * as long as we have it locked.
943 */ 950 */
944 if (! XFS_IS_QUOTA_ON(mp)) { 951 if (!XFS_IS_QUOTA_ON(mp)) {
945 xfs_qm_mplist_unlock(mp); 952 mutex_unlock(&q->qi_dqlist_lock);
946 return 0; 953 return 0;
947 } 954 }
948 FOREACH_DQUOT_IN_MP(dqp, mp) { 955 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
956 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
949 /* 957 /*
950 * If this is vfs_sync calling, then skip the dquots that 958 * If this is vfs_sync calling, then skip the dquots that
951 * don't 'seem' to be dirty. i.e. don't acquire dqlock. 959 * don't 'seem' to be dirty. i.e. don't acquire dqlock.
@@ -969,7 +977,7 @@ xfs_qm_sync(
969 } 977 }
970 978
971 /* XXX a sentinel would be better */ 979 /* XXX a sentinel would be better */
972 recl = XFS_QI_MPLRECLAIMS(mp); 980 recl = q->qi_dqreclaims;
973 if (!xfs_dqflock_nowait(dqp)) { 981 if (!xfs_dqflock_nowait(dqp)) {
974 if (flags & SYNC_TRYLOCK) { 982 if (flags & SYNC_TRYLOCK) {
975 xfs_dqunlock(dqp); 983 xfs_dqunlock(dqp);
@@ -989,7 +997,7 @@ xfs_qm_sync(
989 * Let go of the mplist lock. We don't want to hold it 997 * Let go of the mplist lock. We don't want to hold it
990 * across a disk write 998 * across a disk write
991 */ 999 */
992 xfs_qm_mplist_unlock(mp); 1000 mutex_unlock(&q->qi_dqlist_lock);
993 error = xfs_qm_dqflush(dqp, flags); 1001 error = xfs_qm_dqflush(dqp, flags);
994 xfs_dqunlock(dqp); 1002 xfs_dqunlock(dqp);
995 if (error && XFS_FORCED_SHUTDOWN(mp)) 1003 if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1005,17 @@ xfs_qm_sync(
997 else if (error) 1005 else if (error)
998 return error; 1006 return error;
999 1007
1000 xfs_qm_mplist_lock(mp); 1008 mutex_lock(&q->qi_dqlist_lock);
1001 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 1009 if (recl != q->qi_dqreclaims) {
1002 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) 1010 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
1003 break; 1011 break;
1004 1012
1005 xfs_qm_mplist_unlock(mp); 1013 mutex_unlock(&q->qi_dqlist_lock);
1006 goto again; 1014 goto again;
1007 } 1015 }
1008 } 1016 }
1009 1017
1010 xfs_qm_mplist_unlock(mp); 1018 mutex_unlock(&q->qi_dqlist_lock);
1011 return 0; 1019 return 0;
1012} 1020}
1013 1021
@@ -1052,8 +1060,9 @@ xfs_qm_init_quotainfo(
1052 return error; 1060 return error;
1053 } 1061 }
1054 1062
1055 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1063 INIT_LIST_HEAD(&qinf->qi_dqlist);
1056 lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); 1064 mutex_init(&qinf->qi_dqlist_lock);
1065 lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
1057 1066
1058 qinf->qi_dqreclaims = 0; 1067 qinf->qi_dqreclaims = 0;
1059 1068
@@ -1150,7 +1159,8 @@ xfs_qm_destroy_quotainfo(
1150 */ 1159 */
1151 xfs_qm_rele_quotafs_ref(mp); 1160 xfs_qm_rele_quotafs_ref(mp);
1152 1161
1153 xfs_qm_list_destroy(&qi->qi_dqlist); 1162 ASSERT(list_empty(&qi->qi_dqlist));
1163 mutex_destroy(&qi->qi_dqlist_lock);
1154 1164
1155 if (qi->qi_uquotaip) { 1165 if (qi->qi_uquotaip) {
1156 IRELE(qi->qi_uquotaip); 1166 IRELE(qi->qi_uquotaip);
@@ -1177,7 +1187,7 @@ xfs_qm_list_init(
1177 int n) 1187 int n)
1178{ 1188{
1179 mutex_init(&list->qh_lock); 1189 mutex_init(&list->qh_lock);
1180 list->qh_next = NULL; 1190 INIT_LIST_HEAD(&list->qh_list);
1181 list->qh_version = 0; 1191 list->qh_version = 0;
1182 list->qh_nelems = 0; 1192 list->qh_nelems = 0;
1183} 1193}
@@ -1316,9 +1326,6 @@ xfs_qm_qino_alloc(
1316 */ 1326 */
1317 spin_lock(&mp->m_sb_lock); 1327 spin_lock(&mp->m_sb_lock);
1318 if (flags & XFS_QMOPT_SBVERSION) { 1328 if (flags & XFS_QMOPT_SBVERSION) {
1319#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1320 unsigned oldv = mp->m_sb.sb_versionnum;
1321#endif
1322 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 1329 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
1323 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1330 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1324 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 1331 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1338,6 @@ xfs_qm_qino_alloc(
1331 1338
1332 /* qflags will get updated _after_ quotacheck */ 1339 /* qflags will get updated _after_ quotacheck */
1333 mp->m_sb.sb_qflags = 0; 1340 mp->m_sb.sb_qflags = 0;
1334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1335 cmn_err(CE_NOTE,
1336 "Old superblock version %x, converting to %x.",
1337 oldv, mp->m_sb.sb_versionnum);
1338#endif
1339 } 1341 }
1340 if (flags & XFS_QMOPT_UQUOTA) 1342 if (flags & XFS_QMOPT_UQUOTA)
1341 mp->m_sb.sb_uquotino = (*ip)->i_ino; 1343 mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1373,10 @@ xfs_qm_reset_dqcounts(
1371#ifdef DEBUG 1373#ifdef DEBUG
1372 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 1374 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1373 do_div(j, sizeof(xfs_dqblk_t)); 1375 do_div(j, sizeof(xfs_dqblk_t));
1374 ASSERT(XFS_QM_DQPERBLK(mp) == j); 1376 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
1375#endif 1377#endif
1376 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); 1378 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
1377 for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { 1379 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
1378 /* 1380 /*
1379 * Do a sanity check, and if needed, repair the dqblk. Don't 1381 * Do a sanity check, and if needed, repair the dqblk. Don't
1380 * output any warnings because it's perfectly possible to 1382 * output any warnings because it's perfectly possible to
@@ -1429,7 +1431,7 @@ xfs_qm_dqiter_bufs(
1429 while (blkcnt--) { 1431 while (blkcnt--) {
1430 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1432 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1431 XFS_FSB_TO_DADDR(mp, bno), 1433 XFS_FSB_TO_DADDR(mp, bno),
1432 (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); 1434 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1433 if (error) 1435 if (error)
1434 break; 1436 break;
1435 1437
@@ -1439,7 +1441,7 @@ xfs_qm_dqiter_bufs(
1439 * goto the next block. 1441 * goto the next block.
1440 */ 1442 */
1441 bno++; 1443 bno++;
1442 firstid += XFS_QM_DQPERBLK(mp); 1444 firstid += mp->m_quotainfo->qi_dqperchunk;
1443 } 1445 }
1444 return error; 1446 return error;
1445} 1447}
@@ -1488,7 +1490,7 @@ xfs_qm_dqiterate(
1488 maxlblkcnt - lblkno, 1490 maxlblkcnt - lblkno,
1489 XFS_BMAPI_METADATA, 1491 XFS_BMAPI_METADATA,
1490 NULL, 1492 NULL,
1491 0, map, &nmaps, NULL, NULL); 1493 0, map, &nmaps, NULL);
1492 xfs_iunlock(qip, XFS_ILOCK_SHARED); 1494 xfs_iunlock(qip, XFS_ILOCK_SHARED);
1493 if (error) 1495 if (error)
1494 break; 1496 break;
@@ -1505,7 +1507,7 @@ xfs_qm_dqiterate(
1505 continue; 1507 continue;
1506 1508
1507 firstid = (xfs_dqid_t) map[i].br_startoff * 1509 firstid = (xfs_dqid_t) map[i].br_startoff *
1508 XFS_QM_DQPERBLK(mp); 1510 mp->m_quotainfo->qi_dqperchunk;
1509 /* 1511 /*
1510 * Do a read-ahead on the next extent. 1512 * Do a read-ahead on the next extent.
1511 */ 1513 */
@@ -1516,7 +1518,7 @@ xfs_qm_dqiterate(
1516 while (rablkcnt--) { 1518 while (rablkcnt--) {
1517 xfs_baread(mp->m_ddev_targp, 1519 xfs_baread(mp->m_ddev_targp,
1518 XFS_FSB_TO_DADDR(mp, rablkno), 1520 XFS_FSB_TO_DADDR(mp, rablkno),
1519 (int)XFS_QI_DQCHUNKLEN(mp)); 1521 mp->m_quotainfo->qi_dqchunklen);
1520 rablkno++; 1522 rablkno++;
1521 } 1523 }
1522 } 1524 }
@@ -1576,8 +1578,10 @@ xfs_qm_quotacheck_dqadjust(
1576 1578
1577 /* 1579 /*
1578 * Set default limits, adjust timers (since we changed usages) 1580 * Set default limits, adjust timers (since we changed usages)
1581 *
1582 * There are no timers for the default values set in the root dquot.
1579 */ 1583 */
1580 if (! XFS_IS_SUSER_DQUOT(dqp)) { 1584 if (dqp->q_core.d_id) {
1581 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1585 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
1582 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1586 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
1583 } 1587 }
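
The new test above open-codes the old XFS_IS_SUSER_DQUOT() check: id 0 is the root dquot, which stores the filesystem-wide default limits and, as the added comment says, has no timers, so limit and timer adjustment are skipped for it. In outline (abbreviated fields, hypothetical helpers):

#include <stdio.h>
#include <stdint.h>

struct dquot { uint32_t d_id; long btimer; };

static void adjust_timers(struct dquot *dq) { dq->btimer = 42; }

static void quotacheck_adjust(struct dquot *dq)
{
        /*
         * id 0 is the root dquot holding the default limits;
         * it has no timers to adjust.
         */
        if (dq->d_id)
                adjust_timers(dq);
}

int main(void)
{
        struct dquot root = { 0, 0 }, user = { 501, 0 };

        quotacheck_adjust(&root);
        quotacheck_adjust(&user);
        printf("root timer=%ld, user timer=%ld\n", root.btimer, user.btimer);
        return 0;
}
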
@@ -1621,10 +1625,7 @@ xfs_qm_dqusage_adjust(
1621 xfs_ino_t ino, /* inode number to get data for */ 1625 xfs_ino_t ino, /* inode number to get data for */
1622 void __user *buffer, /* not used */ 1626 void __user *buffer, /* not used */
1623 int ubsize, /* not used */ 1627 int ubsize, /* not used */
1624 void *private_data, /* not used */
1625 xfs_daddr_t bno, /* starting block of inode cluster */
1626 int *ubused, /* not used */ 1628 int *ubused, /* not used */
1627 void *dip, /* on-disk inode pointer (not used) */
1628 int *res) /* result code value */ 1629 int *res) /* result code value */
1629{ 1630{
1630 xfs_inode_t *ip; 1631 xfs_inode_t *ip;
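
The bulkstat callback loses three arguments here: private_data, the starting-block hint bno, and the on-disk inode pointer dip, leaving only what xfs_qm_dqusage_adjust() actually consumes. The underlying shape, an iterator driving a fixed-signature per-item callback that reports back through a result code, in a toy version (hypothetical names):

#include <stdio.h>

enum { RV_DIDONE, RV_NOTHING, RV_GIVEUP };

typedef int (*bulk_fn)(int ino, int *res);

/* Iterate inode numbers, stopping early if the callback gives up. */
static int bulkstat(int first, int last, bulk_fn fn)
{
        int ino, res, error;

        for (ino = first; ino <= last; ino++) {
                error = fn(ino, &res);
                if (error || res == RV_GIVEUP)
                        return error ? error : -1;
        }
        return 0;
}

static int usage_adjust(int ino, int *res)
{
        if (ino % 2 == 0) {             /* toy rule: only even inodes count */
                printf("adjusted usage for inode %d\n", ino);
                *res = RV_DIDONE;
        } else {
                *res = RV_NOTHING;      /* skipped; iteration continues */
        }
        return 0;
}

int main(void)
{
        return bulkstat(1, 6, usage_adjust);
}
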
@@ -1649,7 +1650,7 @@ xfs_qm_dqusage_adjust(
1649 * the case in all other instances. It's OK that we do this because 1650 * the case in all other instances. It's OK that we do this because
1650 * quotacheck is done only at mount time. 1651 * quotacheck is done only at mount time.
1651 */ 1652 */
1652 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) { 1653 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) {
1653 *res = BULKSTAT_RV_NOTHING; 1654 *res = BULKSTAT_RV_NOTHING;
1654 return error; 1655 return error;
1655 } 1656 }
@@ -1661,7 +1662,8 @@ xfs_qm_dqusage_adjust(
1661 * making us disable quotas for the file system. 1662 * making us disable quotas for the file system.
1662 */ 1663 */
1663 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { 1664 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1664 xfs_iput(ip, XFS_ILOCK_EXCL); 1665 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1666 IRELE(ip);
1665 *res = BULKSTAT_RV_GIVEUP; 1667 *res = BULKSTAT_RV_GIVEUP;
1666 return error; 1668 return error;
1667 } 1669 }
@@ -1674,7 +1676,8 @@ xfs_qm_dqusage_adjust(
1674 * Walk thru the extent list and count the realtime blocks. 1676 * Walk thru the extent list and count the realtime blocks.
1675 */ 1677 */
1676 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { 1678 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
1677 xfs_iput(ip, XFS_ILOCK_EXCL); 1679 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1680 IRELE(ip);
1678 if (udqp) 1681 if (udqp)
1679 xfs_qm_dqput(udqp); 1682 xfs_qm_dqput(udqp);
1680 if (gdqp) 1683 if (gdqp)
@@ -1747,14 +1750,14 @@ xfs_qm_quotacheck(
1747 lastino = 0; 1750 lastino = 0;
1748 flags = 0; 1751 flags = 0;
1749 1752
1750 ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); 1753 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1751 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1754 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1752 1755
1753 /* 1756 /*
1754 * There should be no cached dquots. The (simplistic) quotacheck 1757 * There should be no cached dquots. The (simplistic) quotacheck
1755 * algorithm doesn't like that. 1758 * algorithm doesn't like that.
1756 */ 1759 */
1757 ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); 1760 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1758 1761
1759 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1762 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
1760 1763
@@ -1763,15 +1766,19 @@ xfs_qm_quotacheck(
1763 * their counters to zero. We need a clean slate. 1766 * their counters to zero. We need a clean slate.
1764 * We don't log our changes till later. 1767 * We don't log our changes till later.
1765 */ 1768 */
1766 if ((uip = XFS_QI_UQIP(mp))) { 1769 uip = mp->m_quotainfo->qi_uquotaip;
1767 if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) 1770 if (uip) {
1771 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
1772 if (error)
1768 goto error_return; 1773 goto error_return;
1769 flags |= XFS_UQUOTA_CHKD; 1774 flags |= XFS_UQUOTA_CHKD;
1770 } 1775 }
1771 1776
1772 if ((gip = XFS_QI_GQIP(mp))) { 1777 gip = mp->m_quotainfo->qi_gquotaip;
1773 if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1778 if (gip) {
1774 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) 1779 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1780 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1781 if (error)
1775 goto error_return; 1782 goto error_return;
1776 flags |= XFS_OQUOTA_CHKD; 1783 flags |= XFS_OQUOTA_CHKD;
1777 } 1784 }
@@ -1781,12 +1788,13 @@ xfs_qm_quotacheck(
1781 * Iterate thru all the inodes in the file system, 1788 * Iterate thru all the inodes in the file system,
1782 * adjusting the corresponding dquot counters in core. 1789 * adjusting the corresponding dquot counters in core.
1783 */ 1790 */
1784 if ((error = xfs_bulkstat(mp, &lastino, &count, 1791 error = xfs_bulkstat(mp, &lastino, &count,
1785 xfs_qm_dqusage_adjust, NULL, 1792 xfs_qm_dqusage_adjust,
1786 structsz, NULL, BULKSTAT_FG_IGET, &done))) 1793 structsz, NULL, &done);
1794 if (error)
1787 break; 1795 break;
1788 1796
1789 } while (! done); 1797 } while (!done);
1790 1798
1791 /* 1799 /*
1792 * We've made all the changes that we need to make incore. 1800 * We've made all the changes that we need to make incore.
@@ -1804,7 +1812,7 @@ xfs_qm_quotacheck(
1804 * at this point (because we intentionally didn't in dqget_noattach). 1812 * at this point (because we intentionally didn't in dqget_noattach).
1805 */ 1813 */
1806 if (error) { 1814 if (error) {
1807 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); 1815 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
1808 goto error_return; 1816 goto error_return;
1809 } 1817 }
1810 1818
@@ -1825,7 +1833,7 @@ xfs_qm_quotacheck(
1825 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); 1833 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
1826 mp->m_qflags |= flags; 1834 mp->m_qflags |= flags;
1827 1835
1828 XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); 1836 xfs_qm_dquot_list_print(mp);
1829 1837
1830 error_return: 1838 error_return:
1831 if (error) { 1839 if (error) {
@@ -1874,14 +1882,14 @@ xfs_qm_init_quotainos(
1874 mp->m_sb.sb_uquotino != NULLFSINO) { 1882 mp->m_sb.sb_uquotino != NULLFSINO) {
1875 ASSERT(mp->m_sb.sb_uquotino > 0); 1883 ASSERT(mp->m_sb.sb_uquotino > 0);
1876 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 1884 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
1877 0, 0, &uip, 0))) 1885 0, 0, &uip)))
1878 return XFS_ERROR(error); 1886 return XFS_ERROR(error);
1879 } 1887 }
1880 if (XFS_IS_OQUOTA_ON(mp) && 1888 if (XFS_IS_OQUOTA_ON(mp) &&
1881 mp->m_sb.sb_gquotino != NULLFSINO) { 1889 mp->m_sb.sb_gquotino != NULLFSINO) {
1882 ASSERT(mp->m_sb.sb_gquotino > 0); 1890 ASSERT(mp->m_sb.sb_gquotino > 0);
1883 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1891 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1884 0, 0, &gip, 0))) { 1892 0, 0, &gip))) {
1885 if (uip) 1893 if (uip)
1886 IRELE(uip); 1894 IRELE(uip);
1887 return XFS_ERROR(error); 1895 return XFS_ERROR(error);
@@ -1920,59 +1928,53 @@ xfs_qm_init_quotainos(
1920 } 1928 }
1921 } 1929 }
1922 1930
1923 XFS_QI_UQIP(mp) = uip; 1931 mp->m_quotainfo->qi_uquotaip = uip;
1924 XFS_QI_GQIP(mp) = gip; 1932 mp->m_quotainfo->qi_gquotaip = gip;
1925 1933
1926 return 0; 1934 return 0;
1927} 1935}
1928 1936
1929 1937
1938
1930/* 1939/*
1931 * Traverse the freelist of dquots and attempt to reclaim a maximum of 1940 * Just pop the least recently used dquot off the freelist and
1932 * 'howmany' dquots. This operation races with dqlookup(), and attempts to 1941 * recycle it. The returned dquot is locked.
1933 * favor the lookup function ...
1934 * XXXsup merge this with qm_reclaim_one().
1935 */ 1942 */
1936STATIC int 1943STATIC xfs_dquot_t *
1937xfs_qm_shake_freelist( 1944xfs_qm_dqreclaim_one(void)
1938 int howmany)
1939{ 1945{
1940 int nreclaimed; 1946 xfs_dquot_t *dqpout;
1941 xfs_dqhash_t *hash; 1947 xfs_dquot_t *dqp;
1942 xfs_dquot_t *dqp, *nextdqp;
1943 int restarts; 1948 int restarts;
1944 int nflushes;
1945
1946 if (howmany <= 0)
1947 return 0;
1948 1949
1949 nreclaimed = 0;
1950 restarts = 0; 1950 restarts = 0;
1951 nflushes = 0; 1951 dqpout = NULL;
1952 1952
1953#ifdef QUOTADEBUG 1953 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954 cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); 1954startagain:
1955#endif 1955 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 /* lock order is : hashchainlock, freelistlock, mplistlock */
1957 tryagain:
1958 xfs_qm_freelist_lock(xfs_Gqm);
1959 1956
1960 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 1957 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
1961 ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && 1958 struct xfs_mount *mp = dqp->q_mount;
1962 nreclaimed < howmany); ) {
1963 xfs_dqlock(dqp); 1959 xfs_dqlock(dqp);
1964 1960
1965 /* 1961 /*
1966 * We are racing with dqlookup here. Naturally we don't 1962 * We are racing with dqlookup here. Naturally we don't
1967 * want to reclaim a dquot that lookup wants. 1963 * want to reclaim a dquot that lookup wants. We release the
1964 * freelist lock and start over, so that lookup will grab
1965 * both the dquot and the freelistlock.
1968 */ 1966 */
1969 if (dqp->dq_flags & XFS_DQ_WANT) { 1967 if (dqp->dq_flags & XFS_DQ_WANT) {
1968 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1969
1970 trace_xfs_dqreclaim_want(dqp);
1971
1970 xfs_dqunlock(dqp); 1972 xfs_dqunlock(dqp);
1971 xfs_qm_freelist_unlock(xfs_Gqm); 1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1972 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1974 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1973 return nreclaimed; 1975 return NULL;
1974 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1976 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1975 goto tryagain; 1977 goto startagain;
1976 } 1978 }
1977 1979
1978 /* 1980 /*
@@ -1981,23 +1983,27 @@ xfs_qm_shake_freelist(
1981 * life easier. 1983 * life easier.
1982 */ 1984 */
1983 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 1985 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
1984 ASSERT(dqp->q_mount == NULL); 1986 ASSERT(mp == NULL);
1985 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 1987 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
1986 ASSERT(dqp->HL_PREVP == NULL); 1988 ASSERT(list_empty(&dqp->q_hashlist));
1987 ASSERT(dqp->MPL_PREVP == NULL); 1989 ASSERT(list_empty(&dqp->q_mplist));
1990 list_del_init(&dqp->q_freelist);
1991 xfs_Gqm->qm_dqfrlist_cnt--;
1992 xfs_dqunlock(dqp);
1993 dqpout = dqp;
1988 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1994 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1989 nextdqp = dqp->dq_flnext; 1995 break;
1990 goto off_freelist;
1991 } 1996 }
1992 1997
1993 ASSERT(dqp->MPL_PREVP); 1998 ASSERT(dqp->q_hash);
1999 ASSERT(!list_empty(&dqp->q_mplist));
2000
1994 /* 2001 /*
1995 * Try to grab the flush lock. If this dquot is in the process of 2002 * Try to grab the flush lock. If this dquot is in the process of
1996 * getting flushed to disk, we don't want to reclaim it. 2003 * getting flushed to disk, we don't want to reclaim it.
1997 */ 2004 */
1998 if (!xfs_dqflock_nowait(dqp)) { 2005 if (!xfs_dqflock_nowait(dqp)) {
1999 xfs_dqunlock(dqp); 2006 xfs_dqunlock(dqp);
2000 dqp = dqp->dq_flnext;
2001 continue; 2007 continue;
2002 } 2008 }
2003 2009
@@ -2010,21 +2016,21 @@ xfs_qm_shake_freelist(
2010 if (XFS_DQ_IS_DIRTY(dqp)) { 2016 if (XFS_DQ_IS_DIRTY(dqp)) {
2011 int error; 2017 int error;
2012 2018
2013 trace_xfs_dqshake_dirty(dqp); 2019 trace_xfs_dqreclaim_dirty(dqp);
2014 2020
2015 /* 2021 /*
2016 * We flush it delayed write, so don't bother 2022 * We flush it delayed write, so don't bother
2017 * releasing the mplock. 2023 * releasing the freelist lock.
2018 */ 2024 */
2019 error = xfs_qm_dqflush(dqp, 0); 2025 error = xfs_qm_dqflush(dqp, 0);
2020 if (error) { 2026 if (error) {
2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2027 xfs_fs_cmn_err(CE_WARN, mp,
2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2028 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2023 } 2029 }
2024 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 2030 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2025 dqp = dqp->dq_flnext;
2026 continue; 2031 continue;
2027 } 2032 }
2033
2028 /* 2034 /*
2029 * We're trying to get the hashlock out of order. This races 2035 * We're trying to get the hashlock out of order. This races
2030 * with dqlookup; so, we give up and go to the next dquot if 2036 * with dqlookup; so, we give up and go to the next dquot if
@@ -2033,62 +2039,83 @@ xfs_qm_shake_freelist(
2033 * waiting for the freelist lock. 2039 * waiting for the freelist lock.
2034 */ 2040 */
2035 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 2041 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
2036 xfs_dqfunlock(dqp); 2042 restarts++;
2037 xfs_dqunlock(dqp); 2043 goto dqfunlock;
2038 dqp = dqp->dq_flnext;
2039 continue;
2040 } 2044 }
2045
2041 /* 2046 /*
2042 * This races with dquot allocation code as well as dqflush_all 2047 * This races with dquot allocation code as well as dqflush_all
2043 * and reclaim code. So, if we failed to grab the mplist lock, 2048 * and reclaim code. So, if we failed to grab the mplist lock,
2044 * give up everything and start over. 2049 * give up everything and start over.
2045 */ 2050 */
2046 hash = dqp->q_hash; 2051 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2047 ASSERT(hash); 2052 restarts++;
2048 if (! xfs_qm_mplist_nowait(dqp->q_mount)) { 2053 mutex_unlock(&dqp->q_hash->qh_lock);
2049 /* XXX put a sentinel so that we can come back here */
2050 xfs_dqfunlock(dqp); 2054 xfs_dqfunlock(dqp);
2051 xfs_dqunlock(dqp); 2055 xfs_dqunlock(dqp);
2052 mutex_unlock(&hash->qh_lock); 2056 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2053 xfs_qm_freelist_unlock(xfs_Gqm); 2057 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2054 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2058 return NULL;
2055 return nreclaimed; 2059 goto startagain;
2056 goto tryagain;
2057 } 2060 }
2058 2061
2059 trace_xfs_dqshake_unlink(dqp);
2060
2061#ifdef QUOTADEBUG
2062 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2063 dqp, be32_to_cpu(dqp->q_core.d_id));
2064#endif
2065 ASSERT(dqp->q_nrefs == 0); 2062 ASSERT(dqp->q_nrefs == 0);
2066 nextdqp = dqp->dq_flnext; 2063 list_del_init(&dqp->q_mplist);
2067 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); 2064 mp->m_quotainfo->qi_dquots--;
2068 XQM_HASHLIST_REMOVE(hash, dqp); 2065 mp->m_quotainfo->qi_dqreclaims++;
2066 list_del_init(&dqp->q_hashlist);
2067 dqp->q_hash->qh_version++;
2068 list_del_init(&dqp->q_freelist);
2069 xfs_Gqm->qm_dqfrlist_cnt--;
2070 dqpout = dqp;
2071 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
2072 mutex_unlock(&dqp->q_hash->qh_lock);
2073dqfunlock:
2069 xfs_dqfunlock(dqp); 2074 xfs_dqfunlock(dqp);
2070 xfs_qm_mplist_unlock(dqp->q_mount);
2071 mutex_unlock(&hash->qh_lock);
2072
2073 off_freelist:
2074 XQM_FREELIST_REMOVE(dqp);
2075 xfs_dqunlock(dqp); 2075 xfs_dqunlock(dqp);
2076 nreclaimed++; 2076 if (dqpout)
2077 XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); 2077 break;
2078 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2079 return NULL;
2080 }
2081 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2082 return dqpout;
2083}
2084
2085/*
2086 * Traverse the freelist of dquots and attempt to reclaim a maximum of
2087 * 'howmany' dquots. This operation races with dqlookup(), and attempts to
2088 * favor the lookup function ...
2089 */
2090STATIC int
2091xfs_qm_shake_freelist(
2092 int howmany)
2093{
2094 int nreclaimed = 0;
2095 xfs_dquot_t *dqp;
2096
2097 if (howmany <= 0)
2098 return 0;
2099
2100 while (nreclaimed < howmany) {
2101 dqp = xfs_qm_dqreclaim_one();
2102 if (!dqp)
2103 return nreclaimed;
2078 xfs_qm_dqdestroy(dqp); 2104 xfs_qm_dqdestroy(dqp);
2079 dqp = nextdqp; 2105 nreclaimed++;
2080 } 2106 }
2081 xfs_qm_freelist_unlock(xfs_Gqm);
2082 return nreclaimed; 2107 return nreclaimed;
2083} 2108}
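
After the restructuring, xfs_qm_shake_freelist() reduces to the loop above: repeatedly pop one reclaimable dquot via xfs_qm_dqreclaim_one() and destroy it, rather than duplicating the whole freelist walk. The division of labor in outline (stub types, user-space stand-ins):

#include <stdio.h>
#include <stdlib.h>

struct dquot { int id; };

static int remaining = 3;       /* stand-in for the freelist */

/* Pop one reclaimable entry, or NULL when nothing can be reclaimed. */
static struct dquot *dqreclaim_one(void)
{
        struct dquot *dq;

        if (remaining == 0)
                return NULL;
        dq = malloc(sizeof(*dq));
        if (!dq)
                return NULL;
        dq->id = remaining--;
        return dq;
}

static int shake_freelist(int howmany)
{
        int nreclaimed = 0;

        if (howmany <= 0)
                return 0;
        while (nreclaimed < howmany) {
                struct dquot *dq = dqreclaim_one();

                if (!dq)
                        break;
                free(dq);       /* plays the role of xfs_qm_dqdestroy() */
                nreclaimed++;
        }
        return nreclaimed;
}

int main(void)
{
        printf("reclaimed %d\n", shake_freelist(8));
        return 0;
}
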
2084 2109
2085
2086/* 2110/*
2087 * The kmem_shake interface is invoked when memory is running low. 2111 * The kmem_shake interface is invoked when memory is running low.
2088 */ 2112 */
2089/* ARGSUSED */ 2113/* ARGSUSED */
2090STATIC int 2114STATIC int
2091xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) 2115xfs_qm_shake(
2116 struct shrinker *shrink,
2117 int nr_to_scan,
2118 gfp_t gfp_mask)
2092{ 2119{
2093 int ndqused, nfree, n; 2120 int ndqused, nfree, n;
2094 2121
@@ -2097,7 +2124,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2097 if (!xfs_Gqm) 2124 if (!xfs_Gqm)
2098 return 0; 2125 return 0;
2099 2126
2100 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ 2127 nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
2101 /* incore dquots in all f/s's */ 2128 /* incore dquots in all f/s's */
2102 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; 2129 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
2103 2130
@@ -2113,131 +2140,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2113} 2140}
2114 2141
2115 2142
2116/*
2117 * Just pop the least recently used dquot off the freelist and
2118 * recycle it. The returned dquot is locked.
2119 */
2120STATIC xfs_dquot_t *
2121xfs_qm_dqreclaim_one(void)
2122{
2123 xfs_dquot_t *dqpout;
2124 xfs_dquot_t *dqp;
2125 int restarts;
2126 int nflushes;
2127
2128 restarts = 0;
2129 dqpout = NULL;
2130 nflushes = 0;
2131
2132 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
2133 startagain:
2134 xfs_qm_freelist_lock(xfs_Gqm);
2135
2136 FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
2137 xfs_dqlock(dqp);
2138
2139 /*
2140 * We are racing with dqlookup here. Naturally we don't
2141 * want to reclaim a dquot that lookup wants. We release the
2142 * freelist lock and start over, so that lookup will grab
2143 * both the dquot and the freelistlock.
2144 */
2145 if (dqp->dq_flags & XFS_DQ_WANT) {
2146 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2147
2148 trace_xfs_dqreclaim_want(dqp);
2149
2150 xfs_dqunlock(dqp);
2151 xfs_qm_freelist_unlock(xfs_Gqm);
2152 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2153 return NULL;
2154 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2155 goto startagain;
2156 }
2157
2158 /*
2159 * If the dquot is inactive, we are assured that it is
2160 * not on the mplist or the hashlist, and that makes our
2161 * life easier.
2162 */
2163 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2164 ASSERT(dqp->q_mount == NULL);
2165 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2166 ASSERT(dqp->HL_PREVP == NULL);
2167 ASSERT(dqp->MPL_PREVP == NULL);
2168 XQM_FREELIST_REMOVE(dqp);
2169 xfs_dqunlock(dqp);
2170 dqpout = dqp;
2171 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2172 break;
2173 }
2174
2175 ASSERT(dqp->q_hash);
2176 ASSERT(dqp->MPL_PREVP);
2177
2178 /*
2179 * Try to grab the flush lock. If this dquot is in the process of
2180 * getting flushed to disk, we don't want to reclaim it.
2181 */
2182 if (!xfs_dqflock_nowait(dqp)) {
2183 xfs_dqunlock(dqp);
2184 continue;
2185 }
2186
2187 /*
2188 * We have the flush lock so we know that this is not in the
2189 * process of being flushed. So, if this is dirty, flush it
2190 * DELWRI so that we don't get a freelist infested with
2191 * dirty dquots.
2192 */
2193 if (XFS_DQ_IS_DIRTY(dqp)) {
2194 int error;
2195
2196 trace_xfs_dqreclaim_dirty(dqp);
2197
2198 /*
2199 * We flush it delayed write, so don't bother
2200 * releasing the freelist lock.
2201 */
2202 error = xfs_qm_dqflush(dqp, 0);
2203 if (error) {
2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2206 }
2207 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2208 continue;
2209 }
2210
2211 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2212 xfs_dqfunlock(dqp);
2213 xfs_dqunlock(dqp);
2214 continue;
2215 }
2216
2217 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2218 goto mplistunlock;
2219
2220 trace_xfs_dqreclaim_unlink(dqp);
2221
2222 ASSERT(dqp->q_nrefs == 0);
2223 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2224 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2225 XQM_FREELIST_REMOVE(dqp);
2226 dqpout = dqp;
2227 mutex_unlock(&dqp->q_hash->qh_lock);
2228 mplistunlock:
2229 xfs_qm_mplist_unlock(dqp->q_mount);
2230 xfs_dqfunlock(dqp);
2231 xfs_dqunlock(dqp);
2232 if (dqpout)
2233 break;
2234 }
2235
2236 xfs_qm_freelist_unlock(xfs_Gqm);
2237 return dqpout;
2238}
2239
2240
2241/*------------------------------------------------------------------*/ 2143/*------------------------------------------------------------------*/
2242 2144
2243/* 2145/*
@@ -2662,66 +2564,3 @@ xfs_qm_vop_create_dqattach(
2662 } 2564 }
2663} 2565}
2664 2566
2665/* ------------- list stuff -----------------*/
2666STATIC void
2667xfs_qm_freelist_init(xfs_frlist_t *ql)
2668{
2669 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2670 mutex_init(&ql->qh_lock);
2671 ql->qh_version = 0;
2672 ql->qh_nelems = 0;
2673}
2674
2675STATIC void
2676xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2677{
2678 xfs_dquot_t *dqp, *nextdqp;
2679
2680 mutex_lock(&ql->qh_lock);
2681 for (dqp = ql->qh_next;
2682 dqp != (xfs_dquot_t *)ql; ) {
2683 xfs_dqlock(dqp);
2684 nextdqp = dqp->dq_flnext;
2685#ifdef QUOTADEBUG
2686 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2687#endif
2688 XQM_FREELIST_REMOVE(dqp);
2689 xfs_dqunlock(dqp);
2690 xfs_qm_dqdestroy(dqp);
2691 dqp = nextdqp;
2692 }
2693 mutex_unlock(&ql->qh_lock);
2694 mutex_destroy(&ql->qh_lock);
2695
2696 ASSERT(ql->qh_nelems == 0);
2697}
2698
2699STATIC void
2700xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2701{
2702 dq->dq_flnext = ql->qh_next;
2703 dq->dq_flprev = (xfs_dquot_t *)ql;
2704 ql->qh_next = dq;
2705 dq->dq_flnext->dq_flprev = dq;
2706 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2707 xfs_Gqm->qm_dqfreelist.qh_version++;
2708}
2709
2710void
2711xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2712{
2713 xfs_dquot_t *next = dq->dq_flnext;
2714 xfs_dquot_t *prev = dq->dq_flprev;
2715
2716 next->dq_flprev = prev;
2717 prev->dq_flnext = next;
2718 dq->dq_flnext = dq->dq_flprev = dq;
2719 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2720 xfs_Gqm->qm_dqfreelist.qh_version++;
2721}
2722
2723void
2724xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2725{
2726 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2727}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 73
74typedef xfs_dqhash_t xfs_dqlist_t; 74typedef xfs_dqhash_t xfs_dqlist_t;
75/*
76 * The freelist head. The first two fields match the first two in the
77 * xfs_dquot_t structure (in xfs_dqmarker_t)
78 */
79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev;
82 struct mutex qh_lock;
83 uint qh_version;
84 uint qh_nelems;
85} xfs_frlist_t;
86 75
87/* 76/*
88 * Quota Manager (global) structure. Lives only in core. 77 * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
91 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ 80 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
92 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ 81 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
93 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ 82 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
94 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ 83 struct list_head qm_dqfrlist; /* freelist of dquots */
84 struct mutex qm_dqfrlist_lock;
85 int qm_dqfrlist_cnt;
95 atomic_t qm_totaldquots; /* total incore dquots */ 86 atomic_t qm_totaldquots; /* total incore dquots */
96 uint qm_nrefs; /* file systems with quota on */ 87 uint qm_nrefs; /* file systems with quota on */
97 int qm_dqfree_ratio;/* ratio of free to inuse dquots */ 88 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 97typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 98 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 99 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 100 struct list_head qi_dqlist; /* all dquots in filesys */
101 struct mutex qi_dqlist_lock;
102 int qi_dquots;
110 int qi_dqreclaims; /* a change here indicates 103 int qi_dqreclaims; /* a change here indicates
111 a removal in the dqlist */ 104 a removal in the dqlist */
112 time_t qi_btimelimit; /* limit for blks timer */ 105 time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 168extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 169extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
177 170
178/* list stuff */
179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
181
182#ifdef DEBUG 171#ifdef DEBUG
183extern int xfs_qm_internalqcheck(xfs_mount_t *); 172extern int xfs_qm_internalqcheck(xfs_mount_t *);
184#else 173#else
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 97b410c12794..bea02d786c5d 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 32#include "xfs_bmap.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_qm.h" 37#include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..8671a0b32644 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_bmap.h" 32#include "xfs_bmap.h"
41#include "xfs_btree.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_qm.h" 37#include "xfs_qm.h"
@@ -55,7 +45,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
55 ndquot, 45 ndquot,
56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 46 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 47 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 48 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
59 return 0; 49 return 0;
60} 50}
61 51
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 50bee07d6b0e..45e5849df238 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,25 +26,15 @@
26#include "xfs_trans.h" 26#include "xfs_trans.h"
27#include "xfs_sb.h" 27#include "xfs_sb.h"
28#include "xfs_ag.h" 28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_alloc.h" 29#include "xfs_alloc.h"
31#include "xfs_dmapi.h"
32#include "xfs_quota.h" 30#include "xfs_quota.h"
33#include "xfs_mount.h" 31#include "xfs_mount.h"
34#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
35#include "xfs_alloc_btree.h"
36#include "xfs_ialloc_btree.h"
37#include "xfs_dir2_sf.h"
38#include "xfs_attr_sf.h"
39#include "xfs_dinode.h"
40#include "xfs_inode.h" 33#include "xfs_inode.h"
41#include "xfs_ialloc.h"
42#include "xfs_itable.h" 34#include "xfs_itable.h"
43#include "xfs_bmap.h" 35#include "xfs_bmap.h"
44#include "xfs_btree.h"
45#include "xfs_rtalloc.h" 36#include "xfs_rtalloc.h"
46#include "xfs_error.h" 37#include "xfs_error.h"
47#include "xfs_rw.h"
48#include "xfs_attr.h" 38#include "xfs_attr.h"
49#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
50#include "xfs_utils.h" 40#include "xfs_utils.h"
@@ -79,6 +69,7 @@ xfs_qm_scall_quotaoff(
79 xfs_mount_t *mp, 69 xfs_mount_t *mp,
80 uint flags) 70 uint flags)
81{ 71{
72 struct xfs_quotainfo *q = mp->m_quotainfo;
82 uint dqtype; 73 uint dqtype;
83 int error; 74 int error;
84 uint inactivate_flags; 75 uint inactivate_flags;
@@ -102,11 +93,8 @@ xfs_qm_scall_quotaoff(
102 * critical thing. 93 * critical thing.
103 * If quotaoff, then we must be dealing with the root filesystem. 94 * If quotaoff, then we must be dealing with the root filesystem.
104 */ 95 */
105 ASSERT(mp->m_quotainfo); 96 ASSERT(q);
106 if (mp->m_quotainfo) 97 mutex_lock(&q->qi_quotaofflock);
107 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
108
109 ASSERT(mp->m_quotainfo);
110 98
111 /* 99 /*
112 * If we're just turning off quota enforcement, change mp and go. 100 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +105,7 @@ xfs_qm_scall_quotaoff(
117 spin_lock(&mp->m_sb_lock); 105 spin_lock(&mp->m_sb_lock);
118 mp->m_sb.sb_qflags = mp->m_qflags; 106 mp->m_sb.sb_qflags = mp->m_qflags;
119 spin_unlock(&mp->m_sb_lock); 107 spin_unlock(&mp->m_sb_lock);
120 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 108 mutex_unlock(&q->qi_quotaofflock);
121 109
122 /* XXX what to do if error? Revert back to old vals incore? */ 110 /* XXX what to do if error? Revert back to old vals incore? */
123 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 111 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +138,8 @@ xfs_qm_scall_quotaoff(
150 * Nothing to do? Don't complain. This happens when we're just 138 * Nothing to do? Don't complain. This happens when we're just
151 * turning off quota enforcement. 139 * turning off quota enforcement.
152 */ 140 */
153 if ((mp->m_qflags & flags) == 0) { 141 if ((mp->m_qflags & flags) == 0)
154 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 142 goto out_unlock;
155 return (0);
156 }
157 143
158 /* 144 /*
159 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 145 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +148,7 @@ xfs_qm_scall_quotaoff(
162 */ 148 */
163 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); 149 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
164 if (error) 150 if (error)
165 goto out_error; 151 goto out_unlock;
166 152
167 /* 153 /*
168 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 154 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +190,7 @@ xfs_qm_scall_quotaoff(
204 * So, if we couldn't purge all the dquots from the filesystem, 190 * So, if we couldn't purge all the dquots from the filesystem,
205 * we can't get rid of the incore data structures. 191 * we can't get rid of the incore data structures.
206 */ 192 */
207 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) 193 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
208 delay(10 * nculprits); 194 delay(10 * nculprits);
209 195
210 /* 196 /*
@@ -222,7 +208,7 @@ xfs_qm_scall_quotaoff(
222 if (error) { 208 if (error) {
223 /* We're screwed now. Shutdown is the only option. */ 209 /* We're screwed now. Shutdown is the only option. */
224 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 210 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
225 goto out_error; 211 goto out_unlock;
226 } 212 }
227 213
228 /* 214 /*
@@ -230,27 +216,74 @@ xfs_qm_scall_quotaoff(
230 */ 216 */
231 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 217 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
232 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { 218 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
233 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 219 mutex_unlock(&q->qi_quotaofflock);
234 xfs_qm_destroy_quotainfo(mp); 220 xfs_qm_destroy_quotainfo(mp);
235 return (0); 221 return (0);
236 } 222 }
237 223
238 /* 224 /*
239 * Release our quotainode references, and vn_purge them, 225 * Release our quotainode references if we don't need them anymore.
240 * if we don't need them anymore.
241 */ 226 */
242 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { 227 if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
243 IRELE(XFS_QI_UQIP(mp)); 228 IRELE(q->qi_uquotaip);
244 XFS_QI_UQIP(mp) = NULL; 229 q->qi_uquotaip = NULL;
245 } 230 }
246 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { 231 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
247 IRELE(XFS_QI_GQIP(mp)); 232 IRELE(q->qi_gquotaip);
248 XFS_QI_GQIP(mp) = NULL; 233 q->qi_gquotaip = NULL;
249 } 234 }
250out_error:
251 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
252 235
253 return (error); 236out_unlock:
237 mutex_unlock(&q->qi_quotaofflock);
238 return error;
239}
240
241STATIC int
242xfs_qm_scall_trunc_qfile(
243 struct xfs_mount *mp,
244 xfs_ino_t ino)
245{
246 struct xfs_inode *ip;
247 struct xfs_trans *tp;
248 int error;
249
250 if (ino == NULLFSINO)
251 return 0;
252
253 error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
254 if (error)
255 return error;
256
257 xfs_ilock(ip, XFS_IOLOCK_EXCL);
258
259 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
260 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
261 XFS_TRANS_PERM_LOG_RES,
262 XFS_ITRUNCATE_LOG_COUNT);
263 if (error) {
264 xfs_trans_cancel(tp, 0);
265 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
266 goto out_put;
267 }
268
269 xfs_ilock(ip, XFS_ILOCK_EXCL);
270 xfs_trans_ijoin(tp, ip);
271
272 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1);
273 if (error) {
274 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
275 XFS_TRANS_ABORT);
276 goto out_unlock;
277 }
278
279 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
281
282out_unlock:
283 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
284out_put:
285 IRELE(ip);
286 return error;
254} 287}
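
xfs_qm_scall_trunc_qfile() is new in this patch, and its tail is the canonical kernel unwind: each failure point jumps to the label that releases exactly what is still held, in reverse acquisition order. The same shape in isolation (hypothetical resources in place of the inode and transaction):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int do_truncate(void *ip, void *tp) { (void)ip; (void)tp; return 0; }

static int trunc_qfile(int fail_at)
{
        void *ip, *tp;
        int error = 0;

        ip = malloc(16);                        /* plays xfs_iget() */
        if (!ip)
                return ENOMEM;

        tp = fail_at == 1 ? NULL : malloc(16);  /* plays trans alloc+reserve */
        if (!tp) {
                error = ENOMEM;
                goto out_put;                   /* only the inode to release */
        }

        error = do_truncate(ip, tp);
        if (error)
                goto out_unlock;                /* transaction, then inode */

        /* commit... then fall through the full cleanup */
out_unlock:
        free(tp);
out_put:
        free(ip);
        return error;
}

int main(void)
{
        printf("ok path: %d, alloc-fail path: %d\n",
               trunc_qfile(0), trunc_qfile(1));
        return 0;
}
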
255 288
256int 289int
@@ -259,34 +292,20 @@ xfs_qm_scall_trunc_qfiles(
259 uint flags) 292 uint flags)
260{ 293{
261 int error = 0, error2 = 0; 294 int error = 0, error2 = 0;
262 xfs_inode_t *qip;
263 295
264 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
265 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
266 return XFS_ERROR(EINVAL); 298 return XFS_ERROR(EINVAL);
267 } 299 }
268 300
269 if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { 301 if (flags & XFS_DQ_USER)
270 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); 302 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
271 if (!error) { 303 if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
272 error = xfs_truncate_file(mp, qip); 304 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
273 IRELE(qip);
274 }
275 }
276
277 if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
278 mp->m_sb.sb_gquotino != NULLFSINO) {
279 error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
280 if (!error2) {
281 error2 = xfs_truncate_file(mp, qip);
282 IRELE(qip);
283 }
284 }
285 305
286 return error ? error : error2; 306 return error ? error : error2;
287} 307}
288 308
289
290/* 309/*
291 * Switch on (a given) quota enforcement for a filesystem. This takes 310 * Switch on (a given) quota enforcement for a filesystem. This takes
292 * effect immediately. 311 * effect immediately.
@@ -379,9 +398,9 @@ xfs_qm_scall_quotaon(
379 /* 398 /*
380 * Switch on quota enforcement in core. 399 * Switch on quota enforcement in core.
381 */ 400 */
382 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 401 mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
383 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 402 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
384 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 403 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
385 404
386 return (0); 405 return (0);
387} 406}
@@ -392,11 +411,12 @@ xfs_qm_scall_quotaon(
392 */ 411 */
393int 412int
394xfs_qm_scall_getqstat( 413xfs_qm_scall_getqstat(
395 xfs_mount_t *mp, 414 struct xfs_mount *mp,
396 fs_quota_stat_t *out) 415 struct fs_quota_stat *out)
397{ 416{
398 xfs_inode_t *uip, *gip; 417 struct xfs_quotainfo *q = mp->m_quotainfo;
399 boolean_t tempuqip, tempgqip; 418 struct xfs_inode *uip, *gip;
419 boolean_t tempuqip, tempgqip;
400 420
401 uip = gip = NULL; 421 uip = gip = NULL;
402 tempuqip = tempgqip = B_FALSE; 422 tempuqip = tempgqip = B_FALSE;
@@ -415,18 +435,18 @@ xfs_qm_scall_getqstat(
415 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 435 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
416 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 436 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
417 437
418 if (mp->m_quotainfo) { 438 if (q) {
419 uip = mp->m_quotainfo->qi_uquotaip; 439 uip = q->qi_uquotaip;
420 gip = mp->m_quotainfo->qi_gquotaip; 440 gip = q->qi_gquotaip;
421 } 441 }
422 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 442 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
423 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 443 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
424 0, 0, &uip, 0) == 0) 444 0, 0, &uip) == 0)
425 tempuqip = B_TRUE; 445 tempuqip = B_TRUE;
426 } 446 }
427 if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { 447 if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
428 if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 448 if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
429 0, 0, &gip, 0) == 0) 449 0, 0, &gip) == 0)
430 tempgqip = B_TRUE; 450 tempgqip = B_TRUE;
431 } 451 }
432 if (uip) { 452 if (uip) {
@@ -441,17 +461,20 @@ xfs_qm_scall_getqstat(
441 if (tempgqip) 461 if (tempgqip)
442 IRELE(gip); 462 IRELE(gip);
443 } 463 }
444 if (mp->m_quotainfo) { 464 if (q) {
445 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 465 out->qs_incoredqs = q->qi_dquots;
446 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); 466 out->qs_btimelimit = q->qi_btimelimit;
447 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); 467 out->qs_itimelimit = q->qi_itimelimit;
448 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); 468 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
449 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); 469 out->qs_bwarnlimit = q->qi_bwarnlimit;
450 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); 470 out->qs_iwarnlimit = q->qi_iwarnlimit;
451 } 471 }
452 return (0); 472 return 0;
453} 473}
454 474
475#define XFS_DQ_MASK \
476 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
477
455/* 478/*
456 * Adjust quota limits, and start/stop timers accordingly. 479 * Adjust quota limits, and start/stop timers accordingly.
457 */ 480 */
@@ -462,15 +485,17 @@ xfs_qm_scall_setqlim(
462 uint type, 485 uint type,
463 fs_disk_quota_t *newlim) 486 fs_disk_quota_t *newlim)
464{ 487{
488 struct xfs_quotainfo *q = mp->m_quotainfo;
465 xfs_disk_dquot_t *ddq; 489 xfs_disk_dquot_t *ddq;
466 xfs_dquot_t *dqp; 490 xfs_dquot_t *dqp;
467 xfs_trans_t *tp; 491 xfs_trans_t *tp;
468 int error; 492 int error;
469 xfs_qcnt_t hard, soft; 493 xfs_qcnt_t hard, soft;
470 494
471 if ((newlim->d_fieldmask & 495 if (newlim->d_fieldmask & ~XFS_DQ_MASK)
472 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 496 return EINVAL;
473 return (0); 497 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
498 return 0;
474 499
475 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 500 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
476 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, 501 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
@@ -485,7 +510,7 @@ xfs_qm_scall_setqlim(
485 * a quotaoff from happening). (XXXThis doesn't currently happen 510 * a quotaoff from happening). (XXXThis doesn't currently happen
486 * because we take the vfslock before calling xfs_qm_sysent). 511 * because we take the vfslock before calling xfs_qm_sysent).
487 */ 512 */
488 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 513 mutex_lock(&q->qi_quotaofflock);
489 514
490 /* 515 /*
491 * Get the dquot (locked), and join it to the transaction. 516 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +518,8 @@ xfs_qm_scall_setqlim(
493 */ 518 */
494 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 519 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
495 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 520 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
496 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
497 ASSERT(error != ENOENT); 521 ASSERT(error != ENOENT);
498 return (error); 522 goto out_unlock;
499 } 523 }
500 xfs_trans_dqjoin(tp, dqp); 524 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 525 ddq = &dqp->q_core;
@@ -513,8 +537,8 @@ xfs_qm_scall_setqlim(
513 ddq->d_blk_hardlimit = cpu_to_be64(hard); 537 ddq->d_blk_hardlimit = cpu_to_be64(hard);
514 ddq->d_blk_softlimit = cpu_to_be64(soft); 538 ddq->d_blk_softlimit = cpu_to_be64(soft);
515 if (id == 0) { 539 if (id == 0) {
516 mp->m_quotainfo->qi_bhardlimit = hard; 540 q->qi_bhardlimit = hard;
517 mp->m_quotainfo->qi_bsoftlimit = soft; 541 q->qi_bsoftlimit = soft;
518 } 542 }
519 } else { 543 } else {
520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +553,8 @@ xfs_qm_scall_setqlim(
529 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 553 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
530 ddq->d_rtb_softlimit = cpu_to_be64(soft); 554 ddq->d_rtb_softlimit = cpu_to_be64(soft);
531 if (id == 0) { 555 if (id == 0) {
532 mp->m_quotainfo->qi_rtbhardlimit = hard; 556 q->qi_rtbhardlimit = hard;
533 mp->m_quotainfo->qi_rtbsoftlimit = soft; 557 q->qi_rtbsoftlimit = soft;
534 } 558 }
535 } else { 559 } else {
536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +570,8 @@ xfs_qm_scall_setqlim(
546 ddq->d_ino_hardlimit = cpu_to_be64(hard); 570 ddq->d_ino_hardlimit = cpu_to_be64(hard);
547 ddq->d_ino_softlimit = cpu_to_be64(soft); 571 ddq->d_ino_softlimit = cpu_to_be64(soft);
548 if (id == 0) { 572 if (id == 0) {
549 mp->m_quotainfo->qi_ihardlimit = hard; 573 q->qi_ihardlimit = hard;
550 mp->m_quotainfo->qi_isoftlimit = soft; 574 q->qi_isoftlimit = soft;
551 } 575 }
552 } else { 576 } else {
553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +596,23 @@ xfs_qm_scall_setqlim(
572 * for warnings. 596 * for warnings.
573 */ 597 */
574 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 598 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
575 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; 599 q->qi_btimelimit = newlim->d_btimer;
576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 600 ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
577 } 601 }
578 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 602 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
579 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; 603 q->qi_itimelimit = newlim->d_itimer;
580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 604 ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
581 } 605 }
582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 606 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
583 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; 607 q->qi_rtbtimelimit = newlim->d_rtbtimer;
584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 608 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
585 } 609 }
586 if (newlim->d_fieldmask & FS_DQ_BWARNS) 610 if (newlim->d_fieldmask & FS_DQ_BWARNS)
587 mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; 611 q->qi_bwarnlimit = newlim->d_bwarns;
588 if (newlim->d_fieldmask & FS_DQ_IWARNS) 612 if (newlim->d_fieldmask & FS_DQ_IWARNS)
589 mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; 613 q->qi_iwarnlimit = newlim->d_iwarns;
590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 614 if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
591 mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; 615 q->qi_rtbwarnlimit = newlim->d_rtbwarns;
592 } else { 616 } else {
593 /* 617 /*
594 * If the user is now over quota, start the timelimit. 618 * If the user is now over quota, start the timelimit.
@@ -605,8 +629,9 @@ xfs_qm_scall_setqlim(
605 error = xfs_trans_commit(tp, 0); 629 error = xfs_trans_commit(tp, 0);
606 xfs_qm_dqprint(dqp); 630 xfs_qm_dqprint(dqp);
607 xfs_qm_dqrele(dqp); 631 xfs_qm_dqrele(dqp);
608 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
609 632
633 out_unlock:
634 mutex_unlock(&q->qi_quotaofflock);
610 return error; 635 return error;
611} 636}
612 637
@@ -785,9 +810,9 @@ xfs_qm_export_dquot(
785 } 810 }
786 811
787#ifdef DEBUG 812#ifdef DEBUG
788 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) || 813 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
789 (XFS_IS_OQUOTA_ENFORCED(mp) && 814 (XFS_IS_OQUOTA_ENFORCED(mp) &&
790 (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) && 815 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
791 dst->d_id != 0) { 816 dst->d_id != 0) {
792 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && 817 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
793 (dst->d_blk_softlimit > 0)) { 818 (dst->d_blk_softlimit > 0)) {
@@ -808,17 +833,17 @@ xfs_qm_export_qtype_flags(
808 /* 833 /*
809 * Can't be more than one, or none. 834 * Can't be more than one, or none.
810 */ 835 */
811 ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != 836 ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
812 (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); 837 (FS_PROJ_QUOTA | FS_USER_QUOTA));
813 ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != 838 ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
814 (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); 839 (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
815 ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != 840 ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
816 (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); 841 (FS_USER_QUOTA | FS_GROUP_QUOTA));
817 ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); 842 ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
818 843
819 return (flags & XFS_DQ_USER) ? 844 return (flags & XFS_DQ_USER) ?
820 XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? 845 FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
821 XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; 846 FS_PROJ_QUOTA : FS_GROUP_QUOTA;
822} 847}
823 848
824STATIC uint 849STATIC uint
@@ -829,16 +854,16 @@ xfs_qm_export_flags(
829 854
830 uflags = 0; 855 uflags = 0;
831 if (flags & XFS_UQUOTA_ACCT) 856 if (flags & XFS_UQUOTA_ACCT)
832 uflags |= XFS_QUOTA_UDQ_ACCT; 857 uflags |= FS_QUOTA_UDQ_ACCT;
833 if (flags & XFS_PQUOTA_ACCT) 858 if (flags & XFS_PQUOTA_ACCT)
834 uflags |= XFS_QUOTA_PDQ_ACCT; 859 uflags |= FS_QUOTA_PDQ_ACCT;
835 if (flags & XFS_GQUOTA_ACCT) 860 if (flags & XFS_GQUOTA_ACCT)
836 uflags |= XFS_QUOTA_GDQ_ACCT; 861 uflags |= FS_QUOTA_GDQ_ACCT;
837 if (flags & XFS_UQUOTA_ENFD) 862 if (flags & XFS_UQUOTA_ENFD)
838 uflags |= XFS_QUOTA_UDQ_ENFD; 863 uflags |= FS_QUOTA_UDQ_ENFD;
839 if (flags & (XFS_OQUOTA_ENFD)) { 864 if (flags & (XFS_OQUOTA_ENFD)) {
840 uflags |= (flags & XFS_GQUOTA_ACCT) ? 865 uflags |= (flags & XFS_GQUOTA_ACCT) ?
841 XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; 866 FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
842 } 867 }
843 return (uflags); 868 return (uflags);
844} 869}
@@ -853,7 +878,8 @@ xfs_dqrele_inode(
853 int error; 878 int error;
854 879
855 /* skip quota inodes */ 880 /* skip quota inodes */
856 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { 881 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
882 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
857 ASSERT(ip->i_udquot == NULL); 883 ASSERT(ip->i_udquot == NULL);
858 ASSERT(ip->i_gdquot == NULL); 884 ASSERT(ip->i_gdquot == NULL);
859 read_unlock(&pag->pag_ici_lock); 885 read_unlock(&pag->pag_ici_lock);
@@ -873,8 +899,9 @@ xfs_dqrele_inode(
873 xfs_qm_dqrele(ip->i_gdquot); 899 xfs_qm_dqrele(ip->i_gdquot);
874 ip->i_gdquot = NULL; 900 ip->i_gdquot = NULL;
875 } 901 }
876 xfs_iput(ip, XFS_ILOCK_EXCL); 902 xfs_iunlock(ip, XFS_ILOCK_EXCL);
877 903
904 IRELE(ip);
878 return 0; 905 return 0;
879} 906}
880 907
@@ -931,7 +958,8 @@ struct mutex qcheck_lock;
931} 958}
932 959
933typedef struct dqtest { 960typedef struct dqtest {
934 xfs_dqmarker_t q_lists; 961 uint dq_flags; /* various flags (XFS_DQ_*) */
962 struct list_head q_hashlist;
935 xfs_dqhash_t *q_hash; /* the hashchain header */ 963 xfs_dqhash_t *q_hash; /* the hashchain header */
936 xfs_mount_t *q_mount; /* filesystem this relates to */ 964 xfs_mount_t *q_mount; /* filesystem this relates to */
937 xfs_dqid_t d_id; /* user id or group id */ 965 xfs_dqid_t d_id; /* user id or group id */
@@ -942,14 +970,9 @@ typedef struct dqtest {
942STATIC void 970STATIC void
943xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) 971xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
944{ 972{
945 xfs_dquot_t *d; 973 list_add(&dqp->q_hashlist, &h->qh_list);
946 if (((d) = (h)->qh_next)) 974 h->qh_version++;
947 (d)->HL_PREVP = &((dqp)->HL_NEXT); 975 h->qh_nelems++;
948 (dqp)->HL_NEXT = d;
949 (dqp)->HL_PREVP = &((h)->qh_next);
950 (h)->qh_next = (xfs_dquot_t *)dqp;
951 (h)->qh_version++;
952 (h)->qh_nelems++;
953} 976}
954STATIC void 977STATIC void
955xfs_qm_dqtest_print( 978xfs_qm_dqtest_print(
@@ -1061,9 +1084,7 @@ xfs_qm_internalqcheck_dqget(
1061 xfs_dqhash_t *h; 1084 xfs_dqhash_t *h;
1062 1085
1063 h = DQTEST_HASH(mp, id, type); 1086 h = DQTEST_HASH(mp, id, type);
1064 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; 1087 list_for_each_entry(d, &h->qh_list, q_hashlist) {
1065 d = (xfs_dqtest_t *) d->HL_NEXT) {
1066 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1067 if (d->d_id == id && mp == d->q_mount) { 1088 if (d->d_id == id && mp == d->q_mount) {
1068 *O_dq = d; 1089 *O_dq = d;
1069 return (0); 1090 return (0);
@@ -1074,6 +1095,7 @@ xfs_qm_internalqcheck_dqget(
1074 d->d_id = id; 1095 d->d_id = id;
1075 d->q_mount = mp; 1096 d->q_mount = mp;
1076 d->q_hash = h; 1097 d->q_hash = h;
1098 INIT_LIST_HEAD(&d->q_hashlist);
1077 xfs_qm_hashinsert(h, d); 1099 xfs_qm_hashinsert(h, d);
1078 *O_dq = d; 1100 *O_dq = d;
1079 return (0); 1101 return (0);
@@ -1112,10 +1134,7 @@ xfs_qm_internalqcheck_adjust(
1112 xfs_ino_t ino, /* inode number to get data for */ 1134 xfs_ino_t ino, /* inode number to get data for */
1113 void __user *buffer, /* not used */ 1135 void __user *buffer, /* not used */
1114 int ubsize, /* not used */ 1136 int ubsize, /* not used */
1115 void *private_data, /* not used */
1116 xfs_daddr_t bno, /* starting block of inode cluster */
1117 int *ubused, /* not used */ 1137 int *ubused, /* not used */
1118 void *dip, /* not used */
1119 int *res) /* bulkstat result code */ 1138 int *res) /* bulkstat result code */
1120{ 1139{
1121 xfs_inode_t *ip; 1140 xfs_inode_t *ip;
@@ -1137,7 +1156,7 @@ xfs_qm_internalqcheck_adjust(
1137 ipreleased = B_FALSE; 1156 ipreleased = B_FALSE;
1138 again: 1157 again:
1139 lock_flags = XFS_ILOCK_SHARED; 1158 lock_flags = XFS_ILOCK_SHARED;
1140 if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) { 1159 if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) {
1141 *res = BULKSTAT_RV_NOTHING; 1160 *res = BULKSTAT_RV_NOTHING;
1142 return (error); 1161 return (error);
1143 } 1162 }
@@ -1149,7 +1168,8 @@ xfs_qm_internalqcheck_adjust(
1149 * of those now. 1168 * of those now.
1150 */ 1169 */
1151 if (! ipreleased) { 1170 if (! ipreleased) {
1152 xfs_iput(ip, lock_flags); 1171 xfs_iunlock(ip, lock_flags);
1172 IRELE(ip);
1153 ipreleased = B_TRUE; 1173 ipreleased = B_TRUE;
1154 goto again; 1174 goto again;
1155 } 1175 }
@@ -1166,7 +1186,8 @@ xfs_qm_internalqcheck_adjust(
1166 ASSERT(gd); 1186 ASSERT(gd);
1167 xfs_qm_internalqcheck_dqadjust(ip, gd); 1187 xfs_qm_internalqcheck_dqadjust(ip, gd);
1168 } 1188 }
1169 xfs_iput(ip, lock_flags); 1189 xfs_iunlock(ip, lock_flags);
1190 IRELE(ip);
1170 *res = BULKSTAT_RV_DIDONE; 1191 *res = BULKSTAT_RV_DIDONE;
1171 return (0); 1192 return (0);
1172} 1193}
@@ -1180,8 +1201,6 @@ xfs_qm_internalqcheck(
1180 xfs_ino_t lastino; 1201 xfs_ino_t lastino;
1181 int done, count; 1202 int done, count;
1182 int i; 1203 int i;
1183 xfs_dqtest_t *d, *e;
1184 xfs_dqhash_t *h1;
1185 int error; 1204 int error;
1186 1205
1187 lastino = 0; 1206 lastino = 0;
@@ -1210,30 +1229,29 @@ xfs_qm_internalqcheck(
1210 * Iterate through all the inodes in the file system, 1229 * Iterate through all the inodes in the file system,
1211 * adjusting the corresponding dquot counters 1230 * adjusting the corresponding dquot counters
1212 */ 1231 */
1213 if ((error = xfs_bulkstat(mp, &lastino, &count, 1232 error = xfs_bulkstat(mp, &lastino, &count,
1214 xfs_qm_internalqcheck_adjust, NULL, 1233 xfs_qm_internalqcheck_adjust,
1215 0, NULL, BULKSTAT_FG_IGET, &done))) { 1234 0, NULL, &done);
1235 if (error) {
1236 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
1216 break; 1237 break;
1217 } 1238 }
1218 } while (! done); 1239 } while (!done);
1219 if (error) { 1240
1220 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
1221 }
1222 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1241 cmn_err(CE_DEBUG, "Checking results against system dquots");
1223 for (i = 0; i < qmtest_hashmask; i++) { 1242 for (i = 0; i < qmtest_hashmask; i++) {
1224 h1 = &qmtest_udqtab[i]; 1243 xfs_dqtest_t *d, *n;
1225 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1244 xfs_dqhash_t *h;
1245
1246 h = &qmtest_udqtab[i];
1247 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1226 xfs_dqtest_cmp(d); 1248 xfs_dqtest_cmp(d);
1227 e = (xfs_dqtest_t *) d->HL_NEXT;
1228 kmem_free(d); 1249 kmem_free(d);
1229 d = e;
1230 } 1250 }
1231 h1 = &qmtest_gdqtab[i]; 1251 h = &qmtest_gdqtab[i];
1232 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1252 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1233 xfs_dqtest_cmp(d); 1253 xfs_dqtest_cmp(d);
1234 e = (xfs_dqtest_t *) d->HL_NEXT;
1235 kmem_free(d); 1254 kmem_free(d);
1236 d = e;
1237 } 1255 }
1238 } 1256 }
1239 1257
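
Aside: the new xfs_qm_scall_trunc_qfile() above, like the reworked quotaoff path, follows the kernel's goto-unwind error-handling idiom: acquire resources in order, release them in reverse, with one label per partially-acquired state. A minimal sketch of the pattern, with hypothetical acquire_a()/acquire_b()/do_work() helpers standing in for the real calls (not from the patch):

int do_op(struct resource *r)
{
	int error;

	error = acquire_a(r);		/* cf. xfs_iget() */
	if (error)
		return error;		/* nothing to undo yet */

	error = acquire_b(r);		/* cf. xfs_trans_reserve() */
	if (error)
		goto out_release_a;	/* undo only what succeeded */

	error = do_work(r);		/* cf. xfs_itruncate_finish() */

	release_b(r);			/* reached on success or do_work() failure */
out_release_a:
	release_a(r);			/* cf. IRELE() */
	return error;
}
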
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/* Number of dquots that fit in to a dquot block */
28#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
29
30#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
31
32#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
33#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
34#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
35#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
36#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
37#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
38#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
39#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
40#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
41#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
47
48#define xfs_qm_mplist_lock(mp) \
49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
50#define xfs_qm_mplist_nowait(mp) \
51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
52#define xfs_qm_mplist_unlock(mp) \
53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
56
57#define xfs_qm_freelist_lock(qm) \
58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
59#define xfs_qm_freelist_lock_nowait(qm) \
60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
61#define xfs_qm_freelist_unlock(qm) \
62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
63
64/* 27/*
65 * Hash into a bucket in the dquot hash table, based on <mp, id>. 28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
66 */ 29 */
@@ -72,9 +35,6 @@
72 XFS_DQ_HASHVAL(mp, id)) : \ 35 XFS_DQ_HASHVAL(mp, id)) : \
73 (xfs_Gqm->qm_grp_dqhtable + \ 36 (xfs_Gqm->qm_grp_dqhtable + \
74 XFS_DQ_HASHVAL(mp, id))) 37 XFS_DQ_HASHVAL(mp, id)))
75#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
76 XFS_IS_UQUOTA_ON(mp) : \
77 XFS_IS_OQUOTA_ON(mp))
78#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
79 !dqp->q_core.d_blk_hardlimit && \ 39 !dqp->q_core.d_blk_hardlimit && \
80 !dqp->q_core.d_blk_softlimit && \ 40 !dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
86 !dqp->q_core.d_rtbcount && \ 46 !dqp->q_core.d_rtbcount && \
87 !dqp->q_core.d_icount) 47 !dqp->q_core.d_icount)
88 48
89#define HL_PREVP dq_hashlist.ql_prevp
90#define HL_NEXT dq_hashlist.ql_next
91#define MPL_PREVP dq_mplist.ql_prevp
92#define MPL_NEXT dq_mplist.ql_next
93
94
95#define _LIST_REMOVE(h, dqp, PVP, NXT) \
96 { \
97 xfs_dquot_t *d; \
98 if (((d) = (dqp)->NXT)) \
99 (d)->PVP = (dqp)->PVP; \
100 *((dqp)->PVP) = d; \
101 (dqp)->NXT = NULL; \
102 (dqp)->PVP = NULL; \
103 (h)->qh_version++; \
104 (h)->qh_nelems--; \
105 }
106
107#define _LIST_INSERT(h, dqp, PVP, NXT) \
108 { \
109 xfs_dquot_t *d; \
110 if (((d) = (h)->qh_next)) \
111 (d)->PVP = &((dqp)->NXT); \
112 (dqp)->NXT = d; \
113 (dqp)->PVP = &((h)->qh_next); \
114 (h)->qh_next = dqp; \
115 (h)->qh_version++; \
116 (h)->qh_nelems++; \
117 }
118
119#define FOREACH_DQUOT_IN_MP(dqp, mp) \
120 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
121
122#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
123for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
124 (dqp) = (dqp)->dq_flnext)
125
126#define XQM_HASHLIST_INSERT(h, dqp) \
127 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
128
129#define XQM_FREELIST_INSERT(h, dqp) \
130 xfs_qm_freelist_append(h, dqp)
131
132#define XQM_MPLIST_INSERT(h, dqp) \
133 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
134
135#define XQM_HASHLIST_REMOVE(h, dqp) \
136 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
137#define XQM_FREELIST_REMOVE(dqp) \
138 xfs_qm_freelist_unlink(dqp)
139#define XQM_MPLIST_REMOVE(h, dqp) \
140 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
141 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
142
143#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
144
145#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
146 (tp)->t_dqinfo->dqa_usrdquots : \
147 (tp)->t_dqinfo->dqa_grpdquots)
148#define XFS_IS_SUSER_DQUOT(dqp) \
149 (!((dqp)->q_core.d_id))
150
151#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 49#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
152 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 50 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
153 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 51 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
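
Aside: the hand-rolled HL_PREVP/HL_NEXT list macros deleted above are replaced throughout by the <linux/list.h> primitives, as the xfs_qm_syscalls.c hunks show. A minimal sketch of the same insert/drain operations on struct list_head (struct and helper names are illustrative, not from the patch):

#include <linux/list.h>
#include <linux/slab.h>

struct dq_node {
	struct list_head q_hashlist;	/* replaces HL_PREVP/HL_NEXT */
	int		 d_id;
};

static void hash_insert(struct list_head *bucket, struct dq_node *dqp)
{
	list_add(&dqp->q_hashlist, bucket);	/* O(1) head insert */
}

static void hash_drain(struct list_head *bucket)
{
	struct dq_node *d, *n;

	/* _safe variant: 'n' holds the next entry so 'd' may be freed */
	list_for_each_entry_safe(d, n, bucket, q_hashlist) {
		list_del(&d->q_hashlist);
		kfree(d);
	}
}
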
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75cb1d9a..7de91d1b75c0 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 32#include "xfs_bmap.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_priv.h" 37#include "xfs_trans_priv.h"
@@ -59,17 +49,14 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 49 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 50 xfs_dquot_t *dqp)
61{ 51{
62 xfs_dq_logitem_t *lp; 52 ASSERT(dqp->q_transp != tp);
63
64 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 53 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); 54 ASSERT(dqp->q_logitem.qli_dquot == dqp);
67 lp = &dqp->q_logitem;
68 55
69 /* 56 /*
70 * Get a log_item_desc to point at the new item. 57 * Get a log_item_desc to point at the new item.
71 */ 58 */
72 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp)); 59 xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
73 60
74 /* 61 /*
75 * Initialize i_transp so we can later determine if this dquot is 62 * Initialize i_transp so we can later determine if this dquot is
@@ -94,16 +81,11 @@ xfs_trans_log_dquot(
94 xfs_trans_t *tp, 81 xfs_trans_t *tp,
95 xfs_dquot_t *dqp) 82 xfs_dquot_t *dqp)
96{ 83{
97 xfs_log_item_desc_t *lidp; 84 ASSERT(dqp->q_transp == tp);
98
99 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 85 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 86
102 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
103 ASSERT(lidp != NULL);
104
105 tp->t_flags |= XFS_TRANS_DIRTY; 87 tp->t_flags |= XFS_TRANS_DIRTY;
106 lidp->lid_flags |= XFS_LID_DIRTY; 88 dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
107} 89}
108 90
109/* 91/*
@@ -198,16 +180,16 @@ xfs_trans_get_dqtrx(
198 int i; 180 int i;
199 xfs_dqtrx_t *qa; 181 xfs_dqtrx_t *qa;
200 182
201 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 183 qa = XFS_QM_ISUDQ(dqp) ?
202 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); 184 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
203 185
186 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
204 if (qa[i].qt_dquot == NULL || 187 if (qa[i].qt_dquot == NULL ||
205 qa[i].qt_dquot == dqp) { 188 qa[i].qt_dquot == dqp)
206 return (&qa[i]); 189 return &qa[i];
207 }
208 } 190 }
209 191
210 return (NULL); 192 return NULL;
211} 193}
212 194
213/* 195/*
@@ -381,7 +363,7 @@ xfs_trans_apply_dquot_deltas(
381 break; 363 break;
382 364
383 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 365 ASSERT(XFS_DQ_IS_LOCKED(dqp));
384 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 366 ASSERT(dqp->q_transp == tp);
385 367
386 /* 368 /*
387 * adjust the actual number of blocks used 369 * adjust the actual number of blocks used
@@ -639,7 +621,7 @@ xfs_trans_dqresv(
639 softlimit = q->qi_bsoftlimit; 621 softlimit = q->qi_bsoftlimit;
640 timer = be32_to_cpu(dqp->q_core.d_btimer); 622 timer = be32_to_cpu(dqp->q_core.d_btimer);
641 warns = be16_to_cpu(dqp->q_core.d_bwarns); 623 warns = be16_to_cpu(dqp->q_core.d_bwarns);
642 warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); 624 warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
643 resbcountp = &dqp->q_res_bcount; 625 resbcountp = &dqp->q_res_bcount;
644 } else { 626 } else {
645 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); 627 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +633,7 @@ xfs_trans_dqresv(
651 softlimit = q->qi_rtbsoftlimit; 633 softlimit = q->qi_rtbsoftlimit;
652 timer = be32_to_cpu(dqp->q_core.d_rtbtimer); 634 timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
653 warns = be16_to_cpu(dqp->q_core.d_rtbwarns); 635 warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 636 warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
655 resbcountp = &dqp->q_res_rtbcount; 637 resbcountp = &dqp->q_res_rtbcount;
656 } 638 }
657 639
@@ -691,7 +673,7 @@ xfs_trans_dqresv(
691 count = be64_to_cpu(dqp->q_core.d_icount); 673 count = be64_to_cpu(dqp->q_core.d_icount);
692 timer = be32_to_cpu(dqp->q_core.d_itimer); 674 timer = be32_to_cpu(dqp->q_core.d_itimer);
693 warns = be16_to_cpu(dqp->q_core.d_iwarns); 675 warns = be16_to_cpu(dqp->q_core.d_iwarns);
694 warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); 676 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
695 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 677 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
696 if (!hardlimit) 678 if (!hardlimit)
697 hardlimit = q->qi_ihardlimit; 679 hardlimit = q->qi_ihardlimit;
@@ -875,9 +857,8 @@ xfs_trans_get_qoff_item(
875 /* 857 /*
876 * Get a log_item_desc to point at the new item. 858 * Get a log_item_desc to point at the new item.
877 */ 859 */
878 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q); 860 xfs_trans_add_item(tp, &q->qql_item);
879 861 return q;
880 return (q);
881} 862}
882 863
883 864
@@ -891,13 +872,8 @@ xfs_trans_log_quotaoff_item(
891 xfs_trans_t *tp, 872 xfs_trans_t *tp,
892 xfs_qoff_logitem_t *qlp) 873 xfs_qoff_logitem_t *qlp)
893{ 874{
894 xfs_log_item_desc_t *lidp;
895
896 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
897 ASSERT(lidp != NULL);
898
899 tp->t_flags |= XFS_TRANS_DIRTY; 875 tp->t_flags |= XFS_TRANS_DIRTY;
900 lidp->lid_flags |= XFS_LID_DIRTY; 876 qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
901} 877}
902 878
903STATIC void 879STATIC void
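
Aside: xfs_trans_log_dquot() and xfs_trans_log_quotaoff_item() above no longer call xfs_trans_find_item() because a log item now caches a pointer to its descriptor when it is joined to the transaction. A simplified sketch of that back-pointer pattern (illustrative types and names, not the real xfs_log_item definitions):

#define LID_DIRTY	0x1

struct item_desc {
	unsigned int	  lid_flags;
};

struct log_item {
	struct item_desc *li_desc;	/* set once, when joined to the trans */
};

static void trans_add_item(struct log_item *lip, struct item_desc *desc)
{
	lip->li_desc = desc;		/* remember instead of searching later */
}

static void trans_log_item(struct log_item *lip)
{
	/* O(1) via the cached pointer; no transaction list walk needed */
	lip->li_desc->lid_flags |= LID_DIRTY;
}
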
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 3f3610a7ee05..975aa10e1a47 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -22,7 +22,6 @@
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dmapi.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_error.h" 26#include "xfs_error.h"
28 27
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f8..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_acl_access_handler; 52extern const struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler; 53extern const struct xattr_handler xfs_xattr_acl_default_handler;
54#else 54#else
55# define xfs_check_acl NULL 55# define xfs_check_acl NULL
56# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by an rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,17 +222,16 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
223 int pag_ici_init; /* incore inode cache initialised */
224 rwlock_t pag_ici_lock; /* incore inode lock */ 230 rwlock_t pag_ici_lock; /* incore inode lock */
225 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
226 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
227#endif 233#endif
228 int pagb_count; /* pagb slots in use */ 234 int pagb_count; /* pagb slots in use */
229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
230} xfs_perag_t; 235} xfs_perag_t;
231 236
232/* 237/*
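
Aside: the new struct xfs_busy_extent is linked into two containers at once: the per-AG rbtree (for overlap searches keyed by start block) and the owning transaction's busy list (for cheap removal at commit time). rb_entry() and list_entry() are both container_of() wrappers, so either link recovers the same object. A minimal sketch with simplified field types:

#include <linux/rbtree.h>
#include <linux/list.h>

struct busy_extent {
	struct rb_node	 rb_node;	/* per-AG tree, keyed by start block */
	struct list_head list;		/* per-transaction busy list */
	unsigned int	 bno;
	unsigned int	 length;
};

static struct busy_extent *from_tree(struct rb_node *n)
{
	return rb_entry(n, struct busy_extent, rb_node);
}

static struct busy_extent *from_list(struct list_head *l)
{
	return list_entry(l, struct busy_extent, list);
}
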
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..af168faccc7a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,18 +24,13 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_btree.h" 33#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 34#include "xfs_alloc.h"
40#include "xfs_error.h" 35#include "xfs_error.h"
41#include "xfs_trace.h" 36#include "xfs_trace.h"
@@ -46,11 +41,9 @@
46#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
48 43
49STATIC void 44static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 46 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 47
55/* 48/*
56 * Prototypes for per-ag allocation routines 49 * Prototypes for per-ag allocation routines
@@ -540,9 +533,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 533 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 534 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 535 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 536 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 537 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 538 * transaction as synchronous if blocks are found. This
539 * avoids the need to block due to a synchronous log
540 * force to ensure correct ordering as the synchronous
541 * transaction will guarantee that for us.
542 */
543 if (xfs_alloc_busy_search(args->mp, args->agno,
544 args->agbno, args->len))
545 xfs_trans_set_sync(args->tp);
546 } 546 }
547 if (!args->isfl) 547 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 548 xfs_trans_mod_sb(args->tp,
@@ -683,8 +683,6 @@ xfs_alloc_ag_vextent_near(
683 xfs_agblock_t ltbno; /* start bno of left side entry */ 683 xfs_agblock_t ltbno; /* start bno of left side entry */
684 xfs_agblock_t ltbnoa; /* aligned ... */ 684 xfs_agblock_t ltbnoa; /* aligned ... */
685 xfs_extlen_t ltdiff; /* difference to left side entry */ 685 xfs_extlen_t ltdiff; /* difference to left side entry */
686 /*REFERENCED*/
687 xfs_agblock_t ltend; /* end bno of left side entry */
688 xfs_extlen_t ltlen; /* length of left side entry */ 686 xfs_extlen_t ltlen; /* length of left side entry */
689 xfs_extlen_t ltlena; /* aligned ... */ 687 xfs_extlen_t ltlena; /* aligned ... */
690 xfs_agblock_t ltnew; /* useful start bno of left side */ 688 xfs_agblock_t ltnew; /* useful start bno of left side */
@@ -809,8 +807,7 @@ xfs_alloc_ag_vextent_near(
809 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 807 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
810 goto error0; 808 goto error0;
811 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 809 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
812 ltend = ltbno + ltlen; 810 ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
813 ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
814 args->len = blen; 811 args->len = blen;
815 if (!xfs_alloc_fix_minleft(args)) { 812 if (!xfs_alloc_fix_minleft(args)) {
816 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 813 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -823,7 +820,7 @@ xfs_alloc_ag_vextent_near(
823 */ 820 */
824 args->agbno = bnew; 821 args->agbno = bnew;
825 ASSERT(bnew >= ltbno); 822 ASSERT(bnew >= ltbno);
826 ASSERT(bnew + blen <= ltend); 823 ASSERT(bnew + blen <= ltbno + ltlen);
827 /* 824 /*
828 * Set up a cursor for the by-bno tree. 825 * Set up a cursor for the by-bno tree.
829 */ 826 */
@@ -1152,7 +1149,6 @@ xfs_alloc_ag_vextent_near(
1152 /* 1149 /*
1153 * Fix up the length and compute the useful address. 1150 * Fix up the length and compute the useful address.
1154 */ 1151 */
1155 ltend = ltbno + ltlen;
1156 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1152 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1157 xfs_alloc_fix_len(args); 1153 xfs_alloc_fix_len(args);
1158 if (!xfs_alloc_fix_minleft(args)) { 1154 if (!xfs_alloc_fix_minleft(args)) {
@@ -1165,7 +1161,7 @@ xfs_alloc_ag_vextent_near(
1165 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, 1161 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
1166 ltlen, &ltnew); 1162 ltlen, &ltnew);
1167 ASSERT(ltnew >= ltbno); 1163 ASSERT(ltnew >= ltbno);
1168 ASSERT(ltnew + rlen <= ltend); 1164 ASSERT(ltnew + rlen <= ltbno + ltlen);
1169 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1165 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1170 args->agbno = ltnew; 1166 args->agbno = ltnew;
1171 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1167 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
@@ -1693,7 +1689,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1689 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1690 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1691 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1692 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1693 return 0;
1698 1694
1699 error0: 1695 error0:
@@ -1989,14 +1985,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1985 *bnop = bno;
1990 1986
1991 /* 1987 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1988 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1989 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1990 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 1991 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 1992 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 1993 *
1994 * We do this by setting the current transaction to a sync transaction
1995 * which guarantees that the freeing transaction is on disk before this
1996 * transaction. This is done instead of a synchronous log force here so
1997 * that we don't sit and wait with the AGF locked in the transaction
1998 * during the log force.
1998 */ 1999 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2000 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2001 xfs_trans_set_sync(tp);
2000 return 0; 2002 return 0;
2001} 2003}
2002 2004
@@ -2201,7 +2203,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2203 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2204 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2205 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2206 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2207 pag->pagf_init = 1;
2206 } 2208 }
2207#ifdef DEBUG 2209#ifdef DEBUG
@@ -2479,127 +2481,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2481 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2482 * before continuing to use the block.
2481 * 2483 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2484 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2485 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2486 * xfs_alloc_busy_search - search for a busy extent
2487 */
2488
2489/*
2490 * Insert a new extent into the busy tree.
2491 *
2492 * The busy extent tree is indexed by the start block of the busy extent.
2493 * there can be multiple overlapping ranges in the busy extent tree but only
2494 * ever one entry at a given start block. The reason for this is that
2495 * multi-block extents can be freed, then smaller chunks of that extent
2496 * allocated and freed again before the first transaction commit is on disk.
2497 * If the exact same start block is freed a second time, we have to wait for
2498 * that busy extent to pass out of the tree before the new extent is inserted.
2499 * There are two main cases we have to handle here.
2500 *
2501 * The first case is a transaction that triggers a "free - allocate - free"
2502 * cycle. This can occur during btree manipulations as a btree block is freed
2503 * to the freelist, then allocated from the free list, then freed again. In
2504 * this case, the second extent free is what triggers the duplicate and as
2505 * such the transaction IDs should match. Because the extent was allocated in
2506 * this transaction, the transaction must be marked as synchronous. This is
2507 * true for all cases where the free/alloc/free occurs in the one transaction,
2508 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2509 * This serves to catch violations of the second case quite effectively.
2510 *
2511 * The second case is where the free/alloc/free occur in different
2512 * transactions. In this case, the thread freeing the extent the second time
2513 * can't mark the extent busy immediately because it is already tracked in a
2514 * transaction that may be committing. When the log commit for the existing
2515 * busy extent completes, the busy extent will be removed from the tree. If we
2516 * allow the second busy insert to continue using that busy extent structure,
2517 * it can be freed before this transaction is safely in the log. Hence our
2518 * only option in this case is to force the log to remove the existing busy
2519 * extent from the list before we insert the new one with the current
2520 * transaction ID.
2521 *
2522 * The problem we are trying to avoid in the free-alloc-free in separate
2523 * transactions is most easily described with a timeline:
2524 *
2525 * Thread 1 Thread 2 Thread 3 xfslogd
2526 * xact alloc
2527 * free X
2528 * mark busy
2529 * commit xact
2530 * free xact
2531 * xact alloc
2532 * alloc X
2533 * busy search
2534 * mark xact sync
2535 * commit xact
2536 * free xact
2537 * force log
2538 * checkpoint starts
2539 * ....
2540 * xact alloc
2541 * free X
2542 * mark busy
2543 * finds match
2544 * *** KABOOM! ***
2545 * ....
2546 * log IO completes
2547 * unbusy X
2548 * checkpoint completes
2549 *
2550 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2551 * the checkpoint completes, and the busy extent it matched will have been
2552 * removed from the tree when it is woken. Hence it can then continue safely.
2553 *
2554 * However, to ensure this matching process is robust, we need to use the
2555 * transaction ID for identifying the transaction, as delayed logging results in
2556 * the busy extent and transaction lifecycles being different. i.e. the busy
2557 * extent is active for a lot longer than the transaction. Hence the
2558 * transaction structure can be freed and reallocated, then used to mark the same
2559 * extent busy again in the new transaction. In this case the new transaction
2560 * will have a different tid but can have the same address, and hence we need
2561 * to check against the tid.
2562 *
2563 * Future: for delayed logging, we could avoid the log force if the extent was
2564 * first freed in the current checkpoint sequence. This, however, requires the
2565 * ability to pin the current checkpoint in memory until this transaction
2566 * commits to ensure that both the original free and the current one combine
2567 * logically into the one checkpoint. If the checkpoint sequences are
2568 * different, however, we still need to wait on a log force.
2484 */ 2569 */
2485void 2570void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2571xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2572 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2573 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2574 xfs_agblock_t bno,
2575 xfs_extlen_t len)
2490{ 2576{
2491 xfs_perag_busy_t *bsy; 2577 struct xfs_busy_extent *new;
2578 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2579 struct xfs_perag *pag;
2493 int n; 2580 struct rb_node **rbp;
2581 struct rb_node *parent;
2582 int match;
2494 2583
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2584
2498 /* search pagb_list for an open slot */ 2585 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2586 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2587 /*
2501 bsy++, n++) { 2588 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2589 * block, make this a synchronous transaction to ensure that
2503 break; 2590 * the block is not reused before this transaction commits.
2504 } 2591 */
2592 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2593 xfs_trans_set_sync(tp);
2594 return;
2505 } 2595 }
2506 2596
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2597 new->agno = agno;
2598 new->bno = bno;
2599 new->length = len;
2600 new->tid = xfs_log_get_trans_ident(tp);
2508 2601
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2602 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2603
2511 pag->pagb_count++; 2604 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2605 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2606
2514 bsy->busy_tp = tp; 2607 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2608restart:
2516 } else { 2609 spin_lock(&pag->pagb_lock);
2610 rbp = &pag->pagb_tree.rb_node;
2611 parent = NULL;
2612 busyp = NULL;
2613 match = 0;
2614 while (*rbp && match >= 0) {
2615 parent = *rbp;
2616 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2617
2618 if (new->bno < busyp->bno) {
2619 /* may overlap, but exact start block is lower */
2620 rbp = &(*rbp)->rb_left;
2621 if (new->bno + new->length > busyp->bno)
2622 match = busyp->tid == new->tid ? 1 : -1;
2623 } else if (new->bno > busyp->bno) {
2624 /* may overlap, but exact start block is higher */
2625 rbp = &(*rbp)->rb_right;
2626 if (bno < busyp->bno + busyp->length)
2627 match = busyp->tid == new->tid ? 1 : -1;
2628 } else {
2629 match = busyp->tid == new->tid ? 1 : -1;
2630 break;
2631 }
2632 }
2633 if (match < 0) {
2634 /* overlap marked busy in different transaction */
2635 spin_unlock(&pag->pagb_lock);
2636 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2637 goto restart;
2638 }
2639 if (match > 0) {
2517 /* 2640 /*
2518 * The busy list is full! Since it is now not possible to 2641 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2642 * start block match, otherwise combine the busy extents into
2520 * to insure that the block is not reused before this 2643 * a single range.
2521 * transaction commits.
2522 */ 2644 */
2523 xfs_trans_set_sync(tp); 2645 if (busyp->bno == new->bno) {
2524 } 2646 busyp->length = max(busyp->length, new->length);
2647 spin_unlock(&pag->pagb_lock);
2648 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2649 xfs_perag_put(pag);
2650 kmem_free(new);
2651 return;
2652 }
2653 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2654 new->length = max(busyp->bno + busyp->length,
2655 new->bno + new->length) -
2656 min(busyp->bno, new->bno);
2657 new->bno = min(busyp->bno, new->bno);
2658 } else
2659 busyp = NULL;
2525 2660
2661 rb_link_node(&new->rb_node, parent, rbp);
2662 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2663
2664 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2665 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2666 xfs_perag_put(pag);
2667 kmem_free(busyp);
2528} 2668}
2529 2669
2530void 2670/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2671 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2672 * allocate. Note that xfs_alloc_busy_search() takes the busy extent tree
2533 int idx) 2673 * lock internally, so callers need not hold it. The function returns 0 for no
2674 * overlapping busy extent, -1 for an overlapping but not exact busy extent,
2675 * and 1 for an exact match. This is done so that a non-zero return indicates
2676 * an overlap that will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial and an exact match.
2678 */
2679static int
2680xfs_alloc_busy_search(
2681 struct xfs_mount *mp,
2682 xfs_agnumber_t agno,
2683 xfs_agblock_t bno,
2684 xfs_extlen_t len)
2534{ 2685{
2535 struct xfs_perag *pag; 2686 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2687 struct rb_node *rbp;
2688 struct xfs_busy_extent *busyp;
2689 int match = 0;
2537 2690
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2691 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2692 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2693
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2694 rbp = pag->pagb_tree.rb_node;
2544 2695
2545 if (list[idx].busy_tp == tp) { 2696 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2697 while (rbp) {
2547 pag->pagb_count--; 2698 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2699 if (bno < busyp->bno) {
2700 /* may overlap, but exact start block is lower */
2701 if (bno + len > busyp->bno)
2702 match = -1;
2703 rbp = rbp->rb_left;
2704 } else if (bno > busyp->bno) {
2705 /* may overlap, but exact start block is higher */
2706 if (bno < busyp->bno + busyp->length)
2707 match = -1;
2708 rbp = rbp->rb_right;
2709 } else {
2710 /* bno matches busyp, length determines exact match */
2711 match = (busyp->length == len) ? 1 : -1;
2712 break;
2713 }
2548 } 2714 }
2549
2550 spin_unlock(&pag->pagb_lock); 2715 spin_unlock(&pag->pagb_lock);
2716 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2717 xfs_perag_put(pag);
2718 return match;
2552} 2719}
2553 2720
2554 2721void
2555/* 2722xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2723 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2724 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2725{
2565 struct xfs_perag *pag; 2726 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2727
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2728 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2729 busyp->length);
2573 cnt = pag->pagb_count;
2574 2730
2575 /* 2731 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2732 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2733
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2734 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2735
2591 /* (start1,length1) within (start2, length2) */ 2736 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2737 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2738 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2739 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2740 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2741
2599 /* 2742 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2743}
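
Aside: the descent in xfs_alloc_busy_search() relies on the tree invariant that there is at most one busy extent per start block, so a plain binary walk can classify the range [bno, bno + len) against the tree. A condensed, self-contained version of the same logic (simplified types; struct busy_extent as in the sketch after the xfs_ag.h hunk):

/* struct busy_extent: { struct rb_node rb_node; unsigned int bno, length; } */
static int busy_search(struct rb_root *root, unsigned int bno,
		       unsigned int len)
{
	struct rb_node *rbp = root->rb_node;
	int match = 0;		/* 0: no overlap, -1: partial, 1: exact */

	while (rbp) {
		struct busy_extent *busyp =
			rb_entry(rbp, struct busy_extent, rb_node);

		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* start blocks match; length decides exactness */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	return match;
}

A non-zero result is what makes the callers above mark the transaction synchronous; the 1 vs -1 distinction only matters for the exact-match ASSERT in xfs_alloc_busy_clear().
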
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..895009a97271 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,20 +22,21 @@ struct xfs_buf;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
+struct xfs_busy_extent;
 
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
  */
-typedef enum xfs_alloctype
-{
-	XFS_ALLOCTYPE_ANY_AG,		/* allocate anywhere, use rotor */
-	XFS_ALLOCTYPE_FIRST_AG,		/* ... start at ag 0 */
-	XFS_ALLOCTYPE_START_AG,		/* anywhere, start in this a.g. */
-	XFS_ALLOCTYPE_THIS_AG,		/* anywhere in this a.g. */
-	XFS_ALLOCTYPE_START_BNO,	/* near this block else anywhere */
-	XFS_ALLOCTYPE_NEAR_BNO,		/* in this a.g. and near this block */
-	XFS_ALLOCTYPE_THIS_BNO		/* at exactly this block */
-} xfs_alloctype_t;
+#define XFS_ALLOCTYPE_ANY_AG	0x01	/* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG	0x02	/* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG	0x04	/* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG	0x08	/* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO	0x10	/* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO	0x20	/* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO	0x40	/* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
 
 #define XFS_ALLOC_TYPES \
 	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 #ifdef __KERNEL__
 
 void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
+xfs_alloc_busy_insert(xfs_trans_t *tp,
 	xfs_agnumber_t agno,
 	xfs_agblock_t bno,
 	xfs_extlen_t len);
 
 void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-	xfs_agnumber_t ag,
-	int idx);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
 
 #endif	/* __KERNEL__ */
 
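The enum is flattened into power-of-two defines because, per the new in-line comment, the tracing code cannot yet handle enum values; the event macros decode these values through name tables like XFS_ALLOC_TYPES. A small user-space sketch of that table-driven symbolic decoding follows; the table mirrors the header above, but alloctype_name() is a hypothetical helper, not a kernel function.

/* Sketch: decode an xfs_alloctype_t-style value through a name table,
 * the same shape the tracing code consumes via XFS_ALLOC_TYPES. */
#include <stddef.h>
#include <stdio.h>

#define XFS_ALLOCTYPE_ANY_AG	0x01
#define XFS_ALLOCTYPE_FIRST_AG	0x02
#define XFS_ALLOCTYPE_START_AG	0x04
#define XFS_ALLOCTYPE_THIS_AG	0x08
#define XFS_ALLOCTYPE_START_BNO	0x10
#define XFS_ALLOCTYPE_NEAR_BNO	0x20
#define XFS_ALLOCTYPE_THIS_BNO	0x40

static const struct { unsigned int value; const char *name; } alloc_types[] = {
	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" },
	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" },
	{ XFS_ALLOCTYPE_START_AG,	"START_AG" },
	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" },
	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" },
	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" },
	{ XFS_ALLOCTYPE_THIS_BNO,	"THIS_BNO" },
};

/* Hypothetical helper: return the symbolic name for one type value. */
static const char *alloctype_name(unsigned int type)
{
	size_t i;

	for (i = 0; i < sizeof(alloc_types) / sizeof(alloc_types[0]); i++)
		if (alloc_types[i].value == type)
			return alloc_types[i].name;
	return "?";
}

int main(void)
{
	printf("0x10 -> %s\n", alloctype_name(XFS_ALLOCTYPE_START_BNO));
	return 0;
}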
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..97f7328967fd 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,19 +24,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -134,7 +129,7 @@ xfs_allocbt_free_block(
 	 * disk. If a busy block is allocated, the iclog is pushed up to the
 	 * LSN that freed the block.
 	 */
-	xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
 	return 0;
 }
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b9c196a53c42..c2568242a901 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -25,19 +25,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
@@ -325,8 +319,7 @@ xfs_attr_set_int(
 		return (error);
 	}
 
-	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(args.trans, dp);
+	xfs_trans_ijoin(args.trans, dp);
 
 	/*
 	 * If the attribute list is non-existent or a shortform list,
@@ -396,10 +389,8 @@ xfs_attr_set_int(
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args.trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args.trans, dp);
 
 		/*
 		 * Commit the leaf transformation.  We'll need another (linked)
@@ -544,8 +535,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	 * No need to make quota reservations here. We expect to release some
 	 * blocks not allocate in the common case.
 	 */
-	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(args.trans, dp);
+	xfs_trans_ijoin(args.trans, dp);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -821,8 +811,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	 * No need to make quota reservations here. We expect to release some
 	 * blocks, not allocate, in the common case.
 	 */
-	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(trans, dp);
+	xfs_trans_ijoin(trans, dp);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -981,10 +970,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		/*
 		 * Commit the current trans (including the inode) and start
@@ -1085,10 +1072,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		} else
 			xfs_da_buf_done(bp);
 
@@ -1161,10 +1146,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 	} else
 		xfs_da_buf_done(bp);
 	return(0);
@@ -1317,10 +1300,8 @@ restart:
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 
 			/*
 			 * Commit the node conversion and start the next
@@ -1356,10 +1337,8 @@ restart:
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 	} else {
 		/*
 		 * Addition succeeded, update Btree hashvals.
@@ -1470,10 +1449,8 @@ restart:
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		}
 
 	/*
@@ -1604,10 +1581,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		/*
 		 * Commit the Btree join operation and start a new trans.
@@ -1658,10 +1633,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		} else
 			xfs_da_brelse(args->trans, bp);
 	}
@@ -2004,7 +1977,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  NULL, 0, map, &nmap, NULL, NULL);
+				  NULL, 0, map, &nmap, NULL);
 		if (error)
 			return(error);
 		ASSERT(nmap >= 1);
@@ -2083,7 +2056,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
 				  XFS_BMAPI_WRITE,
 				  args->firstblock, args->total, &map, &nmap,
-				  args->flist, NULL);
+				  args->flist);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -2099,10 +2072,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2136,7 +2107,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				  args->firstblock, 0, &map, &nmap,
-				  NULL, NULL);
+				  NULL);
 		if (error) {
 			return(error);
 		}
@@ -2201,7 +2172,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				  args->firstblock, 0, &map, &nmap,
-				  args->flist, NULL);
+				  args->flist);
 		if (error) {
 			return(error);
 		}
@@ -2239,7 +2210,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
 				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				    1, args->firstblock, args->flist,
-				    NULL, &done);
+				    &done);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -2255,10 +2226,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, args->dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, args->dp);
 
 		/*
 		 * Close out trans and start the next one in the chain.
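Every hunk in this file makes the same substitution: the two-step xfs_trans_ijoin() + xfs_trans_ihold() after a transaction roll collapses into a single xfs_trans_ijoin() call that no longer takes lock flags. A compilable user-space sketch of the underlying "re-join the inode after the chain commits" pattern follows; the struct types and bmap_finish() here are stand-ins, not the kernel API.

/* Sketch: the "re-join the inode after a transaction roll" pattern.
 * A real xfs_trans_ijoin() attaches the locked inode to the
 * transaction's log item list; this just models the control flow. */
#include <stdbool.h>
#include <stdio.h>

struct inode { int ino; };
struct trans { struct inode *joined; };

/* One call now does what ijoin + ihold used to do together. */
static void trans_ijoin(struct trans *tp, struct inode *ip)
{
	tp->joined = ip;
}

/* Stand-in bmap_finish(): may commit and start a new transaction,
 * in which case the inode is no longer part of the new transaction. */
static int bmap_finish(struct trans *tp, bool *committed)
{
	*committed = true;	/* pretend the trans chain rolled */
	tp->joined = NULL;	/* new trans starts without the inode */
	return 0;
}

int main(void)
{
	struct inode ip = { 42 };
	struct trans tp = { &ip };
	bool committed = false;

	if (bmap_finish(&tp, &committed))
		return 1;
	/*
	 * The pattern from the hunks above: if the chain committed,
	 * re-join the inode so it is part of every transaction.
	 */
	if (committed)
		trans_ijoin(&tp, &ip);
	printf("inode %d joined: %s\n", ip.ino, tp.joined ? "yes" : "no");
	return 0;
}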
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a90ce74fc256..a6cff8edcdb6 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,8 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
@@ -33,7 +31,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -2931,7 +2928,7 @@ xfs_attr_leaf_freextent(xfs_trans_t *trans, xfs_inode_t *dp,
 		nmap = 1;
 		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  NULL, 0, &map, &nmap, NULL, NULL);
+				  NULL, 0, &map, &nmap, NULL);
 		if (error) {
 			return(error);
 		}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d17010..f90dadd5a968 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -30,13 +30,10 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
@@ -104,7 +101,6 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -122,7 +118,6 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -135,7 +130,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -149,7 +143,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork); /* data or attr fork */
 
@@ -162,8 +155,7 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta); /* Change made to incore extents */
+	int			*logflagsp); /* inode logging flags */
 
 /*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
@@ -200,7 +192,6 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -489,7 +480,6 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to use reserved data blocks */
 {
@@ -524,15 +514,6 @@ xfs_bmap_add_extent(
 			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else
 			logflags = 0;
-		/* DELTA: single new extent */
-		if (delta) {
-			if (delta->xed_startoff > new->br_startoff)
-				delta->xed_startoff = new->br_startoff;
-			if (delta->xed_blockcount <
-					new->br_startoff + new->br_blockcount)
-				delta->xed_blockcount = new->br_startoff +
-					new->br_blockcount;
-		}
 	}
 	/*
 	 * Any kind of new delayed allocation goes here.
@@ -542,7 +523,7 @@ xfs_bmap_add_extent(
 		ASSERT((cur->bc_private.b.flags &
 			XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
-				&logflags, delta, rsvd)))
+				&logflags, rsvd)))
 			goto done;
 	}
 	/*
@@ -553,7 +534,7 @@ xfs_bmap_add_extent(
 		ASSERT((cur->bc_private.b.flags &
 			XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-				&logflags, delta, whichfork)))
+				&logflags, whichfork)))
 			goto done;
 	} else {
 		xfs_bmbt_irec_t	prev;	/* old extent at offset idx */
@@ -578,17 +559,17 @@ xfs_bmap_add_extent(
 					XFS_BTCUR_BPRV_WASDEL);
 			if ((error = xfs_bmap_add_extent_delay_real(ip,
 				idx, &cur, new, &da_new, first, flist,
-				&logflags, delta, rsvd)))
+				&logflags, rsvd)))
 				goto done;
 		} else if (new->br_state == XFS_EXT_NORM) {
 			ASSERT(new->br_state == XFS_EXT_NORM);
 			if ((error = xfs_bmap_add_extent_unwritten_real(
-				ip, idx, &cur, new, &logflags, delta)))
+				ip, idx, &cur, new, &logflags)))
 				goto done;
 		} else {
 			ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
 			if ((error = xfs_bmap_add_extent_unwritten_real(
-				ip, idx, &cur, new, &logflags, delta)))
+				ip, idx, &cur, new, &logflags)))
 				goto done;
 		}
 		ASSERT(*curp == cur || *curp == NULL);
@@ -601,7 +582,7 @@ xfs_bmap_add_extent(
 		ASSERT((cur->bc_private.b.flags &
 			XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-			new, &logflags, delta, whichfork)))
+			new, &logflags, whichfork)))
 			goto done;
 	}
 	}
@@ -666,7 +647,6 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)	/* OK to use reserved data block allocation */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
@@ -797,11 +777,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Three in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -832,10 +807,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -867,10 +838,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -900,9 +867,6 @@ xfs_bmap_add_extent_delay_real(
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		*dnew = 0;
-		/* DELTA: The in-core extent described by new changed type. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -942,10 +906,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -990,9 +950,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1031,10 +988,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -1078,9 +1031,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1161,9 +1111,6 @@ xfs_bmap_add_extent_delay_real(
 			nullstartblock((int)temp2));
 		trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
 		*dnew = temp + temp2;
-		/* DELTA: One in-core extent is split in three. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1179,13 +1126,6 @@ xfs_bmap_add_extent_delay_real(
 		ASSERT(0);
 	}
 	*curp = cur;
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1204,8 +1144,7 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta) /* Change made to incore extents */
+	int			*logflagsp) /* inode logging flags */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
 	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
@@ -1219,8 +1158,6 @@ xfs_bmap_add_extent_unwritten_real(
 				/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
-	xfs_filblks_t		temp=0;
-	xfs_filblks_t		temp2=0;
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
@@ -1341,11 +1278,6 @@ xfs_bmap_add_extent_unwritten_real(
 				RIGHT.br_blockcount, LEFT.br_state)))
 				goto done;
 		}
-		/* DELTA: Three in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1382,10 +1314,6 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1422,10 +1350,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1453,9 +1377,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: The in-core extent described by new changed type. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1501,10 +1422,6 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state))
 				goto done;
 		}
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -1544,9 +1461,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1587,10 +1501,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -1630,9 +1540,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1692,9 +1599,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in three. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1710,13 +1614,6 @@ xfs_bmap_add_extent_unwritten_real(
 		ASSERT(0);
 	}
 	*curp = cur;
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1736,7 +1633,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)	/* OK to allocate reserved blocks */
 {
 	xfs_bmbt_rec_host_t	*ep;	/* extent record for idx */
@@ -1747,7 +1643,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			state;  /* state bits, accessed thru macros */
 	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
-	xfs_filblks_t		temp2=0;
 
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, idx);
@@ -1819,9 +1714,6 @@ xfs_bmap_add_extent_hole_delay(
 
 		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_df.if_lastex = idx - 1;
-		/* DELTA: Two in-core extents were replaced by one. */
-		temp2 = temp;
-		temp = left.br_startoff;
 		break;
 
 	case BMAP_LEFT_CONTIG:
@@ -1841,9 +1733,6 @@ xfs_bmap_add_extent_hole_delay(
 		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
 
 		ip->i_df.if_lastex = idx - 1;
-		/* DELTA: One in-core extent grew into a hole. */
-		temp2 = temp;
-		temp = left.br_startoff;
 		break;
 
 	case BMAP_RIGHT_CONTIG:
@@ -1862,9 +1751,6 @@ xfs_bmap_add_extent_hole_delay(
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 
 		ip->i_df.if_lastex = idx;
-		/* DELTA: One in-core extent grew into a hole. */
-		temp2 = temp;
-		temp = new->br_startoff;
 		break;
 
 	case 0:
@@ -1876,9 +1762,6 @@ xfs_bmap_add_extent_hole_delay(
 		oldlen = newlen = 0;
 		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
-		/* DELTA: A new in-core extent was added in a hole. */
-		temp2 = new->br_blockcount;
-		temp = new->br_startoff;
 		break;
 	}
 	if (oldlen != newlen) {
@@ -1889,13 +1772,6 @@ xfs_bmap_add_extent_hole_delay(
 		 * Nothing to do for disk quota accounting here.
 		 */
 	}
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 	*logflagsp = 0;
 	return 0;
 }
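All of the DELTA bookkeeping deleted above implemented one idea: widen a (startoff, end) window to cover every extent the call touched, then convert the end offset back into a block count before returning, exactly as the removed xfs_extdelta comment described. A stand-alone sketch of that range-union arithmetic follows; the field names mirror the removed struct, but the helper is illustrative only.

/* Sketch of what the removed DELTA code computed: the union of all
 * touched ranges, with xed_blockcount temporarily holding an end
 * offset rather than a length. */
#include <stdint.h>
#include <stdio.h>

#define NULLFILEOFF	UINT64_MAX

struct extdelta {
	uint64_t xed_startoff;		/* offset of range */
	uint64_t xed_blockcount;	/* end offset, fixed up to a length below */
};

static void delta_track(struct extdelta *delta, uint64_t startoff,
			uint64_t blockcount)
{
	if (delta->xed_startoff > startoff)
		delta->xed_startoff = startoff;
	if (delta->xed_blockcount < startoff + blockcount)
		delta->xed_blockcount = startoff + blockcount;
}

int main(void)
{
	struct extdelta delta = { NULLFILEOFF, 0 };

	delta_track(&delta, 100, 8);	/* one modified extent */
	delta_track(&delta, 96, 2);	/* another, overlapping to the left */

	if (delta.xed_startoff != NULLFILEOFF)	/* a change was made */
		delta.xed_blockcount -= delta.xed_startoff; /* end -> length */

	printf("delta: startoff=%llu blocks=%llu\n",	/* 96 and 12 */
	       (unsigned long long)delta.xed_startoff,
	       (unsigned long long)delta.xed_blockcount);
	return 0;
}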
@@ -1911,7 +1787,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork) /* data or attr fork */
 {
 	xfs_bmbt_rec_host_t	*ep;	/* pointer to extent entry ins. point */
@@ -1922,8 +1797,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			rval=0;	/* return value (logging flags) */
 	int			state;	/* state bits, accessed thru macros */
-	xfs_filblks_t		temp=0;
-	xfs_filblks_t		temp2=0;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
@@ -2020,11 +1893,6 @@ xfs_bmap_add_extent_hole_real(
 				left.br_state)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents were replaced by one. */
-		temp = left.br_startoff;
-		temp2 = left.br_blockcount +
-			new->br_blockcount +
-			right.br_blockcount;
 		break;
 
 	case BMAP_LEFT_CONTIG:
@@ -2056,10 +1924,6 @@ xfs_bmap_add_extent_hole_real(
 				left.br_state)))
 				goto done;
 		}
-		/* DELTA: One in-core extent grew. */
-		temp = left.br_startoff;
-		temp2 = left.br_blockcount +
-			new->br_blockcount;
 		break;
 
 	case BMAP_RIGHT_CONTIG:
@@ -2092,10 +1956,6 @@ xfs_bmap_add_extent_hole_real(
 				right.br_state)))
 				goto done;
 		}
-		/* DELTA: One in-core extent grew. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount +
-			right.br_blockcount;
 		break;
 
 	case 0:
@@ -2123,18 +1983,8 @@ xfs_bmap_add_extent_hole_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: A new extent was added in a hole. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 	}
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -2959,7 +2809,6 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp,/* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to allocate reserved blocks */
 {
@@ -3265,14 +3114,6 @@ xfs_bmap_del_extent(
 	if (da_old > da_new)
 		xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new),
 			rsvd);
-	if (delta) {
-		/* DELTA: report the original extent. */
-		if (delta->xed_startoff > got.br_startoff)
-			delta->xed_startoff = got.br_startoff;
-		if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
-			delta->xed_blockcount = got.br_startoff +
-				got.br_blockcount;
-	}
 done:
 	*logflagsp = flags;
 	return error;
@@ -3754,9 +3595,10 @@ xfs_bmap_add_attrfork(
 		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 	}
 	ASSERT(ip->i_d.di_anextents == 0);
-	IHOLD(ip);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_DEV:
 		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
@@ -3829,7 +3671,7 @@ xfs_bmap_add_attrfork(
 	}
 	if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
 		goto error2;
-	error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	ASSERT(ip->i_df.if_ext_max ==
 	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 	return error;
@@ -4483,8 +4325,7 @@ xfs_bmapi(
 	xfs_extlen_t	total,		/* total blocks needed */
 	xfs_bmbt_irec_t	*mval,		/* output: map values */
 	int		*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t	*flist,		/* i/o: list extents to free */
-	xfs_extdelta_t	*delta)		/* o: change made to incore extents */
+	xfs_bmap_free_t	*flist)		/* i/o: list extents to free */
 {
 	xfs_fsblock_t	abno;		/* allocated block number */
 	xfs_extlen_t	alen;		/* allocated extent length */
@@ -4596,10 +4437,7 @@ xfs_bmapi(
 	end = bno + len;
 	obno = bno;
 	bma.ip = NULL;
-	if (delta) {
-		delta->xed_startoff = NULLFILEOFF;
-		delta->xed_blockcount = 0;
-	}
+
 	while (bno < end && n < *nmap) {
 		/*
 		 * Reading past eof, act as though there's a hole
@@ -4620,19 +4458,13 @@ xfs_bmapi(
 			 * allocate the stuff asked for in this bmap call
 			 * but that wouldn't be as good.
 			 */
-			if (wasdelay && !(flags & XFS_BMAPI_EXACT)) {
+			if (wasdelay) {
 				alen = (xfs_extlen_t)got.br_blockcount;
 				aoff = got.br_startoff;
 				if (lastx != NULLEXTNUM && lastx) {
 					ep = xfs_iext_get_ext(ifp, lastx - 1);
 					xfs_bmbt_get_all(ep, &prev);
 				}
-			} else if (wasdelay) {
-				alen = (xfs_extlen_t)
-					XFS_FILBLKS_MIN(len,
-						(got.br_startoff +
-						 got.br_blockcount) - bno);
-				aoff = bno;
 			} else {
 				alen = (xfs_extlen_t)
 					XFS_FILBLKS_MIN(len, MAXEXTLEN);
@@ -4831,7 +4663,7 @@ xfs_bmapi(
 					got.br_state = XFS_EXT_UNWRITTEN;
 			}
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
-				firstblock, flist, &tmp_logflags, delta,
+				firstblock, flist, &tmp_logflags,
 				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
@@ -4927,7 +4759,7 @@ xfs_bmapi(
 			}
 			mval->br_state = XFS_EXT_NORM;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
-				firstblock, flist, &tmp_logflags, delta,
+				firstblock, flist, &tmp_logflags,
 				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
@@ -5017,14 +4849,6 @@ xfs_bmapi(
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
 	       XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
 	error = 0;
-	if (delta && delta->xed_startoff != NULLFILEOFF) {
-		/* A change was actually made.
-		 * Note that delta->xed_blockount is an offset at this
-		 * point and needs to be converted to a block count.
-		 */
-		ASSERT(delta->xed_blockcount > delta->xed_startoff);
-		delta->xed_blockcount -= delta->xed_startoff;
-	}
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5136,8 +4960,6 @@ xfs_bunmapi(
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						controls a.g. for allocs */
 	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	xfs_extdelta_t		*delta,		/* o: change made to incore
-						   extents */
 	int			*done)		/* set if not done yet */
 {
 	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
@@ -5196,10 +5018,7 @@ xfs_bunmapi(
 	bno = start + len - 1;
 	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
 		&prev);
-	if (delta) {
-		delta->xed_startoff = NULLFILEOFF;
-		delta->xed_blockcount = 0;
-	}
+
 	/*
 	 * Check to see if the given block number is past the end of the
 	 * file, back up to the last block if so...
@@ -5297,7 +5116,7 @@ xfs_bunmapi(
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
-				firstblock, flist, &logflags, delta,
+				firstblock, flist, &logflags,
 				XFS_DATA_FORK, 0);
 			if (error)
 				goto error0;
@@ -5352,7 +5171,7 @@ xfs_bunmapi(
 				prev.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
 					&prev, firstblock, flist, &logflags,
-					delta, XFS_DATA_FORK, 0);
+					XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5361,7 +5180,7 @@ xfs_bunmapi(
 				del.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx, &cur,
 					&del, firstblock, flist, &logflags,
-					delta, XFS_DATA_FORK, 0);
+					XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5414,7 +5233,7 @@ xfs_bunmapi(
 			goto error0;
 		}
 		error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
-				&tmp_logflags, delta, whichfork, rsvd);
+				&tmp_logflags, whichfork, rsvd);
 		logflags |= tmp_logflags;
 		if (error)
 			goto error0;
@@ -5471,14 +5290,6 @@ nodelete:
 	ASSERT(ifp->if_ext_max ==
 	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 	error = 0;
-	if (delta && delta->xed_startoff != NULLFILEOFF) {
-		/* A change was actually made.
-		 * Note that delta->xed_blockount is an offset at this
-		 * point and needs to be converted to a block count.
-		 */
-		ASSERT(delta->xed_blockcount > delta->xed_startoff);
-		delta->xed_blockcount -= delta->xed_startoff;
-	}
error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5605,28 +5416,6 @@ xfs_getbmap(
 		prealloced = 0;
 		fixlen = 1LL << 32;
 	} else {
-		/*
-		 * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
-		 * not generate a DMAPI read event.  Otherwise, if the
-		 * DM_EVENT_READ bit is set for the file, generate a read
-		 * event in order that the DMAPI application may do its thing
-		 * before we return the extents.  Usually this means restoring
-		 * user file data to regions of the file that look like holes.
-		 *
-		 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
-		 * BMV_IF_NO_DMAPI_READ so that read events are generated.
-		 * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
-		 * could misinterpret holes in a DMAPI file as true holes,
-		 * when in fact they may represent offline user data.
-		 */
-		if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
-		    !(iflags & BMV_IF_NO_DMAPI_READ)) {
-			error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
-					      0, 0, 0, NULL);
-			if (error)
-				return XFS_ERROR(error);
-		}
-
 		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -5713,7 +5502,7 @@ xfs_getbmap(
 		error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 				  XFS_BB_TO_FSB(mp, bmv->bmv_length),
 				  bmapi_flags, NULL, 0, map, &nmap,
-				  NULL, NULL);
+				  NULL);
 		if (error)
 			goto out_free_map;
 		ASSERT(nmap <= subnex);
@@ -5744,12 +5533,24 @@ xfs_getbmap(
 					map[i].br_startblock))
 				goto out_free_map;
 
-			nexleft--;
 			bmv->bmv_offset =
 				out[cur_ext].bmv_offset +
 				out[cur_ext].bmv_length;
 			bmv->bmv_length =
 				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+			/*
+			 * In case we don't want to return the hole,
+			 * don't increase cur_ext so that we can reuse
+			 * it in the next loop.
+			 */
+			if ((iflags & BMV_IF_NO_HOLES) &&
+			    map[i].br_startblock == HOLESTARTBLOCK) {
+				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+				continue;
+			}
+
+			nexleft--;
 			bmv->bmv_entries++;
 			cur_ext++;
 		}
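The added BMV_IF_NO_HOLES branch works by not advancing cur_ext when the mapping is a hole, so the next extent simply overwrites the zeroed slot. A small stand-alone sketch of that slot-reuse loop follows; the mapping array and the hole flag are stand-ins for the bmv machinery.

/* Sketch: the "don't advance the output slot for holes" loop shape
 * used by the BMV_IF_NO_HOLES hunk above. The input array is made up. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct mapping { long start; long len; bool hole; };

int main(void)
{
	const struct mapping in[] = {
		{ 0, 4, false }, { 4, 8, true }, { 12, 4, false },
	};
	struct mapping out[3];
	int i, cur_ext = 0;
	bool skip_holes = true;	/* plays the role of BMV_IF_NO_HOLES */

	for (i = 0; i < 3; i++) {
		out[cur_ext] = in[i];
		if (skip_holes && in[i].hole) {
			/* reuse this slot for the next mapping */
			memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
			continue;
		}
		cur_ext++;
	}
	printf("returned %d of 3 mappings\n", cur_ext);	/* prints 2 */
	return 0;
}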
@@ -5859,66 +5660,34 @@ xfs_bmap_eof(
5859} 5660}
5860 5661
5861#ifdef DEBUG 5662#ifdef DEBUG
5862STATIC 5663STATIC struct xfs_buf *
5863xfs_buf_t *
5864xfs_bmap_get_bp( 5664xfs_bmap_get_bp(
5865 xfs_btree_cur_t *cur, 5665 struct xfs_btree_cur *cur,
5866 xfs_fsblock_t bno) 5666 xfs_fsblock_t bno)
5867{ 5667{
5868 int i; 5668 struct xfs_log_item_desc *lidp;
5869 xfs_buf_t *bp; 5669 int i;
5870 5670
5871 if (!cur) 5671 if (!cur)
5872 return(NULL); 5672 return NULL;
5873
5874 bp = NULL;
5875 for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
5876 bp = cur->bc_bufs[i];
5877 if (!bp) break;
5878 if (XFS_BUF_ADDR(bp) == bno)
5879 break; /* Found it */
5880 }
5881 if (i == XFS_BTREE_MAXLEVELS)
5882 bp = NULL;
5883
5884 if (!bp) { /* Chase down all the log items to see if the bp is there */
5885 xfs_log_item_chunk_t *licp;
5886 xfs_trans_t *tp;
5887
5888 tp = cur->bc_tp;
5889 licp = &tp->t_items;
5890 while (!bp && licp != NULL) {
5891 if (xfs_lic_are_all_free(licp)) {
5892 licp = licp->lic_next;
5893 continue;
5894 }
5895 for (i = 0; i < licp->lic_unused; i++) {
5896 xfs_log_item_desc_t *lidp;
5897 xfs_log_item_t *lip;
5898 xfs_buf_log_item_t *bip;
5899 xfs_buf_t *lbp;
5900
5901 if (xfs_lic_isfree(licp, i)) {
5902 continue;
5903 }
5904
5905 lidp = xfs_lic_slot(licp, i);
5906 lip = lidp->lid_item;
5907 if (lip->li_type != XFS_LI_BUF)
5908 continue;
5909 5673
5910 bip = (xfs_buf_log_item_t *)lip; 5674 for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
5911 lbp = bip->bli_buf; 5675 if (!cur->bc_bufs[i])
5676 break;
5677 if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
5678 return cur->bc_bufs[i];
5679 }
5912 5680
5913 if (XFS_BUF_ADDR(lbp) == bno) { 5681 /* Chase down all the log items to see if the bp is there */
5914 bp = lbp; 5682 list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
5915 break; /* Found it */ 5683 struct xfs_buf_log_item *bip;
5916 } 5684 bip = (struct xfs_buf_log_item *)lidp->lid_item;
5917 } 5685 if (bip->bli_item.li_type == XFS_LI_BUF &&
5918 licp = licp->lic_next; 5686 XFS_BUF_ADDR(bip->bli_buf) == bno)
5919 } 5687 return bip->bli_buf;
5920 } 5688 }
5921 return(bp); 5689
5690 return NULL;
5922} 5691}
5923 5692
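
The rewritten xfs_bmap_get_bp() above drops the hand-rolled walk over log item chunks in favor of a single list_for_each_entry() pass over the transaction's t_items list. A self-contained sketch of what that intrusive-list idiom expands to (simplified types, not the kernel's list.h):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct list_head { struct list_head *next, *prev; };

struct item_desc {
        int type;
        struct list_head lid_trans;     /* embedded links into the list */
};

/* list_for_each_entry(lidp, head, lid_trans) boils down to: follow the
 * embedded links and recover each containing item_desc. */
static void walk_items(struct list_head *head)
{
        struct list_head *pos;

        for (pos = head->next; pos != head; pos = pos->next) {
                struct item_desc *lidp =
                        container_of(pos, struct item_desc, lid_trans);
                printf("item type %d\n", lidp->type);
        }
}
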
5924STATIC void 5693STATIC void
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 419dafb9d87d..b13569a6179b 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -28,20 +28,6 @@ struct xfs_trans;
28extern kmem_zone_t *xfs_bmap_free_item_zone; 28extern kmem_zone_t *xfs_bmap_free_item_zone;
29 29
30/* 30/*
31 * DELTA: describe a change to the in-core extent list.
32 *
 33 * Internally the use of xed_blockcount is somewhat funky.
34 * xed_blockcount contains an offset much of the time because this
35 * makes merging changes easier. (xfs_fileoff_t and xfs_filblks_t are
36 * the same underlying type).
37 */
38typedef struct xfs_extdelta
39{
40 xfs_fileoff_t xed_startoff; /* offset of range */
41 xfs_filblks_t xed_blockcount; /* blocks in range */
42} xfs_extdelta_t;
43
44/*
45 * List of extents to be free "later". 31 * List of extents to be free "later".
46 * The list is kept sorted on xbf_startblock. 32 * The list is kept sorted on xbf_startblock.
47 */ 33 */
@@ -82,16 +68,13 @@ typedef struct xfs_bmap_free
82#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ 68#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
83#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
84#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
85#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */ 71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
86#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */ 72#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
87#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */ 73#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
88#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */ 74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
89#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */
90#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */
91 /* combine contig. space */ 75 /* combine contig. space */
92#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ 76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
93/* XFS_BMAPI_DIRECT_IO 0x800 */ 77#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */
94#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */
95 /* need write cache flushing and no */ 78 /* need write cache flushing and no */
96 /* additional allocation alignments */ 79 /* additional allocation alignments */
97 80
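
Dropping XFS_BMAPI_EXACT, XFS_BMAPI_ASYNC, and the reserved DIRECT_IO bit lets the remaining flags above be repacked into consecutive bit values. That renumbering is safe only because these flags are in-memory and never persisted; every bitwise test keeps working as long as callers use the macro names. A tiny illustration (the 0x001 value of XFS_BMAPI_WRITE is assumed, as it falls outside the hunk):

#include <assert.h>

#define XFS_BMAPI_WRITE    0x001        /* assumed; not shown in the hunk */
#define XFS_BMAPI_ATTRFORK 0x010        /* was 0x020 before repacking */
#define XFS_BMAPI_CONTIG   0x100        /* was 0x400 before repacking */

int main(void)
{
        int flags = XFS_BMAPI_WRITE | XFS_BMAPI_CONTIG;

        assert(flags & XFS_BMAPI_CONTIG);       /* set */
        assert(!(flags & XFS_BMAPI_ATTRFORK));  /* not set */
        return 0;
}
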
@@ -100,9 +83,7 @@ typedef struct xfs_bmap_free
100 { XFS_BMAPI_DELAY, "DELAY" }, \ 83 { XFS_BMAPI_DELAY, "DELAY" }, \
101 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 84 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
102 { XFS_BMAPI_METADATA, "METADATA" }, \ 85 { XFS_BMAPI_METADATA, "METADATA" }, \
103 { XFS_BMAPI_EXACT, "EXACT" }, \
104 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 86 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
105 { XFS_BMAPI_ASYNC, "ASYNC" }, \
106 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ 87 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
107 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 88 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
108 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 89 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
@@ -310,9 +291,7 @@ xfs_bmapi(
310 xfs_extlen_t total, /* total blocks needed */ 291 xfs_extlen_t total, /* total blocks needed */
311 struct xfs_bmbt_irec *mval, /* output: map values */ 292 struct xfs_bmbt_irec *mval, /* output: map values */
312 int *nmap, /* i/o: mval size/count */ 293 int *nmap, /* i/o: mval size/count */
313 xfs_bmap_free_t *flist, /* i/o: list extents to free */ 294 xfs_bmap_free_t *flist); /* i/o: list extents to free */
314 xfs_extdelta_t *delta); /* o: change made to incore
315 extents */
316 295
317/* 296/*
318 * Map file blocks to filesystem blocks, simple version. 297 * Map file blocks to filesystem blocks, simple version.
@@ -346,8 +325,6 @@ xfs_bunmapi(
346 xfs_fsblock_t *firstblock, /* first allocated block 325 xfs_fsblock_t *firstblock, /* first allocated block
347 controls a.g. for allocs */ 326 controls a.g. for allocs */
348 xfs_bmap_free_t *flist, /* i/o: list extents to free */ 327 xfs_bmap_free_t *flist, /* i/o: list extents to free */
349 xfs_extdelta_t *delta, /* o: change made to incore
350 extents */
351 int *done); /* set if not done yet */ 328 int *done); /* set if not done yet */
352 329
353/* 330/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 416e47e54b83..87d3c10b6954 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,21 +24,16 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
38#include "xfs_alloc.h" 34#include "xfs_alloc.h"
39#include "xfs_btree.h" 35#include "xfs_btree.h"
40#include "xfs_btree_trace.h" 36#include "xfs_btree_trace.h"
41#include "xfs_ialloc.h"
42#include "xfs_itable.h" 37#include "xfs_itable.h"
43#include "xfs_bmap.h" 38#include "xfs_bmap.h"
44#include "xfs_error.h" 39#include "xfs_error.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 96be4b0f2496..829af92f0fba 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,20 +24,15 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
38#include "xfs_btree.h" 34#include "xfs_btree.h"
39#include "xfs_btree_trace.h" 35#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h"
41#include "xfs_error.h" 36#include "xfs_error.h"
42#include "xfs_trace.h" 37#include "xfs_trace.h"
43 38
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e69eab9..1b09d7a280df 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -24,7 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_buf_item.h" 28#include "xfs_buf_item.h"
30#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
@@ -34,6 +33,12 @@
34 33
35kmem_zone_t *xfs_buf_item_zone; 34kmem_zone_t *xfs_buf_item_zone;
36 35
36static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
37{
38 return container_of(lip, struct xfs_buf_log_item, bli_item);
39}
40
41
37#ifdef XFS_TRANS_DEBUG 42#ifdef XFS_TRANS_DEBUG
38/* 43/*
39 * This function uses an alternate strategy for tracking the bytes 44 * This function uses an alternate strategy for tracking the bytes
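
The new BUF_ITEM() helper above is the usual container_of() downcast: each iop_* callback now receives the generic struct xfs_log_item * embedded in the buf log item and recovers its container from it. A self-contained sketch of the pattern with illustrative types:

#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct log_item { int li_type; };

struct buf_log_item {
        struct log_item bli_item;       /* embedded generic base object */
        int             bli_flags;
};

/* Recover the containing structure from a pointer to its embedded base. */
static inline struct buf_log_item *BUF_ITEM_SKETCH(struct log_item *lip)
{
        return container_of(lip, struct buf_log_item, bli_item);
}
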
@@ -64,7 +69,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 69 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 70 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 71 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 72 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 73 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 74 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 75 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -151,12 +156,13 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
151 */ 156 */
152STATIC uint 157STATIC uint
153xfs_buf_item_size( 158xfs_buf_item_size(
154 xfs_buf_log_item_t *bip) 159 struct xfs_log_item *lip)
155{ 160{
156 uint nvecs; 161 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
157 int next_bit; 162 struct xfs_buf *bp = bip->bli_buf;
158 int last_bit; 163 uint nvecs;
159 xfs_buf_t *bp; 164 int next_bit;
165 int last_bit;
160 166
161 ASSERT(atomic_read(&bip->bli_refcount) > 0); 167 ASSERT(atomic_read(&bip->bli_refcount) > 0);
162 if (bip->bli_flags & XFS_BLI_STALE) { 168 if (bip->bli_flags & XFS_BLI_STALE) {
@@ -166,11 +172,10 @@ xfs_buf_item_size(
166 * cancel flag in it. 172 * cancel flag in it.
167 */ 173 */
168 trace_xfs_buf_item_size_stale(bip); 174 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 175 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 176 return 1;
171 } 177 }
172 178
173 bp = bip->bli_buf;
174 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 179 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
175 nvecs = 1; 180 nvecs = 1;
176 last_bit = xfs_next_bit(bip->bli_format.blf_data_map, 181 last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
@@ -197,9 +202,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 202 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 203 last_bit = next_bit;
199 nvecs++; 204 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 205 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 206 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 207 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 208 last_bit = next_bit;
204 nvecs++; 209 nvecs++;
205 } else { 210 } else {
@@ -219,13 +224,13 @@ xfs_buf_item_size(
219 */ 224 */
220STATIC void 225STATIC void
221xfs_buf_item_format( 226xfs_buf_item_format(
222 xfs_buf_log_item_t *bip, 227 struct xfs_log_item *lip,
223 xfs_log_iovec_t *log_vector) 228 struct xfs_log_iovec *vecp)
224{ 229{
230 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
231 struct xfs_buf *bp = bip->bli_buf;
225 uint base_size; 232 uint base_size;
226 uint nvecs; 233 uint nvecs;
227 xfs_log_iovec_t *vecp;
228 xfs_buf_t *bp;
229 int first_bit; 234 int first_bit;
230 int last_bit; 235 int last_bit;
231 int next_bit; 236 int next_bit;
@@ -235,8 +240,6 @@ xfs_buf_item_format(
235 ASSERT(atomic_read(&bip->bli_refcount) > 0); 240 ASSERT(atomic_read(&bip->bli_refcount) > 0);
236 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 241 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
237 (bip->bli_flags & XFS_BLI_STALE)); 242 (bip->bli_flags & XFS_BLI_STALE));
238 bp = bip->bli_buf;
239 vecp = log_vector;
240 243
241 /* 244 /*
242 * The size of the base structure is the size of the 245 * The size of the base structure is the size of the
@@ -248,12 +251,26 @@ xfs_buf_item_format(
248 base_size = 251 base_size =
249 (uint)(sizeof(xfs_buf_log_format_t) + 252 (uint)(sizeof(xfs_buf_log_format_t) +
250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 253 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 254 vecp->i_addr = &bip->bli_format;
252 vecp->i_len = base_size; 255 vecp->i_len = base_size;
253 vecp->i_type = XLOG_REG_TYPE_BFORMAT; 256 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
254 vecp++; 257 vecp++;
255 nvecs = 1; 258 nvecs = 1;
256 259
260 /*
261 * If it is an inode buffer, transfer the in-memory state to the
262 * format flags and clear the in-memory state. We do not transfer
263 * this state if the inode buffer allocation has not yet been committed
 264 * to the log, as setting the XFS_BLF_INODE_BUF flag will prevent
265 * correct replay of the inode allocation.
266 */
267 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
268 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
269 xfs_log_item_in_current_chkpt(lip)))
270 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
271 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
272 }
273
257 if (bip->bli_flags & XFS_BLI_STALE) { 274 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 275 /*
259 * The buffer is stale, so all we need to log 276 * The buffer is stale, so all we need to log
@@ -261,7 +278,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 278 * cancel flag in it.
262 */ 279 */
263 trace_xfs_buf_item_format_stale(bip); 280 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 281 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 282 bip->bli_format.blf_size = nvecs;
266 return; 283 return;
267 } 284 }
@@ -294,28 +311,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 311 * keep counting and scanning.
295 */ 312 */
296 if (next_bit == -1) { 313 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 314 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 315 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 316 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 317 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 318 nvecs++;
302 break; 319 break;
303 } else if (next_bit != last_bit + 1) { 320 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 321 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 322 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 323 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 324 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 325 nvecs++;
309 vecp++; 326 vecp++;
310 first_bit = next_bit; 327 first_bit = next_bit;
311 last_bit = next_bit; 328 last_bit = next_bit;
312 nbits = 1; 329 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 330 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 331 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 332 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 333 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 334 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 335 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 336 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 320/* You would think we need to bump the nvecs here too, but we do not: 337/* You would think we need to bump the nvecs here too, but we do not:
321 * this number is used by recovery, and it gets confused by the boundary 338 * this number is used by recovery, and it gets confused by the boundary
@@ -341,61 +358,91 @@ xfs_buf_item_format(
341} 358}
342 359
343/* 360/*
344 * This is called to pin the buffer associated with the buf log 361 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 362 * so it cannot be written out.
346 * on the buffer to do this. 363 *
364 * We also always take a reference to the buffer log item here so that the bli
365 * is held while the item is pinned in memory. This means that we can
366 * unconditionally drop the reference count a transaction holds when the
367 * transaction is completed.
347 */ 368 */
348STATIC void 369STATIC void
349xfs_buf_item_pin( 370xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 371 struct xfs_log_item *lip)
351{ 372{
352 xfs_buf_t *bp; 373 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
353 374
354 bp = bip->bli_buf; 375 ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
355 ASSERT(XFS_BUF_ISBUSY(bp));
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 376 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 377 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 378 (bip->bli_flags & XFS_BLI_STALE));
379
359 trace_xfs_buf_item_pin(bip); 380 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp);
361}
362 381
382 atomic_inc(&bip->bli_refcount);
383 atomic_inc(&bip->bli_buf->b_pin_count);
384}
363 385
364/* 386/*
365 * This is called to unpin the buffer associated with the buf log 387 * This is called to unpin the buffer associated with the buf log
366 * item which was previously pinned with a call to xfs_buf_item_pin(). 388 * item which was previously pinned with a call to xfs_buf_item_pin().
367 * Just call bunpin() on the buffer to do this.
368 * 389 *
369 * Also drop the reference to the buf item for the current transaction. 390 * Also drop the reference to the buf item for the current transaction.
370 * If the XFS_BLI_STALE flag is set and we are the last reference, 391 * If the XFS_BLI_STALE flag is set and we are the last reference,
371 * then free up the buf log item and unlock the buffer. 392 * then free up the buf log item and unlock the buffer.
393 *
394 * If the remove flag is set we are called from uncommit in the
395 * forced-shutdown path. If that is true and the reference count on
396 * the log item is going to drop to zero we need to free the item's
397 * descriptor in the transaction.
372 */ 398 */
373STATIC void 399STATIC void
374xfs_buf_item_unpin( 400xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 401 struct xfs_log_item *lip,
376 int stale) 402 int remove)
377{ 403{
378 struct xfs_ail *ailp; 404 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
379 xfs_buf_t *bp; 405 xfs_buf_t *bp = bip->bli_buf;
406 struct xfs_ail *ailp = lip->li_ailp;
407 int stale = bip->bli_flags & XFS_BLI_STALE;
380 int freed; 408 int freed;
381 409
382 bp = bip->bli_buf;
383 ASSERT(bp != NULL);
384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); 410 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
385 ASSERT(atomic_read(&bip->bli_refcount) > 0); 411 ASSERT(atomic_read(&bip->bli_refcount) > 0);
412
386 trace_xfs_buf_item_unpin(bip); 413 trace_xfs_buf_item_unpin(bip);
387 414
388 freed = atomic_dec_and_test(&bip->bli_refcount); 415 freed = atomic_dec_and_test(&bip->bli_refcount);
389 ailp = bip->bli_item.li_ailp; 416
390 xfs_bunpin(bp); 417 if (atomic_dec_and_test(&bp->b_pin_count))
418 wake_up_all(&bp->b_waiters);
419
391 if (freed && stale) { 420 if (freed && stale) {
392 ASSERT(bip->bli_flags & XFS_BLI_STALE); 421 ASSERT(bip->bli_flags & XFS_BLI_STALE);
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 422 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 423 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 424 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 425 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
426
397 trace_xfs_buf_item_unpin_stale(bip); 427 trace_xfs_buf_item_unpin_stale(bip);
398 428
429 if (remove) {
430 /*
431 * We have to remove the log item from the transaction
432 * as we are about to release our reference to the
433 * buffer. If we don't, the unlock that occurs later
 434 * in xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on.
436 */
437 xfs_trans_del_item(lip);
438
439 /*
440 * Since the transaction no longer refers to the buffer,
441 * the buffer should no longer refer to the transaction.
442 */
443 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
444 }
445
399 /* 446 /*
400 * If we get called here because of an IO error, we may 447 * If we get called here because of an IO error, we may
401 * or may not have the item on the AIL. xfs_trans_ail_delete() 448 * or may not have the item on the AIL. xfs_trans_ail_delete()
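
The pin/unpin pair above now does its own reference accounting: pinning bumps both the bli refcount and the buffer's b_pin_count (replacing xfs_bpin()), and unpinning drops both, waking waiters when the last pin goes away. A hedged sketch using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdbool.h>

struct buf { atomic_int pin_count; };   /* stands in for b_pin_count */

struct buf_log_item {
        atomic_int  refcount;           /* stands in for bli_refcount */
        struct buf *buf;
};

static void item_pin(struct buf_log_item *bip)
{
        atomic_fetch_add(&bip->refcount, 1);
        atomic_fetch_add(&bip->buf->pin_count, 1);
}

/* Returns true when the caller dropped the last item reference and must
 * handle the stale-buffer teardown, as the kernel code above does. */
static bool item_unpin(struct buf_log_item *bip)
{
        bool freed = atomic_fetch_sub(&bip->refcount, 1) == 1;

        if (atomic_fetch_sub(&bip->buf->pin_count, 1) == 1)
                ;                       /* wake_up_all(&bp->b_waiters) here */
        return freed;
}
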
@@ -417,54 +464,6 @@ xfs_buf_item_unpin(
417} 464}
418 465
419/* 466/*
420 * this is called from uncommit in the forced-shutdown path.
421 * we need to check to see if the reference count on the log item
422 * is going to drop to zero. If so, unpin will free the log item
423 * so we need to free the item's descriptor (that points to the item)
424 * in the transaction.
425 */
426STATIC void
427xfs_buf_item_unpin_remove(
428 xfs_buf_log_item_t *bip,
429 xfs_trans_t *tp)
430{
431 xfs_buf_t *bp;
432 xfs_log_item_desc_t *lidp;
433 int stale = 0;
434
435 bp = bip->bli_buf;
436 /*
437 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
438 */
439 if ((atomic_read(&bip->bli_refcount) == 1) &&
440 (bip->bli_flags & XFS_BLI_STALE)) {
441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
442 trace_xfs_buf_item_unpin_stale(bip);
443
444 /*
445 * yes -- clear the xaction descriptor in-use flag
446 * and free the chunk if required. We can safely
447 * do some work here and then call buf_item_unpin
448 * to do the rest because if the if is true, then
449 * we are holding the buffer locked so no one else
450 * will be able to bump up the refcount.
451 */
452 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
453 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
454 xfs_trans_free_item(tp, lidp);
455 /*
456 * Since the transaction no longer refers to the buffer,
457 * the buffer should no longer refer to the transaction.
458 */
459 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
460 }
461
462 xfs_buf_item_unpin(bip, stale);
463
464 return;
465}
466
467/*
468 * This is called to attempt to lock the buffer associated with this 467 * This is called to attempt to lock the buffer associated with this
469 * buf log item. Don't sleep on the buffer lock. If we can't get 468 * buf log item. Don't sleep on the buffer lock. If we can't get
470 * the lock right away, return 0. If we can get the lock, take a 469 * the lock right away, return 0. If we can get the lock, take a
@@ -474,11 +473,11 @@ xfs_buf_item_unpin_remove(
474 */ 473 */
475STATIC uint 474STATIC uint
476xfs_buf_item_trylock( 475xfs_buf_item_trylock(
477 xfs_buf_log_item_t *bip) 476 struct xfs_log_item *lip)
478{ 477{
479 xfs_buf_t *bp; 478 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
479 struct xfs_buf *bp = bip->bli_buf;
480 480
481 bp = bip->bli_buf;
482 if (XFS_BUF_ISPINNED(bp)) 481 if (XFS_BUF_ISPINNED(bp))
483 return XFS_ITEM_PINNED; 482 return XFS_ITEM_PINNED;
484 if (!XFS_BUF_CPSEMA(bp)) 483 if (!XFS_BUF_CPSEMA(bp))
@@ -495,98 +494,81 @@ xfs_buf_item_trylock(
495} 494}
496 495
497/* 496/*
498 * Release the buffer associated with the buf log item. 497 * Release the buffer associated with the buf log item. If there is no dirty
499 * If there is no dirty logged data associated with the 498 * logged data associated with the buffer recorded in the buf log item, then
500 * buffer recorded in the buf log item, then free the 499 * free the buf log item and remove the reference to it in the buffer.
501 * buf log item and remove the reference to it in the 500 *
502 * buffer. 501 * This call ignores the recursion count. It is only called when the buffer
502 * should REALLY be unlocked, regardless of the recursion count.
503 * 503 *
504 * This call ignores the recursion count. It is only called 504 * We unconditionally drop the transaction's reference to the log item. If the
505 * when the buffer should REALLY be unlocked, regardless 505 * item was logged, then another reference was taken when it was pinned, so we
506 * of the recursion count. 506 * can safely drop the transaction reference now. This also allows us to avoid
507 * potential races with the unpin code freeing the bli by not referencing the
508 * bli after we've dropped the reference count.
507 * 509 *
508 * If the XFS_BLI_HOLD flag is set in the buf log item, then 510 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
509 * free the log item if necessary but do not unlock the buffer. 511 * if necessary but do not unlock the buffer. This is for support of
510 * This is for support of xfs_trans_bhold(). Make sure the 512 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
511 * XFS_BLI_HOLD field is cleared if we don't free the item. 513 * free the item.
512 */ 514 */
513STATIC void 515STATIC void
514xfs_buf_item_unlock( 516xfs_buf_item_unlock(
515 xfs_buf_log_item_t *bip) 517 struct xfs_log_item *lip)
516{ 518{
517 int aborted; 519 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
518 xfs_buf_t *bp; 520 struct xfs_buf *bp = bip->bli_buf;
519 uint hold; 521 int aborted;
522 uint hold;
520 523
521 bp = bip->bli_buf; 524 /* Clear the buffer's association with this transaction. */
525 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
522 526
523 /* 527 /*
524 * Clear the buffer's association with this transaction. 528 * If this is a transaction abort, don't return early. Instead, allow
529 * the brelse to happen. Normally it would be done for stale
530 * (cancelled) buffers at unpin time, but we'll never go through the
531 * pin/unpin cycle if we abort inside commit.
525 */ 532 */
526 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 533 aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
527 534
528 /* 535 /*
529 * If this is a transaction abort, don't return early. 536 * Before possibly freeing the buf item, determine if we should
530 * Instead, allow the brelse to happen. 537 * release the buffer at the end of this routine.
531 * Normally it would be done for stale (cancelled) buffers
532 * at unpin time, but we'll never go through the pin/unpin
533 * cycle if we abort inside commit.
534 */ 538 */
535 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 539 hold = bip->bli_flags & XFS_BLI_HOLD;
540
541 /* Clear the per transaction state. */
542 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
536 543
537 /* 544 /*
538 * If the buf item is marked stale, then don't do anything. 545 * If the buf item is marked stale, then don't do anything. We'll
539 * We'll unlock the buffer and free the buf item when the 546 * unlock the buffer and free the buf item when the buffer is unpinned
540 * buffer is unpinned for the last time. 547 * for the last time.
541 */ 548 */
542 if (bip->bli_flags & XFS_BLI_STALE) { 549 if (bip->bli_flags & XFS_BLI_STALE) {
543 bip->bli_flags &= ~XFS_BLI_LOGGED;
544 trace_xfs_buf_item_unlock_stale(bip); 550 trace_xfs_buf_item_unlock_stale(bip);
545 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 551 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
546 if (!aborted) 552 if (!aborted) {
553 atomic_dec(&bip->bli_refcount);
547 return; 554 return;
555 }
548 } 556 }
549 557
550 /*
551 * Drop the transaction's reference to the log item if
552 * it was not logged as part of the transaction. Otherwise
553 * we'll drop the reference in xfs_buf_item_unpin() when
554 * the transaction is really through with the buffer.
555 */
556 if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
557 atomic_dec(&bip->bli_refcount);
558 } else {
559 /*
560 * Clear the logged flag since this is per
561 * transaction state.
562 */
563 bip->bli_flags &= ~XFS_BLI_LOGGED;
564 }
565
566 /*
567 * Before possibly freeing the buf item, determine if we should
568 * release the buffer at the end of this routine.
569 */
570 hold = bip->bli_flags & XFS_BLI_HOLD;
571 trace_xfs_buf_item_unlock(bip); 558 trace_xfs_buf_item_unlock(bip);
572 559
573 /* 560 /*
574 * If the buf item isn't tracking any data, free it. 561 * If the buf item isn't tracking any data, free it, otherwise drop the
575 * Otherwise, if XFS_BLI_HOLD is set clear it. 562 * reference we hold to it.
576 */ 563 */
577 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 564 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
578 bip->bli_format.blf_map_size)) { 565 bip->bli_format.blf_map_size))
579 xfs_buf_item_relse(bp); 566 xfs_buf_item_relse(bp);
580 } else if (hold) { 567 else
581 bip->bli_flags &= ~XFS_BLI_HOLD; 568 atomic_dec(&bip->bli_refcount);
582 }
583 569
584 /* 570 if (!hold)
585 * Release the buffer if XFS_BLI_HOLD was not set.
586 */
587 if (!hold) {
588 xfs_buf_relse(bp); 571 xfs_buf_relse(bp);
589 }
590} 572}
591 573
592/* 574/*
@@ -609,16 +591,16 @@ xfs_buf_item_unlock(
609 */ 591 */
610STATIC xfs_lsn_t 592STATIC xfs_lsn_t
611xfs_buf_item_committed( 593xfs_buf_item_committed(
612 xfs_buf_log_item_t *bip, 594 struct xfs_log_item *lip,
613 xfs_lsn_t lsn) 595 xfs_lsn_t lsn)
614{ 596{
597 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
598
615 trace_xfs_buf_item_committed(bip); 599 trace_xfs_buf_item_committed(bip);
616 600
617 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 601 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
618 (bip->bli_item.li_lsn != 0)) { 602 return lip->li_lsn;
619 return bip->bli_item.li_lsn; 603 return lsn;
620 }
621 return (lsn);
622} 604}
623 605
624/* 606/*
@@ -628,15 +610,16 @@ xfs_buf_item_committed(
628 */ 610 */
629STATIC void 611STATIC void
630xfs_buf_item_push( 612xfs_buf_item_push(
631 xfs_buf_log_item_t *bip) 613 struct xfs_log_item *lip)
632{ 614{
633 xfs_buf_t *bp; 615 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
616 struct xfs_buf *bp = bip->bli_buf;
634 617
635 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 618 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
619 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
620
636 trace_xfs_buf_item_push(bip); 621 trace_xfs_buf_item_push(bip);
637 622
638 bp = bip->bli_buf;
639 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
640 xfs_buf_relse(bp); 623 xfs_buf_relse(bp);
641} 624}
642 625
@@ -648,22 +631,24 @@ xfs_buf_item_push(
648 */ 631 */
649STATIC void 632STATIC void
650xfs_buf_item_pushbuf( 633xfs_buf_item_pushbuf(
651 xfs_buf_log_item_t *bip) 634 struct xfs_log_item *lip)
652{ 635{
653 xfs_buf_t *bp; 636 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
637 struct xfs_buf *bp = bip->bli_buf;
654 638
655 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 639 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
640 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
641
656 trace_xfs_buf_item_pushbuf(bip); 642 trace_xfs_buf_item_pushbuf(bip);
657 643
658 bp = bip->bli_buf;
659 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
660 xfs_buf_delwri_promote(bp); 644 xfs_buf_delwri_promote(bp);
661 xfs_buf_relse(bp); 645 xfs_buf_relse(bp);
662} 646}
663 647
664/* ARGSUSED */
665STATIC void 648STATIC void
666xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) 649xfs_buf_item_committing(
650 struct xfs_log_item *lip,
651 xfs_lsn_t commit_lsn)
667{ 652{
668} 653}
669 654
@@ -671,21 +656,16 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
671 * This is the ops vector shared by all buf log items. 656 * This is the ops vector shared by all buf log items.
672 */ 657 */
673static struct xfs_item_ops xfs_buf_item_ops = { 658static struct xfs_item_ops xfs_buf_item_ops = {
674 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, 659 .iop_size = xfs_buf_item_size,
675 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 660 .iop_format = xfs_buf_item_format,
676 xfs_buf_item_format, 661 .iop_pin = xfs_buf_item_pin,
677 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 662 .iop_unpin = xfs_buf_item_unpin,
678 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 663 .iop_trylock = xfs_buf_item_trylock,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 664 .iop_unlock = xfs_buf_item_unlock,
680 xfs_buf_item_unpin_remove, 665 .iop_committed = xfs_buf_item_committed,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 666 .iop_push = xfs_buf_item_push,
682 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, 667 .iop_pushbuf = xfs_buf_item_pushbuf,
683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 668 .iop_committing = xfs_buf_item_committing
684 xfs_buf_item_committed,
685 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
686 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
687 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
688 xfs_buf_item_committing
689}; 669};
690 670
691 671
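
The old table filled each slot by casting functions taking xfs_buf_log_item_t * to the xfs_log_item_t * signature; calling through such a converted pointer is undefined behavior in C. With every handler now taking struct xfs_log_item * directly (and downcasting via BUF_ITEM() internally), the assignments need no casts. In miniature:

struct log_item;

struct item_ops {
        unsigned (*iop_size)(struct log_item *lip);
};

static unsigned buf_item_size(struct log_item *lip)
{
        (void)lip;
        return 1;                       /* placeholder */
}

/* old:  .iop_size = (unsigned (*)(struct log_item *))old_size   -- UB */
static const struct item_ops buf_item_ops_sketch = {
        .iop_size = buf_item_size,      /* matching type, no cast needed */
};
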
@@ -714,7 +694,6 @@ xfs_buf_item_init(
714 */ 694 */
715 if (bp->b_mount != mp) 695 if (bp->b_mount != mp)
716 bp->b_mount = mp; 696 bp->b_mount = mp;
717 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
718 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
719 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
720 if (lip->li_type == XFS_LI_BUF) { 699 if (lip->li_type == XFS_LI_BUF) {
@@ -723,20 +702,17 @@ xfs_buf_item_init(
723 } 702 }
724 703
725 /* 704 /*
726 * chunks is the number of XFS_BLI_CHUNK size pieces 705 * chunks is the number of XFS_BLF_CHUNK size pieces
727 * the buffer can be divided into. Make sure not to 706 * the buffer can be divided into. Make sure not to
728 * truncate any pieces. map_size is the size of the 707 * truncate any pieces. map_size is the size of the
729 * bitmap needed to describe the chunks of the buffer. 708 * bitmap needed to describe the chunks of the buffer.
730 */ 709 */
731 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 710 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
732 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 711 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
733 712
734 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 713 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
735 KM_SLEEP); 714 KM_SLEEP);
736 bip->bli_item.li_type = XFS_LI_BUF; 715 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
737 bip->bli_item.li_ops = &xfs_buf_item_ops;
738 bip->bli_item.li_mountp = mp;
739 bip->bli_item.li_ailp = mp->m_ail;
740 bip->bli_buf = bp; 716 bip->bli_buf = bp;
741 xfs_buf_hold(bp); 717 xfs_buf_hold(bp);
742 bip->bli_format.blf_type = XFS_LI_BUF; 718 bip->bli_format.blf_type = XFS_LI_BUF;
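
The chunks/map_size arithmetic above sizes the dirty-region bitmap: the buffer is split into 128-byte (XFS_BLF_CHUNK) pieces, with one bit per piece packed NBWORD bits to a word. A worked example for a hypothetical 4096-byte buffer:

#include <stdio.h>

#define XFS_BLF_CHUNK     128
#define XFS_BLF_SHIFT     7
#define BIT_TO_WORD_SHIFT 5
#define NBWORD            (8 * (int)sizeof(unsigned int))

int main(void)
{
        int count = 4096;       /* hypothetical XFS_BUF_COUNT(bp) */
        int chunks = (count + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT;
        int map_size = (chunks + NBWORD) >> BIT_TO_WORD_SHIFT;

        /* 4096 bytes -> 32 chunks -> 2 words of map bits on 32-bit int. */
        printf("chunks=%d map_size=%d\n", chunks, map_size);
        return 0;
}
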
@@ -799,8 +775,8 @@ xfs_buf_item_log(
799 /* 775 /*
800 * Convert byte offsets to bit numbers. 776 * Convert byte offsets to bit numbers.
801 */ 777 */
802 first_bit = first >> XFS_BLI_SHIFT; 778 first_bit = first >> XFS_BLF_SHIFT;
803 last_bit = last >> XFS_BLI_SHIFT; 779 last_bit = last >> XFS_BLF_SHIFT;
804 780
805 /* 781 /*
806 * Calculate the total number of bits to be set. 782 * Calculate the total number of bits to be set.
@@ -1103,15 +1079,14 @@ xfs_buf_error_relse(
1103 * It is called by xfs_buf_iodone_callbacks() above which will take 1079 * It is called by xfs_buf_iodone_callbacks() above which will take
1104 * care of cleaning up the buffer itself. 1080 * care of cleaning up the buffer itself.
1105 */ 1081 */
1106/* ARGSUSED */
1107void 1082void
1108xfs_buf_iodone( 1083xfs_buf_iodone(
1109 xfs_buf_t *bp, 1084 struct xfs_buf *bp,
1110 xfs_buf_log_item_t *bip) 1085 struct xfs_log_item *lip)
1111{ 1086{
1112 struct xfs_ail *ailp = bip->bli_item.li_ailp; 1087 struct xfs_ail *ailp = lip->li_ailp;
1113 1088
1114 ASSERT(bip->bli_buf == bp); 1089 ASSERT(BUF_ITEM(lip)->bli_buf == bp);
1115 1090
1116 xfs_buf_rele(bp); 1091 xfs_buf_rele(bp);
1117 1092
@@ -1125,6 +1100,6 @@ xfs_buf_iodone(
1125 * Either way, AIL is useless if we're forcing a shutdown. 1100 * Either way, AIL is useless if we're forcing a shutdown.
1126 */ 1101 */
1127 spin_lock(&ailp->xa_lock); 1102 spin_lock(&ailp->xa_lock);
1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 1103 xfs_trans_ail_delete(ailp, lip);
1129 xfs_buf_item_free(bip); 1104 xfs_buf_item_free(BUF_ITEM(lip));
1130} 1105}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..0e2ed43f16c7 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
26 * have been logged. 26 * have been logged.
27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. 27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
28 */ 28 */
29typedef struct xfs_buf_log_format_t { 29typedef struct xfs_buf_log_format {
30 unsigned short blf_type; /* buf log item type indicator */ 30 unsigned short blf_type; /* buf log item type indicator */
31 unsigned short blf_size; /* size of this item */ 31 unsigned short blf_size; /* size of this item */
32 ushort blf_flags; /* misc state */ 32 ushort blf_flags; /* misc state */
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
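
The rename splits the two flag namespaces cleanly: XFS_BLF_* values belong to the on-disk blf_flags word of the log format, while XFS_BLI_* values, including the new XFS_BLI_INODE_BUF (0x40), are purely in-memory state in bli_flags. A simplified sketch of the handoff between the two (the log-checkpoint guard from the xfs_buf_item_format() hunk earlier in this patch is omitted):

#define XFS_BLF_INODE_BUF 0x1   /* on-disk: goes into blf_flags */
#define XFS_BLI_INODE_BUF 0x40  /* in-memory only: lives in bli_flags */

struct buf_log_format { unsigned short blf_flags; };
struct buf_log_item {
        unsigned              bli_flags;
        struct buf_log_format bli_format;
};

/* At format time, translate the in-memory flag to its on-disk
 * counterpart, then clear the in-memory state. */
static void transfer_inode_buf_state(struct buf_log_item *bip)
{
        if (bip->bli_flags & XFS_BLI_INODE_BUF) {
                bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
                bip->bli_flags &= ~XFS_BLI_INODE_BUF;
        }
}
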
@@ -122,7 +124,7 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
122 void(*)(struct xfs_buf *, xfs_log_item_t *), 124 void(*)(struct xfs_buf *, xfs_log_item_t *),
123 xfs_log_item_t *); 125 xfs_log_item_t *);
124void xfs_buf_iodone_callbacks(struct xfs_buf *); 126void xfs_buf_iodone_callbacks(struct xfs_buf *);
125void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *); 127void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
126 128
127#ifdef XFS_TRANS_DEBUG 129#ifdef XFS_TRANS_DEBUG
128void 130void
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 0ca556b4bf31..30fa0e206fba 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -25,19 +25,14 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h" 31#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
39#include "xfs_alloc.h" 35#include "xfs_alloc.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 36#include "xfs_bmap.h"
42#include "xfs_attr.h" 37#include "xfs_attr.h"
43#include "xfs_attr_leaf.h" 38#include "xfs_attr_leaf.h"
@@ -581,16 +576,14 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
581 xfs_da_intnode_t *node; 576 xfs_da_intnode_t *node;
582 xfs_da_node_entry_t *btree; 577 xfs_da_node_entry_t *btree;
583 int tmp; 578 int tmp;
584 xfs_mount_t *mp;
585 579
586 node = oldblk->bp->data; 580 node = oldblk->bp->data;
587 mp = state->mp;
588 ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); 581 ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
589 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); 582 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
590 ASSERT(newblk->blkno != 0); 583 ASSERT(newblk->blkno != 0);
591 if (state->args->whichfork == XFS_DATA_FORK) 584 if (state->args->whichfork == XFS_DATA_FORK)
592 ASSERT(newblk->blkno >= mp->m_dirleafblk && 585 ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
593 newblk->blkno < mp->m_dirfreeblk); 586 newblk->blkno < state->mp->m_dirfreeblk);
594 587
595 /* 588 /*
596 * We may need to make some room before we insert the new node. 589 * We may need to make some room before we insert the new node.
@@ -1601,7 +1594,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1601 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| 1594 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
1602 XFS_BMAPI_CONTIG, 1595 XFS_BMAPI_CONTIG,
1603 args->firstblock, args->total, &map, &nmap, 1596 args->firstblock, args->total, &map, &nmap,
1604 args->flist, NULL))) { 1597 args->flist))) {
1605 return error; 1598 return error;
1606 } 1599 }
1607 ASSERT(nmap <= 1); 1600 ASSERT(nmap <= 1);
@@ -1622,8 +1615,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1622 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| 1615 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
1623 XFS_BMAPI_METADATA, 1616 XFS_BMAPI_METADATA,
1624 args->firstblock, args->total, 1617 args->firstblock, args->total,
1625 &mapp[mapi], &nmap, args->flist, 1618 &mapp[mapi], &nmap, args->flist))) {
1626 NULL))) {
1627 kmem_free(mapp); 1619 kmem_free(mapp);
1628 return error; 1620 return error;
1629 } 1621 }
@@ -1884,7 +1876,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1884 */ 1876 */
1885 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, 1877 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
1886 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, 1878 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1887 0, args->firstblock, args->flist, NULL, 1879 0, args->firstblock, args->flist,
1888 &done)) == ENOSPC) { 1880 &done)) == ENOSPC) {
1889 if (w != XFS_DATA_FORK) 1881 if (w != XFS_DATA_FORK)
1890 break; 1882 break;
@@ -1989,7 +1981,7 @@ xfs_da_do_buf(
1989 nfsb, 1981 nfsb,
1990 XFS_BMAPI_METADATA | 1982 XFS_BMAPI_METADATA |
1991 xfs_bmapi_aflag(whichfork), 1983 xfs_bmapi_aflag(whichfork),
1992 NULL, 0, mapp, &nmap, NULL, NULL))) 1984 NULL, 0, mapp, &nmap, NULL)))
1993 goto exit0; 1985 goto exit0;
1994 } 1986 }
1995 } else { 1987 } else {
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 5bba29a07812..3b9582c60a22 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,24 +24,15 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 29#include "xfs_dinode.h"
36#include "xfs_inode.h" 30#include "xfs_inode.h"
37#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
38#include "xfs_bmap.h" 32#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_itable.h" 33#include "xfs_itable.h"
42#include "xfs_dfrag.h" 34#include "xfs_dfrag.h"
43#include "xfs_error.h" 35#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_vnodeops.h" 36#include "xfs_vnodeops.h"
46#include "xfs_trace.h" 37#include "xfs_trace.h"
47 38
@@ -69,7 +60,9 @@ xfs_swapext(
69 goto out; 60 goto out;
70 } 61 }
71 62
72 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { 63 if (!(file->f_mode & FMODE_WRITE) ||
64 !(file->f_mode & FMODE_READ) ||
65 (file->f_flags & O_APPEND)) {
73 error = XFS_ERROR(EBADF); 66 error = XFS_ERROR(EBADF);
74 goto out_put_file; 67 goto out_put_file;
75 } 68 }
@@ -81,6 +74,7 @@ xfs_swapext(
81 } 74 }
82 75
83 if (!(tmp_file->f_mode & FMODE_WRITE) || 76 if (!(tmp_file->f_mode & FMODE_WRITE) ||
77 !(tmp_file->f_mode & FMODE_READ) ||
84 (tmp_file->f_flags & O_APPEND)) { 78 (tmp_file->f_flags & O_APPEND)) {
85 error = XFS_ERROR(EBADF); 79 error = XFS_ERROR(EBADF);
86 goto out_put_tmp_file; 80 goto out_put_tmp_file;
@@ -422,11 +416,8 @@ xfs_swap_extents(
422 } 416 }
423 417
424 418
425 IHOLD(ip); 419 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
426 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 420 xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
427
428 IHOLD(tip);
429 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
430 421
431 xfs_trans_log_inode(tp, ip, ilf_fields); 422 xfs_trans_log_inode(tp, ip, ilf_fields);
432 xfs_trans_log_inode(tp, tip, tilf_fields); 423 xfs_trans_log_inode(tp, tip, tilf_fields);
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 42520f041265..a1321bc7f192 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -25,13 +25,11 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 31#include "xfs_alloc_btree.h"
33#include "xfs_dir2_sf.h" 32#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 33#include "xfs_dinode.h"
36#include "xfs_inode.h" 34#include "xfs_inode.h"
37#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
@@ -382,7 +380,7 @@ xfs_readdir(
382 int rval; /* return value */ 380 int rval; /* return value */
383 int v; /* type-checking value */ 381 int v; /* type-checking value */
384 382
385 xfs_itrace_entry(dp); 383 trace_xfs_readdir(dp);
386 384
387 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 385 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
388 return XFS_ERROR(EIO); 386 return XFS_ERROR(EIO);
@@ -549,7 +547,7 @@ xfs_dir2_grow_inode(
549 if ((error = xfs_bmapi(tp, dp, bno, count, 547 if ((error = xfs_bmapi(tp, dp, bno, count,
550 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, 548 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
551 args->firstblock, args->total, &map, &nmap, 549 args->firstblock, args->total, &map, &nmap,
552 args->flist, NULL))) 550 args->flist)))
553 return error; 551 return error;
554 ASSERT(nmap <= 1); 552 ASSERT(nmap <= 1);
555 if (nmap == 1) { 553 if (nmap == 1) {
@@ -581,8 +579,7 @@ xfs_dir2_grow_inode(
581 if ((error = xfs_bmapi(tp, dp, b, c, 579 if ((error = xfs_bmapi(tp, dp, b, c,
582 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, 580 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
583 args->firstblock, args->total, 581 args->firstblock, args->total,
584 &mapp[mapi], &nmap, args->flist, 582 &mapp[mapi], &nmap, args->flist))) {
585 NULL))) {
586 kmem_free(mapp); 583 kmem_free(mapp);
587 return error; 584 return error;
588 } 585 }
@@ -715,7 +712,7 @@ xfs_dir2_shrink_inode(
715 */ 712 */
716 if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, 713 if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
717 XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, 714 XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
718 NULL, &done))) { 715 &done))) {
719 /* 716 /*
720 * ENOSPC actually can happen if we're in a removename with 717 * ENOSPC actually can happen if we're in a removename with
721 * no space reservation, and the resulting block removal 718 * no space reservation, and the resulting block removal
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 779a267b0a84..580d99cef9e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -24,12 +24,10 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h" 30#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 31#include "xfs_dinode.h"
34#include "xfs_inode.h" 32#include "xfs_inode.h"
35#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
@@ -1073,10 +1071,10 @@ xfs_dir2_sf_to_block(
1073 */ 1071 */
1074 1072
1075 buf_len = dp->i_df.if_bytes; 1073 buf_len = dp->i_df.if_bytes;
1076 buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); 1074 buf = kmem_alloc(buf_len, KM_SLEEP);
1077 1075
1078 memcpy(buf, sfp, dp->i_df.if_bytes); 1076 memcpy(buf, sfp, buf_len);
1079 xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); 1077 xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK);
1080 dp->i_d.di_size = 0; 1078 dp->i_d.di_size = 0;
1081 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1079 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1082 /* 1080 /*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 498f8d694330..921595b84f5b 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -24,12 +24,10 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h" 30#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 31#include "xfs_dinode.h"
34#include "xfs_inode.h" 32#include "xfs_inode.h"
35#include "xfs_dir2_data.h" 33#include "xfs_dir2_data.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e2d89854ec9e..504be8640e91 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -25,11 +25,9 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dir2_sf.h" 31#include "xfs_dir2_sf.h"
34#include "xfs_dinode.h" 32#include "xfs_dinode.h"
35#include "xfs_inode.h" 33#include "xfs_inode.h"
@@ -875,7 +873,7 @@ xfs_dir2_leaf_getdents(
875 xfs_dir2_byte_to_da(mp, 873 xfs_dir2_byte_to_da(mp,
876 XFS_DIR2_LEAF_OFFSET) - map_off, 874 XFS_DIR2_LEAF_OFFSET) - map_off,
877 XFS_BMAPI_METADATA, NULL, 0, 875 XFS_BMAPI_METADATA, NULL, 0,
878 &map[map_valid], &nmap, NULL, NULL); 876 &map[map_valid], &nmap, NULL);
879 /* 877 /*
880 * Don't know if we should ignore this or 878 * Don't know if we should ignore this or
881 * try to return an error. 879 * try to return an error.
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 78fc4d9ae756..f9a0864b696a 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -24,12 +24,10 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h" 30#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 31#include "xfs_dinode.h"
34#include "xfs_inode.h" 32#include "xfs_inode.h"
35#include "xfs_bmap.h" 33#include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index c1a5945d463a..b1bae6b1eed9 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -24,12 +24,10 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h" 30#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 31#include "xfs_dinode.h"
34#include "xfs_inode.h" 32#include "xfs_inode.h"
35#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
deleted file mode 100644
index 2813cdd72375..000000000000
--- a/fs/xfs/xfs_dmapi.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DMAPI_H__
-#define __XFS_DMAPI_H__
-
-/* Values used to define the on-disk version of dm_attrname_t. All
- * on-disk attribute names start with the 8-byte string "SGI_DMI_".
- *
- * In the on-disk inode, DMAPI attribute names consist of the user-provided
- * name with the DMATTR_PREFIXSTRING pre-pended.  This string must NEVER be
- * changed.
- */
-
-#define DMATTR_PREFIXLEN	8
-#define DMATTR_PREFIXSTRING	"SGI_DMI_"
-
-typedef enum {
-	DM_EVENT_INVALID	= -1,
-	DM_EVENT_CANCEL		= 0,		/* not supported */
-	DM_EVENT_MOUNT		= 1,
-	DM_EVENT_PREUNMOUNT	= 2,
-	DM_EVENT_UNMOUNT	= 3,
-	DM_EVENT_DEBUT		= 4,		/* not supported */
-	DM_EVENT_CREATE		= 5,
-	DM_EVENT_CLOSE		= 6,		/* not supported */
-	DM_EVENT_POSTCREATE	= 7,
-	DM_EVENT_REMOVE		= 8,
-	DM_EVENT_POSTREMOVE	= 9,
-	DM_EVENT_RENAME		= 10,
-	DM_EVENT_POSTRENAME	= 11,
-	DM_EVENT_LINK		= 12,
-	DM_EVENT_POSTLINK	= 13,
-	DM_EVENT_SYMLINK	= 14,
-	DM_EVENT_POSTSYMLINK	= 15,
-	DM_EVENT_READ		= 16,
-	DM_EVENT_WRITE		= 17,
-	DM_EVENT_TRUNCATE	= 18,
-	DM_EVENT_ATTRIBUTE	= 19,
-	DM_EVENT_DESTROY	= 20,
-	DM_EVENT_NOSPACE	= 21,
-	DM_EVENT_USER		= 22,
-	DM_EVENT_MAX		= 23
-} dm_eventtype_t;
-#define HAVE_DM_EVENTTYPE_T
-
-typedef enum {
-	DM_RIGHT_NULL,
-	DM_RIGHT_SHARED,
-	DM_RIGHT_EXCL
-} dm_right_t;
-#define HAVE_DM_RIGHT_T
-
-/* Defines for determining if an event message should be sent. */
-#ifdef HAVE_DMAPI
-#define	DM_EVENT_ENABLED(ip, event) ( \
-	unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \
-		( ((ip)->i_d.di_dmevmask & (1 << event)) || \
-		  ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
-	)
-#else
-#define DM_EVENT_ENABLED(ip, event)	(0)
-#endif
-
-#define DM_XFS_VALID_FS_EVENTS		( \
-	(1 << DM_EVENT_PREUNMOUNT) |	\
-	(1 << DM_EVENT_UNMOUNT) |	\
-	(1 << DM_EVENT_NOSPACE) |	\
-	(1 << DM_EVENT_DEBUT) |		\
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a regular file or a symlink.  These events are persistent.
-*/
-
-#define	DM_XFS_VALID_FILE_EVENTS	( \
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a directory.  These events are persistent.
-*/
-
-#define	DM_XFS_VALID_DIRECTORY_EVENTS	( \
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events supported by the XFS filesystem. */
-#define	DM_XFS_SUPPORTED_EVENTS		( \
-	(1 << DM_EVENT_MOUNT) |		\
-	(1 << DM_EVENT_PREUNMOUNT) |	\
-	(1 << DM_EVENT_UNMOUNT) |	\
-	(1 << DM_EVENT_NOSPACE) |	\
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_READ) |		\
-	(1 << DM_EVENT_WRITE) |		\
-	(1 << DM_EVENT_TRUNCATE) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-
-/*
- * Definitions used for the flags field on dm_send_*_event().
- */
-
-#define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
-#define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
-#define DM_FLAGS_IMUX		0x004	/* thread holds i_mutex */
-#define DM_FLAGS_IALLOCSEM_RD	0x010	/* thread holds i_alloc_sem rd */
-#define DM_FLAGS_IALLOCSEM_WR	0x020	/* thread holds i_alloc_sem wr */
-
-/*
- * Pull in platform specific event flags defines
- */
-#include "xfs_dmapi_priv.h"
-
-/*
- * Macros to turn caller specified delay/block flags into
- * dm_send_xxxx_event flag DM_FLAGS_NDELAY.
- */
-
-#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
-		DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
-
-#endif  /* __XFS_DMAPI_H__ */
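
The deleted DM_EVENT_ENABLED() macro above reduces to a pair of bitmask tests: an event fires if its bit is set in either the per-inode or the per-mount event mask. A minimal userspace sketch of that test follows; the struct names are illustrative stand-ins, not the kernel's types:

    /*
     * Sketch of the event-mask test the removed DM_EVENT_ENABLED() macro
     * performed: an event is enabled if its bit is set in either the
     * per-inode or the per-mount event mask.
     */
    #include <stdio.h>

    enum dm_event { DM_EVENT_CREATE = 5, DM_EVENT_DESTROY = 20 };

    struct demo_mount { unsigned int dmevmask; };
    struct demo_inode { unsigned int dmevmask; struct demo_mount *mount; };

    static int event_enabled(const struct demo_inode *ip, enum dm_event ev)
    {
        return (ip->dmevmask & (1u << ev)) ||
               (ip->mount->dmevmask & (1u << ev));
    }

    int main(void)
    {
        struct demo_mount m = { .dmevmask = 1u << DM_EVENT_DESTROY };
        struct demo_inode i = { .dmevmask = 0, .mount = &m };

        printf("CREATE enabled:  %d\n", event_enabled(&i, DM_EVENT_CREATE));
        printf("DESTROY enabled: %d\n", event_enabled(&i, DM_EVENT_DESTROY));
        return 0;
    }
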
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
deleted file mode 100644
index e71e2581c0c3..000000000000
--- a/fs/xfs/xfs_dmops.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dmapi.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-
-
-static struct xfs_dmops xfs_dmcore_stub = {
-	.xfs_send_data		= (xfs_send_data_t)fs_nosys,
-	.xfs_send_mmap		= (xfs_send_mmap_t)fs_noerr,
-	.xfs_send_destroy	= (xfs_send_destroy_t)fs_nosys,
-	.xfs_send_namesp	= (xfs_send_namesp_t)fs_nosys,
-	.xfs_send_mount		= (xfs_send_mount_t)fs_nosys,
-	.xfs_send_unmount	= (xfs_send_unmount_t)fs_noerr,
-};
-
-int
-xfs_dmops_get(struct xfs_mount *mp)
-{
-	if (mp->m_flags & XFS_MOUNT_DMAPI) {
-		cmn_err(CE_WARN,
-			"XFS: dmapi support not available in this kernel.");
-		return EINVAL;
-	}
-
-	mp->m_dm_ops = &xfs_dmcore_stub;
-	return 0;
-}
-
-void
-xfs_dmops_put(struct xfs_mount *mp)
-{
-}
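
The deleted xfs_dmops.c shows the stub ops-vector pattern: every operation points at a generic "not implemented" or "silent success" handler, so callers can invoke the table unconditionally. A self-contained userspace sketch of the same pattern, with illustrative names:

    /*
     * Sketch of the stub ops-vector pattern: hard operations fail with
     * ENOSYS, operations that must never fail are wired to a no-op.
     */
    #include <errno.h>
    #include <stdio.h>

    typedef int (*dm_send_op_t)(void *obj);

    static int op_nosys(void *obj) { (void)obj; return ENOSYS; }
    static int op_noerr(void *obj) { (void)obj; return 0; }

    struct demo_dmops {
        dm_send_op_t send_data;
        dm_send_op_t send_mount;
        dm_send_op_t send_unmount;
    };

    static const struct demo_dmops demo_stub = {
        .send_data    = op_nosys,   /* data events need real DMAPI support */
        .send_mount   = op_nosys,
        .send_unmount = op_noerr,   /* unmount must not fail */
    };

    int main(void)
    {
        printf("send_data -> %d\n", demo_stub.send_data(NULL));
        printf("send_unmount -> %d\n", demo_stub.send_unmount(NULL));
        return 0;
    }
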
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..ed9990267661 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -23,12 +23,8 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_utils.h"
@@ -170,7 +166,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
 	va_list ap;
 
 #ifdef DEBUG
-	xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
+	xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
 #endif
 
 	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
@@ -186,18 +182,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
 
 void
 xfs_error_report(
-	char		*tag,
+	const char	*tag,
 	int		level,
-	xfs_mount_t	*mp,
-	char		*fname,
+	struct xfs_mount *mp,
+	const char	*filename,
 	int		linenum,
 	inst_t		*ra)
 {
 	if (level <= xfs_error_level) {
 		xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
 			CE_ALERT, mp,
 	"XFS internal error %s at line %d of file %s.  Caller 0x%p\n",
-			tag, linenum, fname, ra);
+			tag, linenum, filename, ra);
 
 		xfs_stack_trace();
 	}
@@ -205,15 +201,15 @@ xfs_error_report(
 
 void
 xfs_corruption_error(
-	char		*tag,
+	const char	*tag,
 	int		level,
-	xfs_mount_t	*mp,
+	struct xfs_mount *mp,
 	void		*p,
-	char		*fname,
+	const char	*filename,
 	int		linenum,
 	inst_t		*ra)
 {
 	if (level <= xfs_error_level)
 		xfs_hex_dump(p, 16);
-	xfs_error_report(tag, level, mp, fname, linenum, ra);
+	xfs_error_report(tag, level, mp, filename, linenum, ra);
 }
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
 
 struct xfs_mount;
 
-extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp,
-			char *fname, int linenum, inst_t *ra);
-extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
-			void *p, char *fname, int linenum, inst_t *ra);
+extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+			const char *filename, int linenum, inst_t *ra);
+extern void xfs_corruption_error(const char *tag, int level,
+			struct xfs_mount *mp, void *p, const char *filename,
+			int linenum, inst_t *ra);
 
 #define XFS_ERROR_REPORT(e, lvl, mp)	\
 	xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
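
The constification above works because XFS_ERROR_REPORT() always passes string literals: the macro expands __FILE__ and __LINE__ at the call site, so the report function receives the caller's location for free. A compilable sketch of the same trick, with demo_error_report() as a stand-in for the kernel's xfs_error_report():

    /*
     * Sketch of call-site capture via a macro: __FILE__ and __LINE__
     * expand where the macro is used, not where the function is defined.
     */
    #include <stdio.h>

    static void demo_error_report(const char *tag, const char *filename,
                                  int linenum)
    {
        fprintf(stderr, "internal error %s at line %d of file %s\n",
                tag, linenum, filename);
    }

    #define DEMO_ERROR_REPORT(tag) \
        demo_error_report((tag), __FILE__, __LINE__)

    int main(void)
    {
        DEMO_ERROR_REPORT("example");   /* reports this exact line */
        return 0;
    }
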
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1b39b9..a55e687bf562 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -24,7 +24,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
@@ -33,18 +32,19 @@
 kmem_zone_t	*xfs_efi_zone;
 kmem_zone_t	*xfs_efd_zone;
 
-STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *);
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
 
 void
-xfs_efi_item_free(xfs_efi_log_item_t *efip)
+xfs_efi_item_free(
+	struct xfs_efi_log_item	*efip)
 {
-	int nexts = efip->efi_format.efi_nextents;
-
-	if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
 		kmem_free(efip);
-	} else {
+	else
 		kmem_zone_free(xfs_efi_zone, efip);
-	}
 }
 
 /*
@@ -52,9 +52,9 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
  * We only need 1 iovec for an efi item.  It just logs the efi_log_format
  * structure.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_size(xfs_efi_log_item_t *efip)
+xfs_efi_item_size(
+	struct xfs_log_item	*lip)
 {
 	return 1;
 }
@@ -67,10 +67,12 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip)
  * slots in the efi item have been filled.
 */
 STATIC void
-xfs_efi_item_format(xfs_efi_log_item_t *efip,
-		    xfs_log_iovec_t *log_vector)
+xfs_efi_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	uint	size;
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	uint			size;
 
 	ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
 
@@ -80,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
 	size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
 	efip->efi_format.efi_size = 1;
 
-	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
+	log_vector->i_addr = &efip->efi_format;
 	log_vector->i_len = size;
 	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
 	ASSERT(size >= sizeof(xfs_efi_log_format_t));
@@ -90,60 +92,33 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
 /*
  * Pinning has no meaning for an efi item, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+xfs_efi_item_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * While EFIs cannot really be pinned, the unpin operation is the
  * last place at which the EFI is manipulated during a transaction.
  * Here we coordinate with xfs_efi_cancel() to determine who gets to
  * free the EFI.
 */
-/*ARGSUSED*/
-STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
-{
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-
-	spin_lock(&ailp->xa_lock);
-	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-		xfs_efi_item_free(efip);
-	} else {
-		efip->efi_flags |= XFS_EFI_COMMITTED;
-		spin_unlock(&ailp->xa_lock);
-	}
-}
-
-/*
- * like unpin only we have to also clear the xaction descriptor
- * pointing the log item if we free the item.  This routine duplicates
- * unpin because efi_flags is protected by the AIL lock.  Freeing
- * the descriptor and then calling unpin would force us to drop the AIL
- * lock which would open up a race condition.
- */
 STATIC void
-xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
+xfs_efi_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	struct xfs_ail		*ailp = lip->li_ailp;
 
 	spin_lock(&ailp->xa_lock);
 	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		/*
-		 * free the xaction descriptor pointing to this item
-		 */
-		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
-		xfs_trans_free_item(tp, lidp);
+		if (remove)
+			xfs_trans_del_item(lip);
 
 		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
+		xfs_trans_ail_delete(ailp, lip);
 		xfs_efi_item_free(efip);
 	} else {
 		efip->efi_flags |= XFS_EFI_COMMITTED;
@@ -158,9 +133,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
  * XFS_ITEM_PINNED so that the caller will eventually flush the log.
  * This should help in getting the EFI out of the AIL.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+xfs_efi_item_trylock(
+	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_PINNED;
 }
@@ -168,13 +143,12 @@ xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
 /*
  * Efi items have no locking, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+xfs_efi_item_unlock(
+	struct xfs_log_item	*lip)
 {
-	if (efip->efi_item.li_flags & XFS_LI_ABORTED)
-		xfs_efi_item_free(efip);
-	return;
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efi_item_free(EFI_ITEM(lip));
 }
 
 /*
@@ -183,9 +157,10 @@ xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
  * flag is not paid any attention here.  Checking for that is delayed
  * until the EFI is unpinned.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
 	return lsn;
 }
@@ -195,11 +170,10 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
  * stuck waiting for all of its corresponding efd items to be
 * committed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_push(xfs_efi_log_item_t *efip)
+xfs_efi_item_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
@@ -209,64 +183,55 @@ xfs_efi_item_push(xfs_efi_log_item_t *efip)
  * example, for inodes, the inode is locked throughout the extent freeing
  * so the dependency should be recorded there.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return;
 }
 
 /*
  * This is the ops vector shared by all efi log items.
 */
 static struct xfs_item_ops xfs_efi_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_efi_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
-					xfs_efi_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efi_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efi_item_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efi_item_committing
+	.iop_size	= xfs_efi_item_size,
+	.iop_format	= xfs_efi_item_format,
+	.iop_pin	= xfs_efi_item_pin,
+	.iop_unpin	= xfs_efi_item_unpin,
+	.iop_trylock	= xfs_efi_item_trylock,
+	.iop_unlock	= xfs_efi_item_unlock,
+	.iop_committed	= xfs_efi_item_committed,
+	.iop_push	= xfs_efi_item_push,
+	.iop_committing = xfs_efi_item_committing
 };
 
 
 /*
  * Allocate and initialize an efi item with the given number of extents.
 */
-xfs_efi_log_item_t *
-xfs_efi_init(xfs_mount_t	*mp,
-	     uint		nextents)
+struct xfs_efi_log_item *
+xfs_efi_init(
+	struct xfs_mount	*mp,
+	uint			nextents)
 
 {
-	xfs_efi_log_item_t	*efip;
+	struct xfs_efi_log_item	*efip;
 	uint			size;
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efi_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+		efip = kmem_zalloc(size, KM_SLEEP);
 	} else {
-		efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
-							KM_SLEEP);
+		efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
 	}
 
-	efip->efi_item.li_type = XFS_LI_EFI;
-	efip->efi_item.li_ops = &xfs_efi_item_ops;
-	efip->efi_item.li_mountp = mp;
-	efip->efi_item.li_ailp = mp->m_ail;
+	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
 	efip->efi_format.efi_nextents = nextents;
 	efip->efi_format.efi_id = (__psint_t)(void*)efip;
 
-	return (efip);
+	return efip;
 }
 
 /*
@@ -279,7 +244,7 @@ xfs_efi_init(xfs_mount_t *mp,
 int
 xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 {
-	xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr;
+	xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
 	uint i;
 	uint len = sizeof(xfs_efi_log_format_t) +
 		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
@@ -292,8 +257,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
 		return 0;
 	} else if (buf->i_len == len32) {
-		xfs_efi_log_format_32_t *src_efi_fmt_32 =
-			(xfs_efi_log_format_32_t *)buf->i_addr;
+		xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
 
 		dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
 		dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -307,8 +271,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		}
 		return 0;
 	} else if (buf->i_len == len64) {
-		xfs_efi_log_format_64_t *src_efi_fmt_64 =
-			(xfs_efi_log_format_64_t *)buf->i_addr;
+		xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
 
 		dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
 		dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
@@ -359,16 +322,18 @@ xfs_efi_release(xfs_efi_log_item_t	*efip,
 	}
 }
 
-STATIC void
-xfs_efd_item_free(xfs_efd_log_item_t *efdp)
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 {
-	int nexts = efdp->efd_format.efd_nextents;
+	return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
 
-	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
 		kmem_free(efdp);
-	} else {
+	else
 		kmem_zone_free(xfs_efd_zone, efdp);
-	}
 }
 
 /*
@@ -376,9 +341,9 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
  * We only need 1 iovec for an efd item.  It just logs the efd_log_format
  * structure.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+xfs_efd_item_size(
+	struct xfs_log_item	*lip)
 {
 	return 1;
 }
@@ -391,10 +356,12 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp)
  * slots in the efd item have been filled.
 */
 STATIC void
-xfs_efd_item_format(xfs_efd_log_item_t *efdp,
-		    xfs_log_iovec_t *log_vector)
+xfs_efd_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	uint	size;
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	uint			size;
 
 	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
 
@@ -404,48 +371,38 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
 	size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
 	efdp->efd_format.efd_size = 1;
 
-	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
+	log_vector->i_addr = &efdp->efd_format;
 	log_vector->i_len = size;
 	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
 	ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 
-
 /*
  * Pinning has no meaning for an efd item, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+xfs_efd_item_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * Since pinning has no meaning for an efd item, unpinning does
  * not either.
 */
-/*ARGSUSED*/
-STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale)
-{
-	return;
-}
-
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+xfs_efd_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	return;
 }
 
 /*
  * Efd items have no locking, so just return success.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_trylock(
	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_LOCKED;
 }
@@ -454,13 +411,12 @@ xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
  * Efd items have no locking or pushing, so return failure
  * so that the caller doesn't bother with us.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_unlock(
+	struct xfs_log_item	*lip)
 {
-	if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
-		xfs_efd_item_free(efdp);
-	return;
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efd_item_free(EFD_ITEM(lip));
 }
 
 /*
@@ -470,15 +426,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
  * return -1 to keep the transaction code from further referencing
  * this item.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+xfs_efd_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+
 	/*
 	 * If we got a log I/O error, it's always the case that the LR with the
 	 * EFI got unpinned and freed before the EFD got aborted.
 	 */
-	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+	if (!(lip->li_flags & XFS_LI_ABORTED))
 		xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
 
 	xfs_efd_item_free(efdp);
@@ -489,11 +448,10 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
  * There isn't much you can do to push on an efd item.  It is simply
  * stuck waiting for the log to be flushed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_push(xfs_efd_log_item_t *efdp)
+xfs_efd_item_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
@@ -503,64 +461,54 @@ xfs_efd_item_push(xfs_efd_log_item_t *efdp)
  * example, for inodes, the inode is locked throughout the extent freeing
  * so the dependency should be recorded there.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efd_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return;
 }
 
 /*
  * This is the ops vector shared by all efd log items.
 */
 static struct xfs_item_ops xfs_efd_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_efd_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
-					xfs_efd_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efd_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efd_item_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efd_item_committing
+	.iop_size	= xfs_efd_item_size,
+	.iop_format	= xfs_efd_item_format,
+	.iop_pin	= xfs_efd_item_pin,
+	.iop_unpin	= xfs_efd_item_unpin,
+	.iop_trylock	= xfs_efd_item_trylock,
+	.iop_unlock	= xfs_efd_item_unlock,
+	.iop_committed	= xfs_efd_item_committed,
+	.iop_push	= xfs_efd_item_push,
+	.iop_committing = xfs_efd_item_committing
 };
 
 
 /*
  * Allocate and initialize an efd item with the given number of extents.
 */
-xfs_efd_log_item_t *
-xfs_efd_init(xfs_mount_t	*mp,
-	     xfs_efi_log_item_t	*efip,
-	     uint		nextents)
+struct xfs_efd_log_item *
+xfs_efd_init(
+	struct xfs_mount	*mp,
+	struct xfs_efi_log_item	*efip,
+	uint			nextents)
 
 {
-	xfs_efd_log_item_t	*efdp;
+	struct xfs_efd_log_item	*efdp;
 	uint			size;
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efd_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+		efdp = kmem_zalloc(size, KM_SLEEP);
 	} else {
-		efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
-				KM_SLEEP);
+		efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
 	}
 
-	efdp->efd_item.li_type = XFS_LI_EFD;
-	efdp->efd_item.li_ops = &xfs_efd_item_ops;
-	efdp->efd_item.li_mountp = mp;
-	efdp->efd_item.li_ailp = mp->m_ail;
+	xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
 	efdp->efd_efip = efip;
 	efdp->efd_format.efd_nextents = nextents;
 	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
 
-	return (efdp);
+	return efdp;
 }
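
The EFI/EFD rework above replaces function-pointer casts in the ops vector with callbacks that take the generic struct xfs_log_item and recover the containing item via container_of(). A self-contained userspace sketch of that pattern, with illustrative type names:

    /*
     * Sketch of the container_of() pattern: embed the generic item in the
     * specific one, and recover the outer struct from the embedded pointer
     * instead of casting the callback's function type.
     */
    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct log_item { int li_type; };

    struct efi_log_item {
        struct log_item efi_item;      /* embedded generic item */
        unsigned int    efi_nextents;
    };

    static struct efi_log_item *EFI_ITEM(struct log_item *lip)
    {
        return container_of(lip, struct efi_log_item, efi_item);
    }

    /* generic signature, as an ops vector would require */
    static unsigned int item_size(struct log_item *lip)
    {
        return EFI_ITEM(lip)->efi_nextents;
    }

    int main(void)
    {
        struct efi_log_item efi = { .efi_item = { .li_type = 1 },
                                    .efi_nextents = 4 };
        printf("nextents via generic pointer: %u\n",
               item_size(&efi.efi_item));
        return 0;
    }
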
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 390850ee6603..9b715dce5699 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -18,13 +18,9 @@
 #include "xfs.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
@@ -127,6 +123,82 @@ typedef struct fstrm_item
 	xfs_inode_t	*pip;	/* Parent directory inode pointer. */
 } fstrm_item_t;
 
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all.  The race condition this addresses is the following:
+ *
+ *  - The work queue scheduler fires and pulls a filestream directory cache
+ *    element off the LRU end of the cache for deletion, then gets pre-empted.
+ *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ *    remaining items from the cache and reallocates the mount point's per-ag
+ *    array, resetting all the counters to zero.
+ *  - The work queue thread resumes and calls the free function for the element
+ *    it started cleaning up earlier.  In the process it decrements the
+ *    filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following macros should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+static int
+xfs_filestream_peek_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_read(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_inc_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, agno);
+	atomic_dec(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+}
 
 /*
  * Scan the AGs starting at startag looking for an AG that isn't in use and has
@@ -355,16 +427,14 @@ xfs_fstrm_free_func(
 {
 	fstrm_item_t	*item = (fstrm_item_t *)data;
 	xfs_inode_t	*ip = item->ip;
-	int	ref;
 
 	ASSERT(ip->i_ino == ino);
 
 	xfs_iflags_clear(ip, XFS_IFILESTREAM);
 
 	/* Drop the reference taken on the AG when the item was added. */
-	ref = xfs_filestream_put_ag(ip->i_mount, item->ag);
+	xfs_filestream_put_ag(ip->i_mount, item->ag);
 
-	ASSERT(ref >= 0);
 	TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
 		xfs_filestream_peek_ag(ip->i_mount, item->ag));
 
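
The counters moved into xfs_filestream.c above follow a small peek/get/put discipline over per-AG atomics. A compilable userspace sketch of the same shape, using C11 atomics in place of the kernel's atomic_t (array size and names are illustrative):

    /*
     * Sketch of per-AG filestream reference counting: one atomic counter
     * per allocation group, with peek/get/put mirroring
     * xfs_filestream_{peek,get,put}_ag().
     */
    #include <stdatomic.h>
    #include <stdio.h>

    #define NUM_AGS 4

    static atomic_int ag_fstrms[NUM_AGS];   /* one counter per AG */

    static int peek_ag(int agno) { return atomic_load(&ag_fstrms[agno]); }
    static int get_ag(int agno)
    {
        /* fetch_add returns the old value; +1 gives the new count */
        return atomic_fetch_add(&ag_fstrms[agno], 1) + 1;
    }
    static void put_ag(int agno) { atomic_fetch_sub(&ag_fstrms[agno], 1); }

    int main(void)
    {
        get_ag(2);      /* associate two streams with AG 2 */
        get_ag(2);
        put_ag(2);      /* drop one association */
        printf("AG 2 active filestreams: %d\n", peek_ag(2));
        return 0;
    }
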
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 260f757bbc5d..09dd9af45434 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -42,88 +42,6 @@ extern ktrace_t *xfs_filestreams_trace_buf;
 
 #endif
 
-/*
- * Allocation group filestream associations are tracked with per-ag atomic
- * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all.  The race condition this addresses is the following:
- *
- *  - The work queue scheduler fires and pulls a filestream directory cache
- *    element off the LRU end of the cache for deletion, then gets pre-empted.
- *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
- *    remaining items from the cache and reallocates the mount point's per-ag
- *    array, resetting all the counters to zero.
- *  - The work queue thread resumes and calls the free function for the element
- *    it started cleaning up earlier.  In the process it decrements the
- *    filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
- */
-/*
- * xfs_filestream_peek_ag is only used in tracing code
- */
-static inline int
-xfs_filestream_peek_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_read(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
-static inline int
-xfs_filestream_get_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_inc_return(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
-static inline int
-xfs_filestream_put_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_dec_return(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
 /* allocation selection flags */
 typedef enum xfs_fstrm_alloc {
 	XFS_PICK_USERDATA = 1,
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5f..87c2e9d02288 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
 #define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event */
 #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
 #define BMV_IF_VALID	\
-	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
+	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
 
 /* bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
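
A BMV_IF_VALID-style mask rejects any input word that carries a bit outside the known set; extending the interface is just adding the new bit to the mask, as the hunk above does for BMV_IF_NO_HOLES. A compilable sketch of the validation check itself:

    /*
     * Sketch of flags validation against a valid-bits mask: any bit
     * outside the mask fails. Values mirror the header above.
     */
    #include <stdio.h>

    #define BMV_IF_ATTRFORK       0x1
    #define BMV_IF_NO_DMAPI_READ  0x2
    #define BMV_IF_PREALLOC       0x4
    #define BMV_IF_DELALLOC       0x8
    #define BMV_IF_NO_HOLES       0x10
    #define BMV_IF_VALID \
        (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
         BMV_IF_DELALLOC|BMV_IF_NO_HOLES)

    static int bmv_flags_valid(unsigned int flags)
    {
        return (flags & ~BMV_IF_VALID) == 0;
    }

    int main(void)
    {
        printf("0x18 valid: %d\n", bmv_flags_valid(0x18)); /* DELALLOC|NO_HOLES */
        printf("0x40 valid: %d\n", bmv_flags_valid(0x40)); /* unknown bit */
        return 0;
    }
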
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 37a6f62c57b6..43b1d5699335 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -608,32 +604,36 @@ out:
 	return 0;
 }
 
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
 int
 xfs_fs_log_dummy(
-	xfs_mount_t	*mp)
+	xfs_mount_t	*mp,
+	int		flags)
 {
 	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
 	int		error;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
 	}
 
-	ip = mp->m_rootip;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
+	/* log the UUID because it is an unchanging field */
+	xfs_mod_sb(tp, XFS_SB_UUID);
+	if (flags & SYNC_WAIT)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
 }
 
 int
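
The rework of xfs_fs_log_dummy() above keeps only the control flow that matters: dirty the log by logging a field whose value never changes, and make the commit synchronous only when the caller passed SYNC_WAIT. A minimal userspace sketch of that flow; every function below is an illustrative stand-in, not the kernel's API:

    /*
     * Sketch of the dummy-transaction flow: what is logged is irrelevant,
     * only that the transaction is dirty; sync is opt-in per caller.
     */
    #include <stdio.h>

    #define SYNC_WAIT 0x1

    struct demo_trans { int sync; int dirty; };

    static void demo_log_unchanging_field(struct demo_trans *tp)
    {
        tp->dirty = 1;  /* dirtiness is the point, not the value */
    }

    static int demo_commit(struct demo_trans *tp)
    {
        printf("commit: dirty=%d sync=%d\n", tp->dirty, tp->sync);
        return 0;
    }

    static int demo_log_dummy(int flags)
    {
        struct demo_trans tp = { 0, 0 };

        demo_log_unchanging_field(&tp);
        if (flags & SYNC_WAIT)
            tp.sync = 1;    /* wait for the log write to complete */
        return demo_commit(&tp);
    }

    int main(void)
    {
        demo_log_dummy(0);
        demo_log_dummy(SYNC_WAIT);
        return 0;
    }
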
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c9..a786c5212c1e 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
 
 #endif	/* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 9d884c127bb9..5371d2dc360e 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -1203,6 +1199,67 @@ error0:
 	return error;
 }
 
+STATIC int
+xfs_imap_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		*chunk_agbno,
+	xfs_agblock_t		*offset_agbno,
+	int			flags)
+{
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	int			error;
+	int			i;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+				"xfs_ialloc_read_agi() returned "
+				"error %d, agno %d",
+				error, agno);
+		return error;
+	}
+
+	/*
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+	if (!error) {
+		if (i)
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (!error && i == 0)
+			error = EINVAL;
+	}
+
+	xfs_trans_brelse(tp, agbp);
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	if (error)
+		return error;
+
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+		return EINVAL;
+
+	/* for untrusted inodes check it is allocated first */
+	if ((flags & XFS_IGET_UNTRUSTED) &&
+	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+		return EINVAL;
+
+	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+	*offset_agbno = agbno - *chunk_agbno;
+	return 0;
+}
+
 /*
  * Return the location of the inode in imap, for mapping it into a buffer.
 */
@@ -1235,8 +1292,11 @@ xfs_imap(
 	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
 	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 #ifdef DEBUG
-		/* no diagnostics for bulkstat, ino comes from userspace */
-		if (flags & XFS_IGET_BULKSTAT)
+		/*
+		 * Don't output diagnostic information for untrusted inodes
+		 * as they can be invalid without implying corruption.
+		 */
+		if (flags & XFS_IGET_UNTRUSTED)
 			return XFS_ERROR(EINVAL);
 		if (agno >= mp->m_sb.sb_agcount) {
 			xfs_fs_cmn_err(CE_ALERT, mp,
@@ -1263,6 +1323,23 @@ xfs_imap(
 		return XFS_ERROR(EINVAL);
 	}
 
+	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+
+	/*
+	 * For bulkstat and handle lookups, we have an untrusted inode number
+	 * that we have to verify is valid. We cannot do this just by reading
+	 * the inode buffer as it may have been unlinked and removed leaving
+	 * inodes in stale state on disk. Hence we have to do a btree lookup
+	 * in all cases where an untrusted inode number is passed.
+	 */
+	if (flags & XFS_IGET_UNTRUSTED) {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+		goto out_map;
+	}
+
 	/*
 	 * If the inode cluster size is the same as the blocksize or
 	 * smaller we get to the buffer by simple arithmetics.
@@ -1277,24 +1354,6 @@ xfs_imap(
 		return 0;
 	}
 
-	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
-
-	/*
-	 * If we get a block number passed from bulkstat we can use it to
-	 * find the buffer easily.
-	 */
-	if (imap->im_blkno) {
-		offset = XFS_INO_TO_OFFSET(mp, ino);
-		ASSERT(offset < mp->m_sb.sb_inopblock);
-
-		cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
-		offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
-
-		imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
-		imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
-		return 0;
-	}
-
 	/*
	 * If the inode chunks are aligned then use simple maths to
 	 * find the location. Otherwise we have to do a btree
@@ -1304,50 +1363,13 @@ xfs_imap(
 		offset_agbno = agbno & mp->m_inoalign_mask;
 		chunk_agbno = agbno - offset_agbno;
 	} else {
-		xfs_btree_cur_t	*cur;	/* inode btree cursor */
-		xfs_inobt_rec_incore_t chunk_rec;
-		xfs_buf_t	*agbp;	/* agi buffer */
-		int		i;	/* temp state */
-
-		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_ialloc_read_agi() returned "
-					"error %d, agno %d",
-					error, agno);
-			return error;
-		}
-
-		cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-		error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_lookup() failed");
-			goto error0;
-		}
-
-		error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_get_rec() failed");
-			goto error0;
-		}
-		if (i == 0) {
-#ifdef DEBUG
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
-			error = XFS_ERROR(EINVAL);
-		}
- error0:
-		xfs_trans_brelse(tp, agbp);
-		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
 		if (error)
 			return error;
-		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
-		offset_agbno = agbno - chunk_agbno;
 	}
 
+out_map:
 	ASSERT(agbno >= chunk_agbno);
 	cluster_agbno = chunk_agbno +
 		((offset_agbno / blks_per_cluster) * blks_per_cluster);
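
The validation that xfs_imap_lookup() applies to an untrusted inode number comes down to two tests on the record found by the <= btree lookup: the chunk must actually contain the inode, and for untrusted callers the inode must not be marked free in the chunk's free mask. A compilable sketch of just those checks, with illustrative constants in place of XFS_IALLOC_INODES() and XFS_INOBT_MASK():

    /*
     * Sketch of the untrusted-inode range and free-mask checks from
     * xfs_imap_lookup(); -1 stands in for the kernel's EINVAL return.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define INODES_PER_CHUNK 64

    struct inobt_rec { uint32_t ir_startino; uint64_t ir_free; };

    static int imap_check(const struct inobt_rec *rec, uint32_t agino,
                          int untrusted)
    {
        if (rec->ir_startino > agino ||
            rec->ir_startino + INODES_PER_CHUNK <= agino)
            return -1;          /* inode not in this chunk */
        if (untrusted &&
            (rec->ir_free & (1ULL << (agino - rec->ir_startino))))
            return -1;          /* inode is unallocated */
        return 0;
    }

    int main(void)
    {
        struct inobt_rec rec = { .ir_startino = 128, .ir_free = 1ULL << 3 };

        printf("ino 130: %d\n", imap_check(&rec, 130, 1)); /* ok */
        printf("ino 131: %d\n", imap_check(&rec, 131, 1)); /* free */
        printf("ino 300: %d\n", imap_check(&rec, 300, 1)); /* out of range */
        return 0;
    }
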
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c282a9af5393..d352862cefa0 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818f..b1ecc6f97ade 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -25,14 +25,10 @@
25#include "xfs_trans.h" 25#include "xfs_trans.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h" 27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_btree.h" 34#include "xfs_btree.h"
@@ -95,7 +91,7 @@ xfs_inode_alloc(
95 return ip; 91 return ip;
96} 92}
97 93
98STATIC void 94void
99xfs_inode_free( 95xfs_inode_free(
100 struct xfs_inode *ip) 96 struct xfs_inode *ip)
101{ 97{
@@ -212,7 +208,7 @@ xfs_iget_cache_hit(
212 ip->i_flags &= ~XFS_INEW; 208 ip->i_flags &= ~XFS_INEW;
213 ip->i_flags |= XFS_IRECLAIMABLE; 209 ip->i_flags |= XFS_IRECLAIMABLE;
214 __xfs_inode_set_reclaim_tag(pag, ip); 210 __xfs_inode_set_reclaim_tag(pag, ip);
215 trace_xfs_iget_reclaim(ip); 211 trace_xfs_iget_reclaim_fail(ip);
216 goto out_error; 212 goto out_error;
217 } 213 }
218 214
@@ -227,6 +223,7 @@ xfs_iget_cache_hit(
227 } else { 223 } else {
228 /* If the VFS inode is being torn down, pause and try again. */ 224 /* If the VFS inode is being torn down, pause and try again. */
229 if (!igrab(inode)) { 225 if (!igrab(inode)) {
226 trace_xfs_iget_skip(ip);
230 error = EAGAIN; 227 error = EAGAIN;
231 goto out_error; 228 goto out_error;
232 } 229 }
@@ -234,6 +231,7 @@ xfs_iget_cache_hit(
234 /* We've got a live one. */ 231 /* We've got a live one. */
235 spin_unlock(&ip->i_flags_lock); 232 spin_unlock(&ip->i_flags_lock);
236 read_unlock(&pag->pag_ici_lock); 233 read_unlock(&pag->pag_ici_lock);
234 trace_xfs_iget_hit(ip);
237 } 235 }
238 236
239 if (lock_flags != 0) 237 if (lock_flags != 0)
@@ -242,7 +240,6 @@ xfs_iget_cache_hit(
242 xfs_iflags_clear(ip, XFS_ISTALE); 240 xfs_iflags_clear(ip, XFS_ISTALE);
243 XFS_STATS_INC(xs_ig_found); 241 XFS_STATS_INC(xs_ig_found);
244 242
245 trace_xfs_iget_found(ip);
246 return 0; 243 return 0;
247 244
248out_error: 245out_error:
@@ -259,24 +256,22 @@ xfs_iget_cache_miss(
259 xfs_trans_t *tp, 256 xfs_trans_t *tp,
260 xfs_ino_t ino, 257 xfs_ino_t ino,
261 struct xfs_inode **ipp, 258 struct xfs_inode **ipp,
262 xfs_daddr_t bno,
263 int flags, 259 int flags,
264 int lock_flags) 260 int lock_flags)
265{ 261{
266 struct xfs_inode *ip; 262 struct xfs_inode *ip;
267 int error; 263 int error;
268 unsigned long first_index, mask;
269 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 264 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
270 265
271 ip = xfs_inode_alloc(mp, ino); 266 ip = xfs_inode_alloc(mp, ino);
272 if (!ip) 267 if (!ip)
273 return ENOMEM; 268 return ENOMEM;
274 269
275 error = xfs_iread(mp, tp, ip, bno, flags); 270 error = xfs_iread(mp, tp, ip, flags);
276 if (error) 271 if (error)
277 goto out_destroy; 272 goto out_destroy;
278 273
279 xfs_itrace_entry(ip); 274 trace_xfs_iget_miss(ip);
280 275
281 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 276 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
282 error = ENOENT; 277 error = ENOENT;
@@ -302,8 +297,6 @@ xfs_iget_cache_miss(
302 BUG(); 297 BUG();
303 } 298 }
304 299
305 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
306 first_index = agino & mask;
307 write_lock(&pag->pag_ici_lock); 300 write_lock(&pag->pag_ici_lock);
308 301
309 /* insert the new inode */ 302 /* insert the new inode */
@@ -322,7 +315,6 @@ xfs_iget_cache_miss(
322 write_unlock(&pag->pag_ici_lock); 315 write_unlock(&pag->pag_ici_lock);
323 radix_tree_preload_end(); 316 radix_tree_preload_end();
324 317
325 trace_xfs_iget_alloc(ip);
326 *ipp = ip; 318 *ipp = ip;
327 return 0; 319 return 0;
328 320
@@ -358,8 +350,6 @@ out_destroy:
358 * within the file system for the inode being requested. 350 * within the file system for the inode being requested.
359 * lock_flags -- flags indicating how to lock the inode. See the comment 351 * lock_flags -- flags indicating how to lock the inode. See the comment
360 * for xfs_ilock() for a list of valid values. 352 * for xfs_ilock() for a list of valid values.
361 * bno -- the block number starting the buffer containing the inode,
362 * if known (as by bulkstat), else 0.
363 */ 353 */
364int 354int
365xfs_iget( 355xfs_iget(
@@ -368,8 +358,7 @@ xfs_iget(
368 xfs_ino_t ino, 358 xfs_ino_t ino,
369 uint flags, 359 uint flags,
370 uint lock_flags, 360 uint lock_flags,
371 xfs_inode_t **ipp, 361 xfs_inode_t **ipp)
372 xfs_daddr_t bno)
373{ 362{
374 xfs_inode_t *ip; 363 xfs_inode_t *ip;
375 int error; 364 int error;
@@ -382,9 +371,6 @@ xfs_iget(
382 371
383 /* get the perag structure and ensure that it's inode capable */ 372 /* get the perag structure and ensure that it's inode capable */
384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 373 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
385 if (!pag->pagi_inodeok)
386 return EINVAL;
387 ASSERT(pag->pag_ici_init);
388 agino = XFS_INO_TO_AGINO(mp, ino); 374 agino = XFS_INO_TO_AGINO(mp, ino);
389 375
390again: 376again:
@@ -400,7 +386,7 @@ again:
400 read_unlock(&pag->pag_ici_lock); 386 read_unlock(&pag->pag_ici_lock);
401 XFS_STATS_INC(xs_ig_missed); 387 XFS_STATS_INC(xs_ig_missed);
402 388
403 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno, 389 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
404 flags, lock_flags); 390 flags, lock_flags);
405 if (error) 391 if (error)
406 goto out_error_or_again; 392 goto out_error_or_again;
@@ -429,97 +415,6 @@ out_error_or_again:
429} 415}
430 416
431/* 417/*
432 * Decrement reference count of an inode structure and unlock it.
433 *
434 * ip -- the inode being released
435 * lock_flags -- this parameter indicates the inode's locks to be
436 * released. See the comment on xfs_iunlock() for a list
437 * of valid values.
438 */
439void
440xfs_iput(xfs_inode_t *ip,
441 uint lock_flags)
442{
443 xfs_itrace_entry(ip);
444 xfs_iunlock(ip, lock_flags);
445 IRELE(ip);
446}
447
448/*
449 * Special iput for brand-new inodes that are still locked
450 */
451void
452xfs_iput_new(
453 xfs_inode_t *ip,
454 uint lock_flags)
455{
456 struct inode *inode = VFS_I(ip);
457
458 xfs_itrace_entry(ip);
459
460 if ((ip->i_d.di_mode == 0)) {
461 ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
462 make_bad_inode(inode);
463 }
464 if (inode->i_state & I_NEW)
465 unlock_new_inode(inode);
466 if (lock_flags)
467 xfs_iunlock(ip, lock_flags);
468 IRELE(ip);
469}
470
471/*
472 * This is called to free all the memory associated with an inode.
473 * It must free the inode itself and any buffers allocated for
474 * if_extents/if_data and if_broot. It must also free the lock
475 * associated with the inode.
476 *
477 * Note: because we don't initialise everything on reallocation out
478 * of the zone, we must ensure we nullify everything correctly before
479 * freeing the structure.
480 */
481void
482xfs_ireclaim(
483 struct xfs_inode *ip)
484{
485 struct xfs_mount *mp = ip->i_mount;
486 struct xfs_perag *pag;
487 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
488
489 XFS_STATS_INC(xs_ig_reclaims);
490
491 /*
492 * Remove the inode from the per-AG radix tree.
493 *
494 * Because radix_tree_delete won't complain even if the item was never
495 * added to the tree, assert that it's been there before to catch
496 * problems with the inode lifetime early on.
497 */
498 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
499 write_lock(&pag->pag_ici_lock);
500 if (!radix_tree_delete(&pag->pag_ici_root, agino))
501 ASSERT(0);
502 write_unlock(&pag->pag_ici_lock);
503 xfs_perag_put(pag);
504
505 /*
506 * Here we do an (almost) spurious inode lock in order to coordinate
507 * with inode cache radix tree lookups. This is because the lookup
508 * can reference the inodes in the cache without taking references.
509 *
510 * We make that OK here by ensuring that we wait until the inode is
511 * unlocked after the lookup before we go ahead and free it. We get
512 * both the ilock and the iolock because the code may need to drop the
513 * ilock but will still hold the iolock.
514 */
515 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
516 xfs_qm_dqdetach(ip);
517 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
518
519 xfs_inode_free(ip);
520}
521
522/*
523 * This is a wrapper routine around the xfs_ilock() routine 418 * This is a wrapper routine around the xfs_ilock() routine
524 * used to centralize some grungy code. It is used in places 419 * used to centralize some grungy code. It is used in places
525 * that wish to lock the inode solely for reading the extents. 420 * that wish to lock the inode solely for reading the extents.
@@ -744,30 +639,24 @@ xfs_ilock_demote(
744} 639}
745 640
746#ifdef DEBUG 641#ifdef DEBUG
747/*
748 * Debug-only routine: without additional rw_semaphore APIs, we can
749 * now only answer requests regarding whether we hold the lock for write
750 * (reader state is outside our visibility, we only track writer state).
751 *
752 * Note: this means !xfs_isilocked would give false positives, so don't do that.
753 */
754int 642int
755xfs_isilocked( 643xfs_isilocked(
756 xfs_inode_t *ip, 644 xfs_inode_t *ip,
757 uint lock_flags) 645 uint lock_flags)
758{ 646{
759 if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == 647 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
760 XFS_ILOCK_EXCL) { 648 if (!(lock_flags & XFS_ILOCK_SHARED))
761 if (!ip->i_lock.mr_writer) 649 return !!ip->i_lock.mr_writer;
762 return 0; 650 return rwsem_is_locked(&ip->i_lock.mr_lock);
763 } 651 }
764 652
765 if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == 653 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
766 XFS_IOLOCK_EXCL) { 654 if (!(lock_flags & XFS_IOLOCK_SHARED))
767 if (!ip->i_iolock.mr_writer) 655 return !!ip->i_iolock.mr_writer;
768 return 0; 656 return rwsem_is_locked(&ip->i_iolock.mr_lock);
769 } 657 }
770 658
771 return 1; 659 ASSERT(0);
660 return 0;
772} 661}
773#endif 662#endif
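Net effect of the fs/xfs/xfs_iget.c hunks above: xfs_iget() loses its trailing xfs_daddr_t bno argument (xfs_imap() now always locates the inode buffer on its own), the xfs_iput()/xfs_iput_new() wrappers are gone in favour of open-coded unlock-and-release, and xfs_isilocked() can now answer shared-lock queries through rwsem_is_locked() instead of tracking only writer state. A minimal before/after sketch for a hypothetical caller (mp, tp and ino are assumed to be in scope; error handling is abbreviated):

	struct xfs_inode	*ip;
	int			error;

	/* before: trailing bno argument, 0 unless bulkstat knew the block */
	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_SHARED, &ip, 0);

	/* after: no bno argument */
	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;

	/* before: xfs_iput(ip, XFS_ILOCK_SHARED); after: open-coded */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	IRELE(ip);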
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd56447045..34798f391c49 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -27,13 +27,10 @@
27#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
28#include "xfs_sb.h" 28#include "xfs_sb.h"
29#include "xfs_ag.h" 29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h" 30#include "xfs_mount.h"
33#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
34#include "xfs_alloc_btree.h" 32#include "xfs_alloc_btree.h"
35#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
36#include "xfs_dir2_sf.h"
37#include "xfs_attr_sf.h" 34#include "xfs_attr_sf.h"
38#include "xfs_dinode.h" 35#include "xfs_dinode.h"
39#include "xfs_inode.h" 36#include "xfs_inode.h"
@@ -44,7 +41,6 @@
44#include "xfs_alloc.h" 41#include "xfs_alloc.h"
45#include "xfs_ialloc.h" 42#include "xfs_ialloc.h"
46#include "xfs_bmap.h" 43#include "xfs_bmap.h"
47#include "xfs_rw.h"
48#include "xfs_error.h" 44#include "xfs_error.h"
49#include "xfs_utils.h" 45#include "xfs_utils.h"
50#include "xfs_quota.h" 46#include "xfs_quota.h"
@@ -177,7 +173,7 @@ xfs_imap_to_bp(
177 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 173 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
178 XFS_ERRTAG_ITOBP_INOTOBP, 174 XFS_ERRTAG_ITOBP_INOTOBP,
179 XFS_RANDOM_ITOBP_INOTOBP))) { 175 XFS_RANDOM_ITOBP_INOTOBP))) {
180 if (iget_flags & XFS_IGET_BULKSTAT) { 176 if (iget_flags & XFS_IGET_UNTRUSTED) {
181 xfs_trans_brelse(tp, bp); 177 xfs_trans_brelse(tp, bp);
182 return XFS_ERROR(EINVAL); 178 return XFS_ERROR(EINVAL);
183 } 179 }
@@ -426,7 +422,7 @@ xfs_iformat(
426 if (!XFS_DFORK_Q(dip)) 422 if (!XFS_DFORK_Q(dip))
427 return 0; 423 return 0;
428 ASSERT(ip->i_afp == NULL); 424 ASSERT(ip->i_afp == NULL);
429 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 425 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
430 ip->i_afp->if_ext_max = 426 ip->i_afp->if_ext_max =
431 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 427 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
432 switch (dip->di_aformat) { 428 switch (dip->di_aformat) {
@@ -509,7 +505,7 @@ xfs_iformat_local(
509 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 505 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
510 else { 506 else {
511 real_size = roundup(size, 4); 507 real_size = roundup(size, 4);
512 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 508 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
513 } 509 }
514 ifp->if_bytes = size; 510 ifp->if_bytes = size;
515 ifp->if_real_bytes = real_size; 511 ifp->if_real_bytes = real_size;
@@ -636,7 +632,7 @@ xfs_iformat_btree(
636 } 632 }
637 633
638 ifp->if_broot_bytes = size; 634 ifp->if_broot_bytes = size;
639 ifp->if_broot = kmem_alloc(size, KM_SLEEP); 635 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
640 ASSERT(ifp->if_broot != NULL); 636 ASSERT(ifp->if_broot != NULL);
641 /* 637 /*
642 * Copy and convert from the on-disk structure 638 * Copy and convert from the on-disk structure
@@ -787,7 +783,6 @@ xfs_iread(
787 xfs_mount_t *mp, 783 xfs_mount_t *mp,
788 xfs_trans_t *tp, 784 xfs_trans_t *tp,
789 xfs_inode_t *ip, 785 xfs_inode_t *ip,
790 xfs_daddr_t bno,
791 uint iget_flags) 786 uint iget_flags)
792{ 787{
793 xfs_buf_t *bp; 788 xfs_buf_t *bp;
@@ -797,11 +792,9 @@ xfs_iread(
797 /* 792 /*
798 * Fill in the location information in the in-core inode. 793 * Fill in the location information in the in-core inode.
799 */ 794 */
800 ip->i_imap.im_blkno = bno;
801 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); 795 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
802 if (error) 796 if (error)
803 return error; 797 return error;
804 ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
805 798
806 /* 799 /*
807 * Get pointers to the on-disk inode and the buffer containing it. 800 * Get pointers to the on-disk inode and the buffer containing it.
@@ -925,7 +918,6 @@ xfs_iread_extents(
925 int error; 918 int error;
926 xfs_ifork_t *ifp; 919 xfs_ifork_t *ifp;
927 xfs_extnum_t nextents; 920 xfs_extnum_t nextents;
928 size_t size;
929 921
930 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 922 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
931 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 923 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
@@ -933,7 +925,6 @@ xfs_iread_extents(
933 return XFS_ERROR(EFSCORRUPTED); 925 return XFS_ERROR(EFSCORRUPTED);
934 } 926 }
935 nextents = XFS_IFORK_NEXTENTS(ip, whichfork); 927 nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
936 size = nextents * sizeof(xfs_bmbt_rec_t);
937 ifp = XFS_IFORK_PTR(ip, whichfork); 928 ifp = XFS_IFORK_PTR(ip, whichfork);
938 929
939 /* 930 /*
@@ -1229,7 +1220,7 @@ xfs_isize_check(
1229 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - 1220 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
1230 map_first), 1221 map_first),
1231 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, 1222 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
1232 NULL, NULL)) 1223 NULL))
1233 return; 1224 return;
1234 ASSERT(nimaps == 1); 1225 ASSERT(nimaps == 1);
1235 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1226 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1463,7 +1454,7 @@ xfs_itruncate_finish(
1463 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 1454 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1464 ASSERT(ip->i_transp == *tp); 1455 ASSERT(ip->i_transp == *tp);
1465 ASSERT(ip->i_itemp != NULL); 1456 ASSERT(ip->i_itemp != NULL);
1466 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); 1457 ASSERT(ip->i_itemp->ili_lock_flags == 0);
1467 1458
1468 1459
1469 ntp = *tp; 1460 ntp = *tp;
@@ -1592,11 +1583,10 @@ xfs_itruncate_finish(
1592 xfs_bmap_init(&free_list, &first_block); 1583 xfs_bmap_init(&free_list, &first_block);
1593 error = xfs_bunmapi(ntp, ip, 1584 error = xfs_bunmapi(ntp, ip,
1594 first_unmap_block, unmap_len, 1585 first_unmap_block, unmap_len,
1595 xfs_bmapi_aflag(fork) | 1586 xfs_bmapi_aflag(fork),
1596 (sync ? 0 : XFS_BMAPI_ASYNC),
1597 XFS_ITRUNC_MAX_EXTENTS, 1587 XFS_ITRUNC_MAX_EXTENTS,
1598 &first_block, &free_list, 1588 &first_block, &free_list,
1599 NULL, &done); 1589 &done);
1600 if (error) { 1590 if (error) {
1601 /* 1591 /*
1602 * If the bunmapi call encounters an error, 1592 * If the bunmapi call encounters an error,
@@ -1615,12 +1605,8 @@ xfs_itruncate_finish(
1615 */ 1605 */
1616 error = xfs_bmap_finish(tp, &free_list, &committed); 1606 error = xfs_bmap_finish(tp, &free_list, &committed);
1617 ntp = *tp; 1607 ntp = *tp;
1618 if (committed) { 1608 if (committed)
1619 /* link the inode into the next xact in the chain */ 1609 xfs_trans_ijoin(ntp, ip);
1620 xfs_trans_ijoin(ntp, ip,
1621 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1622 xfs_trans_ihold(ntp, ip);
1623 }
1624 1610
1625 if (error) { 1611 if (error) {
1626 /* 1612 /*
@@ -1649,9 +1635,7 @@ xfs_itruncate_finish(
1649 error = xfs_trans_commit(*tp, 0); 1635 error = xfs_trans_commit(*tp, 0);
1650 *tp = ntp; 1636 *tp = ntp;
1651 1637
1652 /* link the inode into the next transaction in the chain */ 1638 xfs_trans_ijoin(ntp, ip);
1653 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1654 xfs_trans_ihold(ntp, ip);
1655 1639
1656 if (error) 1640 if (error)
1657 return error; 1641 return error;
@@ -1930,6 +1914,11 @@ xfs_iunlink_remove(
1930 return 0; 1914 return 0;
1931} 1915}
1932 1916
1917/*
1918 * A big issue when freeing the inode cluster is that we _cannot_ skip any
1919 * inodes that are in memory - they all must be marked stale and attached to
1920 * the cluster buffer.
1921 */
1933STATIC void 1922STATIC void
1934xfs_ifree_cluster( 1923xfs_ifree_cluster(
1935 xfs_inode_t *free_ip, 1924 xfs_inode_t *free_ip,
@@ -1940,10 +1929,10 @@ xfs_ifree_cluster(
1940 int blks_per_cluster; 1929 int blks_per_cluster;
1941 int nbufs; 1930 int nbufs;
1942 int ninodes; 1931 int ninodes;
1943 int i, j, found, pre_flushed; 1932 int i, j;
1944 xfs_daddr_t blkno; 1933 xfs_daddr_t blkno;
1945 xfs_buf_t *bp; 1934 xfs_buf_t *bp;
1946 xfs_inode_t *ip, **ip_found; 1935 xfs_inode_t *ip;
1947 xfs_inode_log_item_t *iip; 1936 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1937 xfs_log_item_t *lip;
1949 struct xfs_perag *pag; 1938 struct xfs_perag *pag;
@@ -1960,109 +1949,91 @@ xfs_ifree_cluster(
1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 1949 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1961 } 1950 }
1962 1951
1963 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
1964
1965 for (j = 0; j < nbufs; j++, inum += ninodes) { 1952 for (j = 0; j < nbufs; j++, inum += ninodes) {
1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1953 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1967 XFS_INO_TO_AGBNO(mp, inum)); 1954 XFS_INO_TO_AGBNO(mp, inum));
1968 1955
1956 /*
1957 * We obtain and lock the backing buffer first in the process
1958 * here, as we have to ensure that any dirty inode that we
1959 * can't get the flush lock on is attached to the buffer.
1960 * If we scan the in-memory inodes first, then buffer IO can
1961 * complete before we get a lock on it, and hence we may fail
1962 * to mark all the active inodes on the buffer stale.
1963 */
1964 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1965 mp->m_bsize * blks_per_cluster,
1966 XBF_LOCK);
1967
1968 /*
1969 * Walk the inodes already attached to the buffer and mark them
1970 * stale. These will all have the flush locks held, so an
1971 * in-memory inode walk can't lock them. By marking them all
1972 * stale first, we will not attempt to lock them in the loop
1973 * below as the XFS_ISTALE flag will be set.
1974 */
1975 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1976 while (lip) {
1977 if (lip->li_type == XFS_LI_INODE) {
1978 iip = (xfs_inode_log_item_t *)lip;
1979 ASSERT(iip->ili_logged == 1);
1980 lip->li_cb = xfs_istale_done;
1981 xfs_trans_ail_copy_lsn(mp->m_ail,
1982 &iip->ili_flush_lsn,
1983 &iip->ili_item.li_lsn);
1984 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1985 }
1986 lip = lip->li_bio_list;
1987 }
1988
1969 1989
1970 /* 1990 /*
1971 * Look for each inode in memory and attempt to lock it, 1991 * For each inode in memory attempt to add it to the inode
1972 * we can be racing with flush and tail pushing here. 1992 * buffer and set it up for being staled on buffer IO
1973 * Any inode we get the locks on, add to an array of 1993 * completion. This is safe as we've locked out tail pushing
1974 * inode items to process later. 1994 * and flushing by locking the buffer.
1975 * 1995 *
1976 * To get the buffer lock, we could beat a flush 1996 * We have already marked every inode that was part of a
1977 * or tail pushing thread to the lock here, in which 1997 * transaction stale above, which means there is no point in
1978 * case they will go looking for the inode buffer 1998 * even trying to lock them.
1979 * and fail, so we need some other form of interlock
1980 * here.
1981 */ 1999 */
1982 found = 0;
1983 for (i = 0; i < ninodes; i++) { 2000 for (i = 0; i < ninodes; i++) {
2001retry:
1984 read_lock(&pag->pag_ici_lock); 2002 read_lock(&pag->pag_ici_lock);
1985 ip = radix_tree_lookup(&pag->pag_ici_root, 2003 ip = radix_tree_lookup(&pag->pag_ici_root,
1986 XFS_INO_TO_AGINO(mp, (inum + i))); 2004 XFS_INO_TO_AGINO(mp, (inum + i)));
1987 2005
1988 /* Inode not in memory or we found it already, 2006 /* Inode not in memory or stale, nothing to do */
1989 * nothing to do
1990 */
1991 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2007 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
1992 read_unlock(&pag->pag_ici_lock); 2008 read_unlock(&pag->pag_ici_lock);
1993 continue; 2009 continue;
1994 } 2010 }
1995 2011
1996 if (xfs_inode_clean(ip)) { 2012 /*
1997 read_unlock(&pag->pag_ici_lock); 2013 * Don't try to lock/unlock the current inode, but we
1998 continue; 2014 * _cannot_ skip the other inodes that we did not find
1999 } 2015 * in the list attached to the buffer and are not
2000 2016 * already marked stale. If we can't lock it, back off
2001 /* If we can get the locks then add it to the 2017 * and retry.
2002 * list, otherwise by the time we get the bp lock
2003 * below it will already be attached to the
2004 * inode buffer.
2005 */
2006
2007 /* This inode will already be locked - by us, let's
2008 * keep it that way.
2009 */ 2018 */
2010 2019 if (ip != free_ip &&
2011 if (ip == free_ip) { 2020 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2012 if (xfs_iflock_nowait(ip)) {
2013 xfs_iflags_set(ip, XFS_ISTALE);
2014 if (xfs_inode_clean(ip)) {
2015 xfs_ifunlock(ip);
2016 } else {
2017 ip_found[found++] = ip;
2018 }
2019 }
2020 read_unlock(&pag->pag_ici_lock); 2021 read_unlock(&pag->pag_ici_lock);
2021 continue; 2022 delay(1);
2022 } 2023 goto retry;
2023
2024 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2025 if (xfs_iflock_nowait(ip)) {
2026 xfs_iflags_set(ip, XFS_ISTALE);
2027
2028 if (xfs_inode_clean(ip)) {
2029 xfs_ifunlock(ip);
2030 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2031 } else {
2032 ip_found[found++] = ip;
2033 }
2034 } else {
2035 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2036 }
2037 } 2024 }
2038 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2039 }
2040 2026
2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2027 xfs_iflock(ip);
2042 mp->m_bsize * blks_per_cluster, 2028 xfs_iflags_set(ip, XFS_ISTALE);
2043 XBF_LOCK);
2044 2029
2045 pre_flushed = 0; 2030 /*
2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2031 * we don't need to attach clean inodes or those only
2047 while (lip) { 2032 * with unlogged changes (which we throw away, anyway).
2048 if (lip->li_type == XFS_LI_INODE) { 2033 */
2049 iip = (xfs_inode_log_item_t *)lip;
2050 ASSERT(iip->ili_logged == 1);
2051 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2052 xfs_trans_ail_copy_lsn(mp->m_ail,
2053 &iip->ili_flush_lsn,
2054 &iip->ili_item.li_lsn);
2055 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2056 pre_flushed++;
2057 }
2058 lip = lip->li_bio_list;
2059 }
2060
2061 for (i = 0; i < found; i++) {
2062 ip = ip_found[i];
2063 iip = ip->i_itemp; 2034 iip = ip->i_itemp;
2064 2035 if (!iip || xfs_inode_clean(ip)) {
2065 if (!iip) { 2036 ASSERT(ip != free_ip);
2066 ip->i_update_core = 0; 2037 ip->i_update_core = 0;
2067 xfs_ifunlock(ip); 2038 xfs_ifunlock(ip);
2068 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2039 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -2075,20 +2046,17 @@ xfs_ifree_cluster(
2075 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2046 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2076 &iip->ili_item.li_lsn); 2047 &iip->ili_item.li_lsn);
2077 2048
2078 xfs_buf_attach_iodone(bp, 2049 xfs_buf_attach_iodone(bp, xfs_istale_done,
2079 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2050 &iip->ili_item);
2080 xfs_istale_done, (xfs_log_item_t *)iip); 2051
2081 if (ip != free_ip) { 2052 if (ip != free_ip)
2082 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2053 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2083 }
2084 } 2054 }
2085 2055
2086 if (found || pre_flushed) 2056 xfs_trans_stale_inode_buf(tp, bp);
2087 xfs_trans_stale_inode_buf(tp, bp);
2088 xfs_trans_binval(tp, bp); 2057 xfs_trans_binval(tp, bp);
2089 } 2058 }
2090 2059
2091 kmem_free(ip_found);
2092 xfs_perag_put(pag); 2060 xfs_perag_put(pag);
2093} 2061}
2094 2062
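The xfs_ifree_cluster() rework above fixes an ordering problem: the old code scanned the in-memory inodes before taking the cluster buffer lock, so buffer I/O could complete in that window and active inodes could escape being marked stale. The new ordering, as a condensed outline of the code above (not a drop-in function; steps 2 and 3 paraphrase the new comments rather than name real helpers):

	/* 1: lock the cluster buffer first to shut out flush/tail pushing */
	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
			       mp->m_bsize * blks_per_cluster, XBF_LOCK);

	/* 2: walk the log items already attached to bp and mark those
	 *    inodes XFS_ISTALE; they hold flush locks, so step 3 skips them */

	/* 3: for every other in-memory inode of the cluster: trylock with
	 *    xfs_ilock_nowait(), backing off via delay(1) and retrying on
	 *    failure, then xfs_iflock() and xfs_iflags_set(ip, XFS_ISTALE) */

	/* 4: finally stale and invalidate the buffer itself */
	xfs_trans_stale_inode_buf(tp, bp);
	xfs_trans_binval(tp, bp);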
@@ -2224,7 +2192,7 @@ xfs_iroot_realloc(
2224 */ 2192 */
2225 if (ifp->if_broot_bytes == 0) { 2193 if (ifp->if_broot_bytes == 0) {
2226 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 2194 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2227 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP); 2195 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2228 ifp->if_broot_bytes = (int)new_size; 2196 ifp->if_broot_bytes = (int)new_size;
2229 return; 2197 return;
2230 } 2198 }
@@ -2240,7 +2208,7 @@ xfs_iroot_realloc(
2240 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2208 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2241 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 2209 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2242 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2210 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2243 KM_SLEEP); 2211 KM_SLEEP | KM_NOFS);
2244 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2212 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2245 ifp->if_broot_bytes); 2213 ifp->if_broot_bytes);
2246 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2214 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -2266,7 +2234,7 @@ xfs_iroot_realloc(
2266 else 2234 else
2267 new_size = 0; 2235 new_size = 0;
2268 if (new_size > 0) { 2236 if (new_size > 0) {
2269 new_broot = kmem_alloc(new_size, KM_SLEEP); 2237 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2270 /* 2238 /*
2271 * First copy over the btree block header. 2239 * First copy over the btree block header.
2272 */ 2240 */
@@ -2370,7 +2338,8 @@ xfs_idata_realloc(
2370 real_size = roundup(new_size, 4); 2338 real_size = roundup(new_size, 4);
2371 if (ifp->if_u1.if_data == NULL) { 2339 if (ifp->if_u1.if_data == NULL) {
2372 ASSERT(ifp->if_real_bytes == 0); 2340 ASSERT(ifp->if_real_bytes == 0);
2373 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2341 ifp->if_u1.if_data = kmem_alloc(real_size,
2342 KM_SLEEP | KM_NOFS);
2374 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2343 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2375 /* 2344 /*
2376 * Only do the realloc if the underlying size 2345 * Only do the realloc if the underlying size
@@ -2381,11 +2350,12 @@ xfs_idata_realloc(
2381 kmem_realloc(ifp->if_u1.if_data, 2350 kmem_realloc(ifp->if_u1.if_data,
2382 real_size, 2351 real_size,
2383 ifp->if_real_bytes, 2352 ifp->if_real_bytes,
2384 KM_SLEEP); 2353 KM_SLEEP | KM_NOFS);
2385 } 2354 }
2386 } else { 2355 } else {
2387 ASSERT(ifp->if_real_bytes == 0); 2356 ASSERT(ifp->if_real_bytes == 0);
2388 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2357 ifp->if_u1.if_data = kmem_alloc(real_size,
2358 KM_SLEEP | KM_NOFS);
2389 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2359 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2390 ifp->if_bytes); 2360 ifp->if_bytes);
2391 } 2361 }
@@ -2449,6 +2419,8 @@ xfs_iunpin_nowait(
2449{ 2419{
2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2420 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2451 2421
2422 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2423
2452 /* Give the log a push to start the unpinning I/O */ 2424 /* Give the log a push to start the unpinning I/O */
2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2425 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2454 2426
@@ -2647,8 +2619,6 @@ xfs_iflush_cluster(
2647 int i; 2619 int i;
2648 2620
2649 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2621 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2650 ASSERT(pag->pagi_inodeok);
2651 ASSERT(pag->pag_ici_init);
2652 2622
2653 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2623 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2654 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2624 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
@@ -2752,7 +2722,6 @@ cluster_corrupt_out:
2752 * mark it as stale and brelse. 2722 * mark it as stale and brelse.
2753 */ 2723 */
2754 if (XFS_BUF_IODONE_FUNC(bp)) { 2724 if (XFS_BUF_IODONE_FUNC(bp)) {
2755 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
2756 XFS_BUF_UNDONE(bp); 2725 XFS_BUF_UNDONE(bp);
2757 XFS_BUF_STALE(bp); 2726 XFS_BUF_STALE(bp);
2758 XFS_BUF_ERROR(bp,EIO); 2727 XFS_BUF_ERROR(bp,EIO);
@@ -3090,8 +3059,7 @@ xfs_iflush_int(
3090 * and unlock the inode's flush lock when the inode is 3059 * and unlock the inode's flush lock when the inode is
3091 * completely written to disk. 3060 * completely written to disk.
3092 */ 3061 */
3093 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) 3062 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3094 xfs_iflush_done, (xfs_log_item_t *)iip);
3095 3063
3096 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 3064 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
3097 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 3065 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
@@ -3535,13 +3503,11 @@ xfs_iext_remove_indirect(
3535 xfs_extnum_t ext_diff; /* extents to remove in current list */ 3503 xfs_extnum_t ext_diff; /* extents to remove in current list */
3536 xfs_extnum_t nex1; /* number of extents before idx */ 3504 xfs_extnum_t nex1; /* number of extents before idx */
3537 xfs_extnum_t nex2; /* extents after idx + count */ 3505 xfs_extnum_t nex2; /* extents after idx + count */
3538 int nlists; /* entries in indirection array */
3539 int page_idx = idx; /* index in target extent list */ 3506 int page_idx = idx; /* index in target extent list */
3540 3507
3541 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3508 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3542 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3509 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3543 ASSERT(erp != NULL); 3510 ASSERT(erp != NULL);
3544 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3545 nex1 = page_idx; 3511 nex1 = page_idx;
3546 ext_cnt = count; 3512 ext_cnt = count;
3547 while (ext_cnt) { 3513 while (ext_cnt) {
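One recurring change in the fs/xfs/xfs_inode.c hunks above is the switch from KM_SLEEP to KM_SLEEP | KM_NOFS for the inode fork allocations in xfs_iformat(), xfs_iformat_local(), xfs_iformat_btree(), xfs_iroot_realloc() and xfs_idata_realloc(). These allocations run with inode locks held, and KM_NOFS (XFS's equivalent of a GFP_NOFS allocation) keeps direct reclaim from re-entering the filesystem and deadlocking on those same locks. The pattern, as it now reads:

	/*
	 * Allocating fork data with the inode locked: memory reclaim
	 * must not recurse back into the filesystem here.
	 */
	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);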
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9965e40a4615..0898c5417d12 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -442,9 +442,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
442 * xfs_iget.c prototypes. 442 * xfs_iget.c prototypes.
443 */ 443 */
444int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 444int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
445 uint, uint, xfs_inode_t **, xfs_daddr_t); 445 uint, uint, xfs_inode_t **);
446void xfs_iput(xfs_inode_t *, uint);
447void xfs_iput_new(xfs_inode_t *, uint);
448void xfs_ilock(xfs_inode_t *, uint); 446void xfs_ilock(xfs_inode_t *, uint);
449int xfs_ilock_nowait(xfs_inode_t *, uint); 447int xfs_ilock_nowait(xfs_inode_t *, uint);
450void xfs_iunlock(xfs_inode_t *, uint); 448void xfs_iunlock(xfs_inode_t *, uint);
@@ -452,7 +450,7 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
452int xfs_isilocked(xfs_inode_t *, uint); 450int xfs_isilocked(xfs_inode_t *, uint);
453uint xfs_ilock_map_shared(xfs_inode_t *); 451uint xfs_ilock_map_shared(xfs_inode_t *);
454void xfs_iunlock_map_shared(xfs_inode_t *, uint); 452void xfs_iunlock_map_shared(xfs_inode_t *, uint);
455void xfs_ireclaim(xfs_inode_t *); 453void xfs_inode_free(struct xfs_inode *ip);
456 454
457/* 455/*
458 * xfs_inode.c prototypes. 456 * xfs_inode.c prototypes.
@@ -500,7 +498,7 @@ do { \
500 * Flags for xfs_iget() 498 * Flags for xfs_iget()
501 */ 499 */
502#define XFS_IGET_CREATE 0x1 500#define XFS_IGET_CREATE 0x1
503#define XFS_IGET_BULKSTAT 0x2 501#define XFS_IGET_UNTRUSTED 0x2
504 502
505int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, 503int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
506 xfs_ino_t, struct xfs_dinode **, 504 xfs_ino_t, struct xfs_dinode **,
@@ -509,7 +507,7 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
509 struct xfs_inode *, struct xfs_dinode **, 507 struct xfs_inode *, struct xfs_dinode **,
510 struct xfs_buf **, uint); 508 struct xfs_buf **, uint);
511int xfs_iread(struct xfs_mount *, struct xfs_trans *, 509int xfs_iread(struct xfs_mount *, struct xfs_trans *,
512 struct xfs_inode *, xfs_daddr_t, uint); 510 struct xfs_inode *, uint);
513void xfs_dinode_to_disk(struct xfs_dinode *, 511void xfs_dinode_to_disk(struct xfs_dinode *,
514 struct xfs_icdinode *); 512 struct xfs_icdinode *);
515void xfs_idestroy_fork(struct xfs_inode *, int); 513void xfs_idestroy_fork(struct xfs_inode *, int);
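The XFS_IGET_BULKSTAT to XFS_IGET_UNTRUSTED rename above matches the xfs_imap_to_bp() hunk earlier in this diff: the flag marks an inode number that came from an untrusted source (bulkstat is just one such caller), so the lookup must validate it against the on-disk metadata instead of assuming it is allocated. A hypothetical lookup of a user-supplied inode number might read as follows (sketch only; the caller context is illustrative, and note that this code base returns positive error codes):

	struct xfs_inode	*ip;
	int			error;

	/* ino came from userspace: force full validation in xfs_imap() */
	error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED,
			 XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;	/* EINVAL/ENOENT: not a valid inode number */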
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea8540159..fe00777e2796 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -22,30 +22,26 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_buf_item.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h" 26#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h" 27#include "xfs_mount.h"
31#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
32#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 30#include "xfs_dinode.h"
38#include "xfs_inode.h" 31#include "xfs_inode.h"
39#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
40#include "xfs_btree.h"
41#include "xfs_ialloc.h"
42#include "xfs_rw.h"
43#include "xfs_error.h" 33#include "xfs_error.h"
44#include "xfs_trace.h" 34#include "xfs_trace.h"
45 35
46 36
47kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 37kmem_zone_t *xfs_ili_zone; /* inode log item zone */
48 38
39static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
40{
41 return container_of(lip, struct xfs_inode_log_item, ili_item);
42}
43
44
49/* 45/*
50 * This returns the number of iovecs needed to log the given inode item. 46 * This returns the number of iovecs needed to log the given inode item.
51 * 47 *
@@ -55,13 +51,11 @@ kmem_zone_t *xfs_ili_zone; /* inode log item zone */
55 */ 51 */
56STATIC uint 52STATIC uint
57xfs_inode_item_size( 53xfs_inode_item_size(
58 xfs_inode_log_item_t *iip) 54 struct xfs_log_item *lip)
59{ 55{
60 uint nvecs; 56 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
61 xfs_inode_t *ip; 57 struct xfs_inode *ip = iip->ili_inode;
62 58 uint nvecs = 2;
63 ip = iip->ili_inode;
64 nvecs = 2;
65 59
66 /* 60 /*
67 * Only log the data/extents/b-tree root if there is something 61 * Only log the data/extents/b-tree root if there is something
@@ -212,21 +206,17 @@ xfs_inode_item_size(
212 */ 206 */
213STATIC void 207STATIC void
214xfs_inode_item_format( 208xfs_inode_item_format(
215 xfs_inode_log_item_t *iip, 209 struct xfs_log_item *lip,
216 xfs_log_iovec_t *log_vector) 210 struct xfs_log_iovec *vecp)
217{ 211{
212 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
213 struct xfs_inode *ip = iip->ili_inode;
218 uint nvecs; 214 uint nvecs;
219 xfs_log_iovec_t *vecp;
220 xfs_inode_t *ip;
221 size_t data_bytes; 215 size_t data_bytes;
222 xfs_bmbt_rec_t *ext_buffer; 216 xfs_bmbt_rec_t *ext_buffer;
223 int nrecs;
224 xfs_mount_t *mp; 217 xfs_mount_t *mp;
225 218
226 ip = iip->ili_inode; 219 vecp->i_addr = &iip->ili_format;
227 vecp = log_vector;
228
229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
230 vecp->i_len = sizeof(xfs_inode_log_format_t); 220 vecp->i_len = sizeof(xfs_inode_log_format_t);
231 vecp->i_type = XLOG_REG_TYPE_IFORMAT; 221 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
232 vecp++; 222 vecp++;
@@ -277,7 +267,7 @@ xfs_inode_item_format(
277 */ 267 */
278 xfs_synchronize_times(ip); 268 xfs_synchronize_times(ip);
279 269
280 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 270 vecp->i_addr = &ip->i_d;
281 vecp->i_len = sizeof(struct xfs_icdinode); 271 vecp->i_len = sizeof(struct xfs_icdinode);
282 vecp->i_type = XLOG_REG_TYPE_ICORE; 272 vecp->i_type = XLOG_REG_TYPE_ICORE;
283 vecp++; 273 vecp++;
@@ -323,18 +313,17 @@ xfs_inode_item_format(
323 ASSERT(ip->i_df.if_u1.if_extents != NULL); 313 ASSERT(ip->i_df.if_u1.if_extents != NULL);
324 ASSERT(ip->i_d.di_nextents > 0); 314 ASSERT(ip->i_d.di_nextents > 0);
325 ASSERT(iip->ili_extents_buf == NULL); 315 ASSERT(iip->ili_extents_buf == NULL);
326 nrecs = ip->i_df.if_bytes / 316 ASSERT((ip->i_df.if_bytes /
327 (uint)sizeof(xfs_bmbt_rec_t); 317 (uint)sizeof(xfs_bmbt_rec_t)) > 0);
328 ASSERT(nrecs > 0);
329#ifdef XFS_NATIVE_HOST 318#ifdef XFS_NATIVE_HOST
330 if (nrecs == ip->i_d.di_nextents) { 319 if (ip->i_d.di_nextents == ip->i_df.if_bytes /
320 (uint)sizeof(xfs_bmbt_rec_t)) {
331 /* 321 /*
332 * There are no delayed allocation 322 * There are no delayed allocation
333 * extents, so just point to the 323 * extents, so just point to the
334 * real extents array. 324 * real extents array.
335 */ 325 */
336 vecp->i_addr = 326 vecp->i_addr = ip->i_df.if_u1.if_extents;
337 (char *)(ip->i_df.if_u1.if_extents);
338 vecp->i_len = ip->i_df.if_bytes; 327 vecp->i_len = ip->i_df.if_bytes;
339 vecp->i_type = XLOG_REG_TYPE_IEXT; 328 vecp->i_type = XLOG_REG_TYPE_IEXT;
340 } else 329 } else
@@ -352,7 +341,7 @@ xfs_inode_item_format(
352 ext_buffer = kmem_alloc(ip->i_df.if_bytes, 341 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
353 KM_SLEEP); 342 KM_SLEEP);
354 iip->ili_extents_buf = ext_buffer; 343 iip->ili_extents_buf = ext_buffer;
355 vecp->i_addr = (xfs_caddr_t)ext_buffer; 344 vecp->i_addr = ext_buffer;
356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 345 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
357 XFS_DATA_FORK); 346 XFS_DATA_FORK);
358 vecp->i_type = XLOG_REG_TYPE_IEXT; 347 vecp->i_type = XLOG_REG_TYPE_IEXT;
@@ -371,7 +360,7 @@ xfs_inode_item_format(
371 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { 360 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
372 ASSERT(ip->i_df.if_broot_bytes > 0); 361 ASSERT(ip->i_df.if_broot_bytes > 0);
373 ASSERT(ip->i_df.if_broot != NULL); 362 ASSERT(ip->i_df.if_broot != NULL);
374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 363 vecp->i_addr = ip->i_df.if_broot;
375 vecp->i_len = ip->i_df.if_broot_bytes; 364 vecp->i_len = ip->i_df.if_broot_bytes;
376 vecp->i_type = XLOG_REG_TYPE_IBROOT; 365 vecp->i_type = XLOG_REG_TYPE_IBROOT;
377 vecp++; 366 vecp++;
@@ -389,7 +378,7 @@ xfs_inode_item_format(
389 ASSERT(ip->i_df.if_u1.if_data != NULL); 378 ASSERT(ip->i_df.if_u1.if_data != NULL);
390 ASSERT(ip->i_d.di_size > 0); 379 ASSERT(ip->i_d.di_size > 0);
391 380
392 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data; 381 vecp->i_addr = ip->i_df.if_u1.if_data;
393 /* 382 /*
394 * Round i_bytes up to a word boundary. 383 * Round i_bytes up to a word boundary.
395 * The underlying memory is guaranteed to 384 * The underlying memory is guaranteed to
@@ -437,7 +426,7 @@ xfs_inode_item_format(
437 * Assert that no attribute-related log flags are set. 426 * Assert that no attribute-related log flags are set.
438 */ 427 */
439 if (!XFS_IFORK_Q(ip)) { 428 if (!XFS_IFORK_Q(ip)) {
440 ASSERT(nvecs == iip->ili_item.li_desc->lid_size); 429 ASSERT(nvecs == lip->li_desc->lid_size);
441 iip->ili_format.ilf_size = nvecs; 430 iip->ili_format.ilf_size = nvecs;
442 ASSERT(!(iip->ili_format.ilf_fields & 431 ASSERT(!(iip->ili_format.ilf_fields &
443 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 432 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
@@ -449,21 +438,21 @@ xfs_inode_item_format(
449 ASSERT(!(iip->ili_format.ilf_fields & 438 ASSERT(!(iip->ili_format.ilf_fields &
450 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); 439 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
451 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { 440 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
452 ASSERT(ip->i_afp->if_bytes > 0);
453 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
454 ASSERT(ip->i_d.di_anextents > 0);
455#ifdef DEBUG 441#ifdef DEBUG
456 nrecs = ip->i_afp->if_bytes / 442 int nrecs = ip->i_afp->if_bytes /
457 (uint)sizeof(xfs_bmbt_rec_t); 443 (uint)sizeof(xfs_bmbt_rec_t);
458#endif
459 ASSERT(nrecs > 0); 444 ASSERT(nrecs > 0);
460 ASSERT(nrecs == ip->i_d.di_anextents); 445 ASSERT(nrecs == ip->i_d.di_anextents);
446 ASSERT(ip->i_afp->if_bytes > 0);
447 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
448 ASSERT(ip->i_d.di_anextents > 0);
449#endif
461#ifdef XFS_NATIVE_HOST 450#ifdef XFS_NATIVE_HOST
462 /* 451 /*
463 * There are not delayed allocation extents 452 * There are not delayed allocation extents
464 * for attributes, so just point at the array. 453 * for attributes, so just point at the array.
465 */ 454 */
466 vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents); 455 vecp->i_addr = ip->i_afp->if_u1.if_extents;
467 vecp->i_len = ip->i_afp->if_bytes; 456 vecp->i_len = ip->i_afp->if_bytes;
468#else 457#else
469 ASSERT(iip->ili_aextents_buf == NULL); 458 ASSERT(iip->ili_aextents_buf == NULL);
@@ -473,7 +462,7 @@ xfs_inode_item_format(
473 ext_buffer = kmem_alloc(ip->i_afp->if_bytes, 462 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
474 KM_SLEEP); 463 KM_SLEEP);
475 iip->ili_aextents_buf = ext_buffer; 464 iip->ili_aextents_buf = ext_buffer;
476 vecp->i_addr = (xfs_caddr_t)ext_buffer; 465 vecp->i_addr = ext_buffer;
477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 466 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
478 XFS_ATTR_FORK); 467 XFS_ATTR_FORK);
479#endif 468#endif
@@ -490,7 +479,7 @@ xfs_inode_item_format(
490 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { 479 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
491 ASSERT(ip->i_afp->if_broot_bytes > 0); 480 ASSERT(ip->i_afp->if_broot_bytes > 0);
492 ASSERT(ip->i_afp->if_broot != NULL); 481 ASSERT(ip->i_afp->if_broot != NULL);
493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 482 vecp->i_addr = ip->i_afp->if_broot;
494 vecp->i_len = ip->i_afp->if_broot_bytes; 483 vecp->i_len = ip->i_afp->if_broot_bytes;
495 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 484 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
496 vecp++; 485 vecp++;
@@ -506,7 +495,7 @@ xfs_inode_item_format(
506 ASSERT(ip->i_afp->if_bytes > 0); 495 ASSERT(ip->i_afp->if_bytes > 0);
507 ASSERT(ip->i_afp->if_u1.if_data != NULL); 496 ASSERT(ip->i_afp->if_u1.if_data != NULL);
508 497
509 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data; 498 vecp->i_addr = ip->i_afp->if_u1.if_data;
510 /* 499 /*
511 * Round i_bytes up to a word boundary. 500 * Round i_bytes up to a word boundary.
512 * The underlying memory is guaranteed to 501 * The underlying memory is guaranteed to
@@ -528,7 +517,7 @@ xfs_inode_item_format(
528 break; 517 break;
529 } 518 }
530 519
531 ASSERT(nvecs == iip->ili_item.li_desc->lid_size); 520 ASSERT(nvecs == lip->li_desc->lid_size);
532 iip->ili_format.ilf_size = nvecs; 521 iip->ili_format.ilf_size = nvecs;
533} 522}
534 523
@@ -539,11 +528,14 @@ xfs_inode_item_format(
539 */ 528 */
540STATIC void 529STATIC void
541xfs_inode_item_pin( 530xfs_inode_item_pin(
542 xfs_inode_log_item_t *iip) 531 struct xfs_log_item *lip)
543{ 532{
544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 533 struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
545 534
546 atomic_inc(&iip->ili_inode->i_pincount); 535 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
536
537 trace_xfs_inode_pin(ip, _RET_IP_);
538 atomic_inc(&ip->i_pincount);
547} 539}
548 540
549 541
@@ -553,28 +545,19 @@ xfs_inode_item_pin(
553 * 545 *
554 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. 546 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
555 */ 547 */
556/* ARGSUSED */
557STATIC void 548STATIC void
558xfs_inode_item_unpin( 549xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 550 struct xfs_log_item *lip,
560 int stale) 551 int remove)
561{ 552{
562 struct xfs_inode *ip = iip->ili_inode; 553 struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
563 554
555 trace_xfs_inode_unpin(ip, _RET_IP_);
564 ASSERT(atomic_read(&ip->i_pincount) > 0); 556 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount)) 557 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait); 558 wake_up(&ip->i_ipin_wait);
567} 559}
568 560
569/* ARGSUSED */
570STATIC void
571xfs_inode_item_unpin_remove(
572 xfs_inode_log_item_t *iip,
573 xfs_trans_t *tp)
574{
575 xfs_inode_item_unpin(iip, 0);
576}
577
578/* 561/*
579 * This is called to attempt to lock the inode associated with this 562 * This is called to attempt to lock the inode associated with this
580 * inode log item, in preparation for the push routine which does the actual 563 * inode log item, in preparation for the push routine which does the actual
@@ -590,19 +573,16 @@ xfs_inode_item_unpin_remove(
590 */ 573 */
591STATIC uint 574STATIC uint
592xfs_inode_item_trylock( 575xfs_inode_item_trylock(
593 xfs_inode_log_item_t *iip) 576 struct xfs_log_item *lip)
594{ 577{
595 register xfs_inode_t *ip; 578 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
596 579 struct xfs_inode *ip = iip->ili_inode;
597 ip = iip->ili_inode;
598 580
599 if (xfs_ipincount(ip) > 0) { 581 if (xfs_ipincount(ip) > 0)
600 return XFS_ITEM_PINNED; 582 return XFS_ITEM_PINNED;
601 }
602 583
603 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 584 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
604 return XFS_ITEM_LOCKED; 585 return XFS_ITEM_LOCKED;
605 }
606 586
607 if (!xfs_iflock_nowait(ip)) { 587 if (!xfs_iflock_nowait(ip)) {
608 /* 588 /*
@@ -628,7 +608,7 @@ xfs_inode_item_trylock(
628 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 608 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
629 ASSERT(iip->ili_format.ilf_fields != 0); 609 ASSERT(iip->ili_format.ilf_fields != 0);
630 ASSERT(iip->ili_logged == 0); 610 ASSERT(iip->ili_logged == 0);
631 ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL); 611 ASSERT(lip->li_flags & XFS_LI_IN_AIL);
632 } 612 }
633#endif 613#endif
634 return XFS_ITEM_SUCCESS; 614 return XFS_ITEM_SUCCESS;
@@ -642,26 +622,18 @@ xfs_inode_item_trylock(
642 */ 622 */
643STATIC void 623STATIC void
644xfs_inode_item_unlock( 624xfs_inode_item_unlock(
645 xfs_inode_log_item_t *iip) 625 struct xfs_log_item *lip)
646{ 626{
647 uint hold; 627 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
648 uint iolocked; 628 struct xfs_inode *ip = iip->ili_inode;
649 uint lock_flags; 629 unsigned short lock_flags;
650 xfs_inode_t *ip;
651 630
652 ASSERT(iip != NULL);
653 ASSERT(iip->ili_inode->i_itemp != NULL); 631 ASSERT(iip->ili_inode->i_itemp != NULL);
654 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 632 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
655 ASSERT((!(iip->ili_inode->i_itemp->ili_flags & 633
656 XFS_ILI_IOLOCKED_EXCL)) ||
657 xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL));
658 ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
659 XFS_ILI_IOLOCKED_SHARED)) ||
660 xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED));
661 /* 634 /*
662 * Clear the transaction pointer in the inode. 635 * Clear the transaction pointer in the inode.
663 */ 636 */
664 ip = iip->ili_inode;
665 ip->i_transp = NULL; 637 ip->i_transp = NULL;
666 638
667 /* 639 /*
@@ -685,34 +657,11 @@ xfs_inode_item_unlock(
685 iip->ili_aextents_buf = NULL; 657 iip->ili_aextents_buf = NULL;
686 } 658 }
687 659
688 /* 660 lock_flags = iip->ili_lock_flags;
689 * Figure out if we should unlock the inode or not. 661 iip->ili_lock_flags = 0;
690 */ 662 if (lock_flags) {
691 hold = iip->ili_flags & XFS_ILI_HOLD; 663 xfs_iunlock(iip->ili_inode, lock_flags);
692 664 IRELE(iip->ili_inode);
693 /*
694 * Before clearing out the flags, remember whether we
695 * are holding the inode's IO lock.
696 */
697 iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
698
699 /*
700 * Clear out the fields of the inode log item particular
701 * to the current transaction.
702 */
703 iip->ili_flags = 0;
704
705 /*
706 * Unlock the inode if XFS_ILI_HOLD was not set.
707 */
708 if (!hold) {
709 lock_flags = XFS_ILOCK_EXCL;
710 if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
711 lock_flags |= XFS_IOLOCK_EXCL;
712 } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
713 lock_flags |= XFS_IOLOCK_SHARED;
714 }
715 xfs_iput(iip->ili_inode, lock_flags);
716 } 665 }
717} 666}
718 667
@@ -724,13 +673,12 @@ xfs_inode_item_unlock(
724 * is the only one that matters. Therefore, simply return the 673 * is the only one that matters. Therefore, simply return the
725 * given lsn. 674 * given lsn.
726 */ 675 */
727/*ARGSUSED*/
728STATIC xfs_lsn_t 676STATIC xfs_lsn_t
729xfs_inode_item_committed( 677xfs_inode_item_committed(
730 xfs_inode_log_item_t *iip, 678 struct xfs_log_item *lip,
731 xfs_lsn_t lsn) 679 xfs_lsn_t lsn)
732{ 680{
733 return (lsn); 681 return lsn;
734} 682}
735 683
736/* 684/*
@@ -742,13 +690,12 @@ xfs_inode_item_committed(
742 */ 690 */
743STATIC void 691STATIC void
744xfs_inode_item_pushbuf( 692xfs_inode_item_pushbuf(
745 xfs_inode_log_item_t *iip) 693 struct xfs_log_item *lip)
746{ 694{
747 xfs_inode_t *ip; 695 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
748 xfs_mount_t *mp; 696 struct xfs_inode *ip = iip->ili_inode;
749 xfs_buf_t *bp; 697 struct xfs_buf *bp;
750 698
751 ip = iip->ili_inode;
752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 699 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
753 700
754 /* 701 /*
@@ -756,14 +703,13 @@ xfs_inode_item_pushbuf(
756 * inode was taken off the AIL. So, just get out. 703 * inode was taken off the AIL. So, just get out.
757 */ 704 */
758 if (completion_done(&ip->i_flush) || 705 if (completion_done(&ip->i_flush) ||
759 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 706 !(lip->li_flags & XFS_LI_IN_AIL)) {
760 xfs_iunlock(ip, XFS_ILOCK_SHARED); 707 xfs_iunlock(ip, XFS_ILOCK_SHARED);
761 return; 708 return;
762 } 709 }
763 710
764 mp = ip->i_mount; 711 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
765 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 712 iip->ili_format.ilf_len, XBF_TRYLOCK);
766 iip->ili_format.ilf_len, XBF_TRYLOCK);
767 713
768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 714 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769 if (!bp) 715 if (!bp)
@@ -771,10 +717,8 @@ xfs_inode_item_pushbuf(
771 if (XFS_BUF_ISDELAYWRITE(bp)) 717 if (XFS_BUF_ISDELAYWRITE(bp))
772 xfs_buf_delwri_promote(bp); 718 xfs_buf_delwri_promote(bp);
773 xfs_buf_relse(bp); 719 xfs_buf_relse(bp);
774 return;
775} 720}
776 721
777
778/* 722/*
779 * This is called to asynchronously write the inode associated with this 723 * This is called to asynchronously write the inode associated with this
780 * inode log item out to disk. The inode will already have been locked by 724 * inode log item out to disk. The inode will already have been locked by
@@ -782,14 +726,14 @@ xfs_inode_item_pushbuf(
782 */ 726 */
783STATIC void 727STATIC void
784xfs_inode_item_push( 728xfs_inode_item_push(
785 xfs_inode_log_item_t *iip) 729 struct xfs_log_item *lip)
786{ 730{
787 xfs_inode_t *ip; 731 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
788 732 struct xfs_inode *ip = iip->ili_inode;
789 ip = iip->ili_inode;
790 733
791 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 734 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
792 ASSERT(!completion_done(&ip->i_flush)); 735 ASSERT(!completion_done(&ip->i_flush));
736
793 /* 737 /*
794 * Since we were able to lock the inode's flush lock and 738 * Since we were able to lock the inode's flush lock and
795 * we found it on the AIL, the inode must be dirty. This 739 * we found it on the AIL, the inode must be dirty. This
@@ -812,43 +756,34 @@ xfs_inode_item_push(
812 */ 756 */
813 (void) xfs_iflush(ip, 0); 757 (void) xfs_iflush(ip, 0);
814 xfs_iunlock(ip, XFS_ILOCK_SHARED); 758 xfs_iunlock(ip, XFS_ILOCK_SHARED);
815
816 return;
817} 759}
818 760
819/* 761/*
820 * XXX rcc - this one really has to do something. Probably needs 762 * XXX rcc - this one really has to do something. Probably needs
821 * to stamp in a new field in the incore inode. 763 * to stamp in a new field in the incore inode.
822 */ 764 */
823/* ARGSUSED */
824STATIC void 765STATIC void
825xfs_inode_item_committing( 766xfs_inode_item_committing(
826 xfs_inode_log_item_t *iip, 767 struct xfs_log_item *lip,
827 xfs_lsn_t lsn) 768 xfs_lsn_t lsn)
828{ 769{
829 iip->ili_last_lsn = lsn; 770 INODE_ITEM(lip)->ili_last_lsn = lsn;
830 return;
831} 771}
832 772
833/* 773/*
834 * This is the ops vector shared by all inode log items. 774 * This is the ops vector shared by all inode log items.
835 */ 775 */
836static struct xfs_item_ops xfs_inode_item_ops = { 776static struct xfs_item_ops xfs_inode_item_ops = {
837 .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, 777 .iop_size = xfs_inode_item_size,
838 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 778 .iop_format = xfs_inode_item_format,
839 xfs_inode_item_format, 779 .iop_pin = xfs_inode_item_pin,
840 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 780 .iop_unpin = xfs_inode_item_unpin,
841 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 781 .iop_trylock = xfs_inode_item_trylock,
842 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 782 .iop_unlock = xfs_inode_item_unlock,
843 xfs_inode_item_unpin_remove, 783 .iop_committed = xfs_inode_item_committed,
844 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 784 .iop_push = xfs_inode_item_push,
845 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock, 785 .iop_pushbuf = xfs_inode_item_pushbuf,
846 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 786 .iop_committing = xfs_inode_item_committing
847 xfs_inode_item_committed,
848 .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push,
849 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
850 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
851 xfs_inode_item_committing
852}; 787};
853 788
854 789
@@ -857,25 +792,17 @@ static struct xfs_item_ops xfs_inode_item_ops = {
857 */ 792 */
858void 793void
859xfs_inode_item_init( 794xfs_inode_item_init(
860 xfs_inode_t *ip, 795 struct xfs_inode *ip,
861 xfs_mount_t *mp) 796 struct xfs_mount *mp)
862{ 797{
863 xfs_inode_log_item_t *iip; 798 struct xfs_inode_log_item *iip;
864 799
865 ASSERT(ip->i_itemp == NULL); 800 ASSERT(ip->i_itemp == NULL);
866 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 801 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
867 802
868 iip->ili_item.li_type = XFS_LI_INODE;
869 iip->ili_item.li_ops = &xfs_inode_item_ops;
870 iip->ili_item.li_mountp = mp;
871 iip->ili_item.li_ailp = mp->m_ail;
872 iip->ili_inode = ip; 803 iip->ili_inode = ip;
873 804 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
874 /* 805 &xfs_inode_item_ops);
875 We have zeroed memory. No need ...
876 iip->ili_extents_buf = NULL;
877 */
878
879 iip->ili_format.ilf_type = XFS_LI_INODE; 806 iip->ili_format.ilf_type = XFS_LI_INODE;
880 iip->ili_format.ilf_ino = ip->i_ino; 807 iip->ili_format.ilf_ino = ip->i_ino;
881 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 808 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
@@ -906,14 +833,14 @@ xfs_inode_item_destroy(
906 * from the AIL if it has not been re-logged, and unlocking the inode's 833 * from the AIL if it has not been re-logged, and unlocking the inode's
907 * flush lock. 834 * flush lock.
908 */ 835 */
909/*ARGSUSED*/
910void 836void
911xfs_iflush_done( 837xfs_iflush_done(
912 xfs_buf_t *bp, 838 struct xfs_buf *bp,
913 xfs_inode_log_item_t *iip) 839 struct xfs_log_item *lip)
914{ 840{
841 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
915 xfs_inode_t *ip = iip->ili_inode; 842 xfs_inode_t *ip = iip->ili_inode;
916 struct xfs_ail *ailp = iip->ili_item.li_ailp; 843 struct xfs_ail *ailp = lip->li_ailp;
917 844
918 /* 845 /*
919 * We only want to pull the item from the AIL if it is 846 * We only want to pull the item from the AIL if it is
@@ -924,12 +851,11 @@ xfs_iflush_done(
924 * the lock since it's cheaper, and then we recheck while 851 * the lock since it's cheaper, and then we recheck while
925 * holding the lock before removing the inode from the AIL. 852 * holding the lock before removing the inode from the AIL.
926 */ 853 */
927 if (iip->ili_logged && 854 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
928 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
929 spin_lock(&ailp->xa_lock); 855 spin_lock(&ailp->xa_lock);
930 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { 856 if (lip->li_lsn == iip->ili_flush_lsn) {
931 /* xfs_trans_ail_delete() drops the AIL lock. */ 857 /* xfs_trans_ail_delete() drops the AIL lock. */
932 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip); 858 xfs_trans_ail_delete(ailp, lip);
933 } else { 859 } else {
934 spin_unlock(&ailp->xa_lock); 860 spin_unlock(&ailp->xa_lock);
935 } 861 }
@@ -947,8 +873,6 @@ xfs_iflush_done(
947 * Release the inode's flush lock since we're done with it. 873 * Release the inode's flush lock since we're done with it.
948 */ 874 */
949 xfs_ifunlock(ip); 875 xfs_ifunlock(ip);
950
951 return;
952} 876}
953 877
954/* 878/*
@@ -964,10 +888,8 @@ xfs_iflush_abort(
964 xfs_inode_t *ip) 888 xfs_inode_t *ip)
965{ 889{
966 xfs_inode_log_item_t *iip = ip->i_itemp; 890 xfs_inode_log_item_t *iip = ip->i_itemp;
967 xfs_mount_t *mp;
968 891
969 iip = ip->i_itemp; 892 iip = ip->i_itemp;
970 mp = ip->i_mount;
971 if (iip) { 893 if (iip) {
972 struct xfs_ail *ailp = iip->ili_item.li_ailp; 894 struct xfs_ail *ailp = iip->ili_item.li_ailp;
973 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 895 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
@@ -998,10 +920,10 @@ xfs_iflush_abort(
998 920
999void 921void
1000xfs_istale_done( 922xfs_istale_done(
1001 xfs_buf_t *bp, 923 struct xfs_buf *bp,
1002 xfs_inode_log_item_t *iip) 924 struct xfs_log_item *lip)
1003{ 925{
1004 xfs_iflush_abort(iip->ili_inode); 926 xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
1005} 927}
1006 928
1007/* 929/*
@@ -1014,9 +936,8 @@ xfs_inode_item_format_convert(
1014 xfs_inode_log_format_t *in_f) 936 xfs_inode_log_format_t *in_f)
1015{ 937{
1016 if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { 938 if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
1017 xfs_inode_log_format_32_t *in_f32; 939 xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
1018 940
1019 in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
1020 in_f->ilf_type = in_f32->ilf_type; 941 in_f->ilf_type = in_f32->ilf_type;
1021 in_f->ilf_size = in_f32->ilf_size; 942 in_f->ilf_size = in_f32->ilf_size;
1022 in_f->ilf_fields = in_f32->ilf_fields; 943 in_f->ilf_fields = in_f32->ilf_fields;
@@ -1032,9 +953,8 @@ xfs_inode_item_format_convert(
1032 in_f->ilf_boffset = in_f32->ilf_boffset; 953 in_f->ilf_boffset = in_f32->ilf_boffset;
1033 return 0; 954 return 0;
1034 } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ 955 } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
1035 xfs_inode_log_format_64_t *in_f64; 956 xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
1036 957
1037 in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
1038 in_f->ilf_type = in_f64->ilf_type; 958 in_f->ilf_type = in_f64->ilf_type;
1039 in_f->ilf_size = in_f64->ilf_size; 959 in_f->ilf_size = in_f64->ilf_size;
1040 in_f->ilf_fields = in_f64->ilf_fields; 960 in_f->ilf_fields = in_f64->ilf_fields;
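Most of the fs/xfs/xfs_inode_item.c churn above is one mechanical change: each iop_* handler now takes the generic struct xfs_log_item * that the ops table is declared with, and recovers its own item via the new INODE_ITEM() helper, rather than being wedged into the table through function-pointer casts. Calling a function through a pointer of a different type is undefined behaviour in C, so this is a correctness fix as well as a readability one. The core pattern, lifted from the hunks above:

static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_inode_log_item, ili_item);
}

STATIC void
xfs_inode_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;

	/* body otherwise unchanged */
	atomic_inc(&ip->i_pincount);
}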
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9a467958ecdd..d3dee61e6d91 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -103,12 +103,6 @@ typedef struct xfs_inode_log_format_64 {
103 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ 103 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
104 XFS_ILOG_ABROOT) 104 XFS_ILOG_ABROOT)
105 105
106#define XFS_ILI_HOLD 0x1
107#define XFS_ILI_IOLOCKED_EXCL 0x2
108#define XFS_ILI_IOLOCKED_SHARED 0x4
109
110#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
111
112static inline int xfs_ilog_fbroot(int w) 106static inline int xfs_ilog_fbroot(int w)
113{ 107{
114 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); 108 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
@@ -137,7 +131,7 @@ typedef struct xfs_inode_log_item {
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
-	unsigned short		ili_flags;	   /* misc flags */
+	unsigned short		ili_lock_flags;	   /* lock flags */
 	unsigned short		ili_logged;	   /* flushed logged data */
 	unsigned int		ili_last_fields;   /* fields when flushed */
 	struct xfs_bmbt_rec	*ili_extents_buf;  /* array of logged
@@ -161,8 +155,8 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
 
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
 extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
-extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
+extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
 extern void xfs_iflush_abort(struct xfs_inode *);
 extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
 					 xfs_inode_log_format_t *);
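
xfs_iflush_done() and xfs_istale_done() now take the generic struct xfs_log_item * and recover the containing inode log item via INODE_ITEM(), the usual container-of idiom. A minimal user-space sketch of that idiom (types and names are simplified stand-ins, not the kernel's):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct log_item {                       /* stand-in for struct xfs_log_item */
            int             li_type;
    };

    struct inode_log_item {                 /* stand-in for xfs_inode_log_item_t */
            struct log_item ili_item;       /* generic part, embedded first */
            int             ili_fields;
    };

    /* stand-in for the INODE_ITEM() accessor */
    static struct inode_log_item *INODE_ITEM(struct log_item *lip)
    {
            return container_of(lip, struct inode_log_item, ili_item);
    }

    /* callback takes the generic item, like the new xfs_istale_done() */
    static void istale_done(struct log_item *lip)
    {
            printf("fields = %d\n", INODE_ITEM(lip)->ili_fields);
    }

    int main(void)
    {
            struct inode_log_item iip = { .ili_item = { .li_type = 1 },
                                          .ili_fields = 42 };

            istale_done(&iip.ili_item);     /* pass the embedded generic item */
            return 0;
    }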
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..20576146369f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -23,19 +23,14 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_btree.h" 34#include "xfs_btree.h"
40#include "xfs_bmap.h" 35#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 36#include "xfs_rtalloc.h"
@@ -55,71 +50,33 @@
 #define XFS_STRAT_WRITE_IMAPS	2
 #define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP
 
-STATIC int
-xfs_imap_to_bmap(
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	xfs_bmbt_irec_t *imap,
-	xfs_iomap_t	*iomapp,
-	int		imaps,			/* Number of imap entries */
-	int		iomaps,			/* Number of iomap entries */
-	int		flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		pbm;
-	xfs_fsblock_t	start_block;
-
-
-	for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
-		iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-		iomapp->iomap_delta = offset - iomapp->iomap_offset;
-		iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
-		iomapp->iomap_flags = flags;
-
-		if (XFS_IS_REALTIME_INODE(ip)) {
-			iomapp->iomap_flags |= IOMAP_REALTIME;
-			iomapp->iomap_target = mp->m_rtdev_targp;
-		} else {
-			iomapp->iomap_target = mp->m_ddev_targp;
-		}
-		start_block = imap->br_startblock;
-		if (start_block == HOLESTARTBLOCK) {
-			iomapp->iomap_bn = IOMAP_DADDR_NULL;
-			iomapp->iomap_flags |= IOMAP_HOLE;
-		} else if (start_block == DELAYSTARTBLOCK) {
-			iomapp->iomap_bn = IOMAP_DADDR_NULL;
-			iomapp->iomap_flags |= IOMAP_DELAY;
-		} else {
-			iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
-			if (ISUNWRITTEN(imap))
-				iomapp->iomap_flags |= IOMAP_UNWRITTEN;
-		}
-
-		offset += iomapp->iomap_bsize - iomapp->iomap_delta;
-	}
-	return pbm;	/* Return the number filled */
-}
+STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+				  int, struct xfs_bmbt_irec *, int *);
+STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
+				 struct xfs_bmbt_irec *, int *);
+STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+				    struct xfs_bmbt_irec *, int *);
 
 int
 xfs_iomap(
-	xfs_inode_t	*ip,
+	struct xfs_inode *ip,
 	xfs_off_t	offset,
 	ssize_t		count,
 	int		flags,
-	xfs_iomap_t	*iomapp,
-	int		*niomaps)
+	struct xfs_bmbt_irec *imap,
+	int		*nimaps,
+	int		*new)
 {
-	xfs_mount_t	*mp = ip->i_mount;
+	struct xfs_mount *mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb, end_fsb;
 	int		error = 0;
 	int		lockmode = 0;
-	xfs_bmbt_irec_t	imap;
-	int		nimaps = 1;
-	int		bmapi_flags = 0;
-	int		iomap_flags = 0;
+	int		bmapi_flags = 0;
 
 	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 
+	*new = 0;
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
@@ -160,8 +117,8 @@ xfs_iomap(
 
 	error = xfs_bmapi(NULL, ip, offset_fsb,
 			(xfs_filblks_t)(end_fsb - offset_fsb),
-			bmapi_flags,  NULL, 0, &imap,
-			&nimaps, NULL, NULL);
+			bmapi_flags,  NULL, 0, imap,
+			nimaps, NULL);
 
 	if (error)
 		goto out;
@@ -169,46 +126,41 @@ xfs_iomap(
 	switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
 	case BMAPI_WRITE:
 		/* If we found an extent, return it */
-		if (nimaps &&
-		    (imap.br_startblock != HOLESTARTBLOCK) &&
-		    (imap.br_startblock != DELAYSTARTBLOCK)) {
-			trace_xfs_iomap_found(ip, offset, count, flags, &imap);
+		if (*nimaps &&
+		    (imap->br_startblock != HOLESTARTBLOCK) &&
+		    (imap->br_startblock != DELAYSTARTBLOCK)) {
+			trace_xfs_iomap_found(ip, offset, count, flags, imap);
 			break;
 		}
 
-		if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
+		if (flags & BMAPI_DIRECT) {
 			error = xfs_iomap_write_direct(ip, offset, count, flags,
-						       &imap, &nimaps, nimaps);
+						       imap, nimaps);
 		} else {
 			error = xfs_iomap_write_delay(ip, offset, count, flags,
-						      &imap, &nimaps);
+						      imap, nimaps);
 		}
 		if (!error) {
-			trace_xfs_iomap_alloc(ip, offset, count, flags, &imap);
+			trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
 		}
-		iomap_flags = IOMAP_NEW;
+		*new = 1;
 		break;
 	case BMAPI_ALLOCATE:
 		/* If we found an extent, return it */
 		xfs_iunlock(ip, lockmode);
 		lockmode = 0;
 
-		if (nimaps && !isnullstartblock(imap.br_startblock)) {
-			trace_xfs_iomap_found(ip, offset, count, flags, &imap);
+		if (*nimaps && !isnullstartblock(imap->br_startblock)) {
+			trace_xfs_iomap_found(ip, offset, count, flags, imap);
 			break;
 		}
 
 		error = xfs_iomap_write_allocate(ip, offset, count,
-						 &imap, &nimaps);
+						 imap, nimaps);
 		break;
 	}
 
-	if (nimaps) {
-		*niomaps = xfs_imap_to_bmap(ip, offset, &imap,
-					    iomapp, nimaps, *niomaps, iomap_flags);
-	} else if (niomaps) {
-		*niomaps = 0;
-	}
+	ASSERT(*nimaps <= 1);
 
 out:
 	if (lockmode)
@@ -216,7 +168,6 @@ out:
 	return XFS_ERROR(error);
 }
 
-
 STATIC int
 xfs_iomap_eof_align_last_fsb(
 	xfs_mount_t	*mp,
@@ -285,15 +236,14 @@ xfs_cmn_err_fsblock_zero(
 	return EFSCORRUPTED;
 }
 
-int
+STATIC int
 xfs_iomap_write_direct(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
 	int		flags,
-	xfs_bmbt_irec_t *ret_imap,
-	int		*nmaps,
-	int		found)
+	xfs_bmbt_irec_t *imap,
+	int		*nmaps)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
@@ -306,7 +256,6 @@ xfs_iomap_write_direct(
 	int		quota_flag;
 	int		rt;
 	xfs_trans_t	*tp;
-	xfs_bmbt_irec_t imap;
 	xfs_bmap_free_t free_list;
 	uint		qblocks, resblks, resrtextents;
 	int		committed;
@@ -330,10 +279,10 @@ xfs_iomap_write_direct(
 		if (error)
 			goto error_out;
 	} else {
-		if (found && (ret_imap->br_startblock == HOLESTARTBLOCK))
+		if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
-					ret_imap->br_blockcount +
-					ret_imap->br_startoff);
+					imap->br_blockcount +
+					imap->br_startoff);
 	}
 	count_fsb = last_fsb - offset_fsb;
 	ASSERT(count_fsb > 0);
@@ -379,20 +328,22 @@ xfs_iomap_write_direct(
 	if (error)
 		goto error1;
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	bmapi_flag = XFS_BMAPI_WRITE;
 	if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
 		bmapi_flag |= XFS_BMAPI_PREALLOC;
 
 	/*
-	 * Issue the xfs_bmapi() call to allocate the blocks
+	 * Issue the xfs_bmapi() call to allocate the blocks.
+	 *
+	 * From this point onwards we overwrite the imap pointer that the
+	 * caller gave to us.
 	 */
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
-		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
+		&firstfsb, 0, imap, &nimaps, &free_list);
 	if (error)
 		goto error0;
 
@@ -414,12 +365,11 @@ xfs_iomap_write_direct(
 		goto error_out;
 	}
 
-	if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) {
-		error = xfs_cmn_err_fsblock_zero(ip, &imap);
+	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+		error = xfs_cmn_err_fsblock_zero(ip, imap);
 		goto error_out;
 	}
 
-	*ret_imap = imap;
 	*nmaps = 1;
 	return 0;
 
@@ -470,7 +420,7 @@ xfs_iomap_eof_want_preallocate(
 	imaps = nimaps;
 	firstblock = NULLFSBLOCK;
 	error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
-			  &firstblock, 0, imap, &imaps, NULL, NULL);
+			  &firstblock, 0, imap, &imaps, NULL);
 	if (error)
 		return error;
 	for (n = 0; n < imaps; n++) {
@@ -485,7 +435,7 @@ xfs_iomap_eof_want_preallocate(
 	return 0;
 }
 
-int
+STATIC int
 xfs_iomap_write_delay(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
@@ -545,7 +495,7 @@ retry:
 			  (xfs_filblks_t)(last_fsb - offset_fsb),
 			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
 			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-			  &nimaps, NULL, NULL);
+			  &nimaps, NULL);
 	if (error && (error != ENOSPC))
 		return XFS_ERROR(error);
 
@@ -588,12 +538,12 @@ retry:
  * We no longer bother to look at the incoming map - all we have to
  * guarantee is that whatever we allocate fills the required range.
  */
-int
+STATIC int
 xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
-	xfs_bmbt_irec_t *map,
+	xfs_bmbt_irec_t *imap,
 	int		*retmap)
 {
 	xfs_mount_t	*mp = ip->i_mount;
@@ -602,7 +552,6 @@ xfs_iomap_write_allocate(
 	xfs_fsblock_t	first_block;
 	xfs_bmap_free_t	free_list;
 	xfs_filblks_t	count_fsb;
-	xfs_bmbt_irec_t	imap;
 	xfs_trans_t	*tp;
 	int		nimaps, committed;
 	int		error = 0;
@@ -618,8 +567,8 @@ xfs_iomap_write_allocate(
 		return XFS_ERROR(error);
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-	count_fsb = map->br_blockcount;
-	map_start_fsb = map->br_startoff;
+	count_fsb = imap->br_blockcount;
+	map_start_fsb = imap->br_startoff;
 
 	XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
 
@@ -647,8 +596,7 @@ xfs_iomap_write_allocate(
 			return XFS_ERROR(error);
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		xfs_bmap_init(&free_list, &first_block);
 
@@ -699,10 +647,15 @@ xfs_iomap_write_allocate(
 			}
 		}
 
-		/* Go get the actual blocks */
+		/*
+		 * Go get the actual blocks.
+		 *
+		 * From this point onwards we overwrite the imap
+		 * pointer that the caller gave to us.
+		 */
 		error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
 				  XFS_BMAPI_WRITE, &first_block, 1,
-				  &imap, &nimaps, &free_list, NULL);
+				  imap, &nimaps, &free_list);
 		if (error)
 			goto trans_cancel;
 
@@ -721,13 +674,12 @@ xfs_iomap_write_allocate(
 		 * See if we were able to allocate an extent that
 		 * covers at least part of the callers request
 		 */
-		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
-			return xfs_cmn_err_fsblock_zero(ip, &imap);
+		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+			return xfs_cmn_err_fsblock_zero(ip, imap);
 
-		if ((offset_fsb >= imap.br_startoff) &&
-		    (offset_fsb < (imap.br_startoff +
-				   imap.br_blockcount))) {
-			*map = imap;
+		if ((offset_fsb >= imap->br_startoff) &&
+		    (offset_fsb < (imap->br_startoff +
+				   imap->br_blockcount))) {
 			*retmap = 1;
 			XFS_STATS_INC(xs_xstrat_quick);
 			return 0;
@@ -737,8 +689,8 @@ xfs_iomap_write_allocate(
 		 * So far we have not mapped the requested part of the
 		 * file, just surrounding data, try again.
 		 */
-		count_fsb -= imap.br_blockcount;
-		map_start_fsb = imap.br_startoff + imap.br_blockcount;
+		count_fsb -= imap->br_blockcount;
+		map_start_fsb = imap->br_startoff + imap->br_blockcount;
 	}
 
 trans_cancel:
@@ -811,8 +763,7 @@ xfs_iomap_write_unwritten(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	/*
 	 * Modify the unwritten extent state of the buffer.
@@ -821,7 +772,7 @@ xfs_iomap_write_unwritten(
 	nimaps = 1;
 	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
 			  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
-			  1, &imap, &nimaps, &free_list, NULL);
+			  1, &imap, &nimaps, &free_list);
 	if (error)
 		goto error_on_bmapi_transaction;
 
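
The rewrite above drops the intermediate xfs_iomap_t: callers now pass in their own struct xfs_bmbt_irec plus an nimaps count and a "new" flag, and the write paths fill that storage in place instead of copying through a translation step. A compact sketch of the out-parameter shape (simplified stand-in types, not the kernel API):

    #include <stdio.h>

    struct bmbt_irec {              /* stand-in for struct xfs_bmbt_irec */
            long    br_startoff;
            long    br_startblock;
            long    br_blockcount;
    };

    /*
     * Mirrors the new xfs_iomap() contract: the caller owns imap/nimaps/new
     * and the function overwrites them, rather than returning a second,
     * translated mapping structure.
     */
    static int iomap(long offset, long count,
                     struct bmbt_irec *imap, int *nimaps, int *new)
    {
            *new = 0;

            if (*nimaps < 1)
                    return -1;

            /* pretend we had to allocate: fill the caller's mapping in place */
            imap->br_startoff = offset;
            imap->br_startblock = 1000 + offset;
            imap->br_blockcount = count;
            *nimaps = 1;
            *new = 1;               /* replaces the old IOMAP_NEW flag */
            return 0;
    }

    int main(void)
    {
            struct bmbt_irec imap;
            int nimaps = 1, new;

            if (iomap(8, 16, &imap, &nimaps, &new) == 0)
                    printf("maps=%d new=%d startblock=%ld\n",
                           nimaps, new, imap.br_startblock);
            return 0;
    }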
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..7748a430f50d 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,16 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
+/* base extent manipulation calls */
+#define BMAPI_READ	(1 << 0)	/* read extents */
+#define BMAPI_WRITE	(1 << 1)	/* create extents */
+#define BMAPI_ALLOCATE	(1 << 2)	/* delayed allocate to real extents */
 
-
-typedef enum {			/* iomap_flags values */
-	IOMAP_READ =		0,	/* mapping for a read */
-	IOMAP_HOLE =		0x02,	/* mapping covers a hole */
-	IOMAP_DELAY =		0x04,	/* mapping covers delalloc region */
-	IOMAP_REALTIME =	0x10,	/* mapping on the realtime device */
-	IOMAP_UNWRITTEN =	0x20,	/* mapping covers allocated */
-					/* but uninitialized file data */
-	IOMAP_NEW =		0x40	/* just allocate */
-} iomap_flags_t;
-
-typedef enum {
-	/* base extent manipulation calls */
-	BMAPI_READ = (1 << 0),		/* read extents */
-	BMAPI_WRITE = (1 << 1),		/* create extents */
-	BMAPI_ALLOCATE = (1 << 2),	/* delayed allocate to real extents */
-	/* modifiers */
-	BMAPI_IGNSTATE = (1 << 4),	/* ignore unwritten state on read */
-	BMAPI_DIRECT = (1 << 5),	/* direct instead of buffered write */
-	BMAPI_MMAP = (1 << 6),		/* allocate for mmap write */
-	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
-} bmapi_flags_t;
+/* modifiers */
+#define BMAPI_IGNSTATE	(1 << 4)	/* ignore unwritten state on read */
+#define BMAPI_DIRECT	(1 << 5)	/* direct instead of buffered write */
+#define BMAPI_MMAP	(1 << 6)	/* allocate for mmap write */
+#define BMAPI_TRYLOCK	(1 << 7)	/* non-blocking request */
 
 #define BMAPI_FLAGS \
 	{ BMAPI_READ,	"READ" }, \
@@ -49,46 +35,13 @@ typedef enum {
 	{ BMAPI_ALLOCATE,	"ALLOCATE" }, \
 	{ BMAPI_IGNSTATE,	"IGNSTATE" }, \
 	{ BMAPI_DIRECT,		"DIRECT" }, \
-	{ BMAPI_MMAP,		"MMAP" }, \
 	{ BMAPI_TRYLOCK,	"TRYLOCK" }
 
-/*
- * xfs_iomap_t:  File system I/O map
- *
- * The iomap_bn field is expressed in 512-byte blocks, and is where the
- * mapping starts on disk.
- *
- * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
- * iomap_offset is the offset of the mapping in the file itself.
- * iomap_bsize is the size of the mapping,  iomap_delta is the
- * desired data's offset into the mapping, given the offset supplied
- * to the file I/O map routine.
- *
- * When a request is made to read beyond the logical end of the object,
- * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
- * to the actual amount of underlying storage that has been allocated, if any.
- */
-
-typedef struct xfs_iomap {
-	xfs_daddr_t		iomap_bn;	/* first 512B blk of mapping */
-	xfs_buftarg_t		*iomap_target;
-	xfs_off_t		iomap_offset;	/* offset of mapping, bytes */
-	xfs_off_t		iomap_bsize;	/* size of mapping, bytes */
-	xfs_off_t		iomap_delta;	/* offset into mapping, bytes */
-	iomap_flags_t		iomap_flags;
-} xfs_iomap_t;
-
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
 extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
-		     struct xfs_iomap *, int *);
-extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-				  int, struct xfs_bmbt_irec *, int *, int);
-extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
-				 struct xfs_bmbt_irec *, int *);
-extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
-				    struct xfs_bmbt_irec *, int *);
+		     struct xfs_bmbt_irec *, int *, int *);
 extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
 
 #endif	/* __XFS_IOMAP_H__*/
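
With the iomap_flags_t/bmapi_flags_t enums gone, the BMAPI_* values are plain bit defines that combine and test with ordinary mask operations, as the switch on (BMAPI_WRITE|BMAPI_ALLOCATE) in xfs_iomap() does. A small standalone sketch of that style (flag values copied from the header above; the dispatch logic is illustrative only):

    #include <stdio.h>

    /* base extent manipulation calls (values as in the new xfs_iomap.h) */
    #define BMAPI_READ      (1 << 0)
    #define BMAPI_WRITE     (1 << 1)
    #define BMAPI_ALLOCATE  (1 << 2)

    /* modifiers */
    #define BMAPI_IGNSTATE  (1 << 4)
    #define BMAPI_DIRECT    (1 << 5)
    #define BMAPI_TRYLOCK   (1 << 7)

    static void dispatch(int flags)
    {
            /* same shape as the switch in xfs_iomap() */
            switch (flags & (BMAPI_WRITE | BMAPI_ALLOCATE)) {
            case BMAPI_WRITE:
                    if (flags & BMAPI_DIRECT)
                            printf("direct write path\n");
                    else
                            printf("buffered (delayed) write path\n");
                    break;
            case BMAPI_ALLOCATE:
                    printf("convert delayed allocation\n");
                    break;
            default:
                    printf("read path\n");
            }
    }

    int main(void)
    {
            dispatch(BMAPI_WRITE | BMAPI_DIRECT);
            dispatch(BMAPI_ALLOCATE);
            dispatch(BMAPI_READ | BMAPI_IGNSTATE);
            return 0;
    }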
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b1b801e4a28e..7e3626e5925c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,20 +24,17 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_ialloc.h" 33#include "xfs_ialloc.h"
38#include "xfs_itable.h" 34#include "xfs_itable.h"
39#include "xfs_error.h" 35#include "xfs_error.h"
40#include "xfs_btree.h" 36#include "xfs_btree.h"
37#include "xfs_trace.h"
41 38
42STATIC int 39STATIC int
43xfs_internal_inum( 40xfs_internal_inum(
@@ -49,24 +46,40 @@ xfs_internal_inum(
 	       (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
 }
 
-STATIC int
-xfs_bulkstat_one_iget(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
-	xfs_bstat_t	*buf,		/* return buffer */
-	int		*stat)		/* BULKSTAT_RV_... */
+/*
+ * Return stat information for one inode.
+ * Return 0 if ok, else errno.
+ */
+int
+xfs_bulkstat_one_int(
+	struct xfs_mount	*mp,		/* mount point for filesystem */
+	xfs_ino_t		ino,		/* inode to get data for */
+	void __user		*buffer,	/* buffer to place output in */
+	int			ubsize,		/* size of buffer */
+	bulkstat_one_fmt_pf	formatter,	/* formatter, copy to user */
+	int			*ubused,	/* bytes used by me */
+	int			*stat)		/* BULKSTAT_RV_... */
 {
-	xfs_icdinode_t	*dic;		/* dinode core info pointer */
-	xfs_inode_t	*ip;		/* incore inode pointer */
+	struct xfs_icdinode	*dic;		/* dinode core info pointer */
+	struct xfs_inode	*ip;		/* incore inode pointer */
 	struct inode	*inode;
-	int		error;
+	struct xfs_bstat	*buf;		/* return buffer */
+	int			error = 0;	/* error value */
+
+	*stat = BULKSTAT_RV_NOTHING;
+
+	if (!buffer || xfs_internal_inum(mp, ino))
+		return XFS_ERROR(EINVAL);
+
+	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+	if (!buf)
+		return XFS_ERROR(ENOMEM);
 
 	error = xfs_iget(mp, NULL, ino,
-			 XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
+			 XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip);
 	if (error) {
 		*stat = BULKSTAT_RV_NOTHING;
-		return error;
+		goto out_free;
 	}
 
 	ASSERT(ip != NULL);
@@ -127,77 +140,17 @@ xfs_bulkstat_one_iget(
 		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
 		break;
 	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	IRELE(ip);
 
-	xfs_iput(ip, XFS_ILOCK_SHARED);
-	return error;
-}
-
-STATIC void
-xfs_bulkstat_one_dinode(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	xfs_dinode_t	*dic,		/* dinode inode pointer */
-	xfs_bstat_t	*buf)		/* return buffer */
-{
-	/*
-	 * The inode format changed when we moved the link count and
-	 * made it 32 bits long.  If this is an old format inode,
-	 * convert it in memory to look like a new one.  If it gets
-	 * flushed to disk we will convert back before flushing or
-	 * logging it.  We zero out the new projid field and the old link
-	 * count field.  We'll handle clearing the pad field (the remains
-	 * of the old uuid field) when we actually convert the inode to
-	 * the new format. We don't change the version number so that we
-	 * can distinguish this from a real new format inode.
-	 */
-	if (dic->di_version == 1) {
-		buf->bs_nlink = be16_to_cpu(dic->di_onlink);
-		buf->bs_projid = 0;
-	} else {
-		buf->bs_nlink = be32_to_cpu(dic->di_nlink);
-		buf->bs_projid = be16_to_cpu(dic->di_projid);
-	}
+	error = formatter(buffer, ubsize, ubused, buf);
 
-	buf->bs_ino = ino;
-	buf->bs_mode = be16_to_cpu(dic->di_mode);
-	buf->bs_uid = be32_to_cpu(dic->di_uid);
-	buf->bs_gid = be32_to_cpu(dic->di_gid);
-	buf->bs_size = be64_to_cpu(dic->di_size);
-	buf->bs_atime.tv_sec = be32_to_cpu(dic->di_atime.t_sec);
-	buf->bs_atime.tv_nsec = be32_to_cpu(dic->di_atime.t_nsec);
-	buf->bs_mtime.tv_sec = be32_to_cpu(dic->di_mtime.t_sec);
-	buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
-	buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
-	buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
-	buf->bs_xflags = xfs_dic2xflags(dic);
-	buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
-	buf->bs_extents = be32_to_cpu(dic->di_nextents);
-	buf->bs_gen = be32_to_cpu(dic->di_gen);
-	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
-	buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
-	buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
-	buf->bs_aextents = be16_to_cpu(dic->di_anextents);
-	buf->bs_forkoff = XFS_DFORK_BOFF(dic);
+	if (!error)
+		*stat = BULKSTAT_RV_DIDONE;
 
-	switch (dic->di_format) {
-	case XFS_DINODE_FMT_DEV:
-		buf->bs_rdev = xfs_dinode_get_rdev(dic);
-		buf->bs_blksize = BLKDEV_IOSIZE;
-		buf->bs_blocks = 0;
-		break;
-	case XFS_DINODE_FMT_LOCAL:
-	case XFS_DINODE_FMT_UUID:
-		buf->bs_rdev = 0;
-		buf->bs_blksize = mp->m_sb.sb_blocksize;
-		buf->bs_blocks = 0;
-		break;
-	case XFS_DINODE_FMT_EXTENTS:
-	case XFS_DINODE_FMT_BTREE:
-		buf->bs_rdev = 0;
-		buf->bs_blksize = mp->m_sb.sb_blocksize;
-		buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
-		break;
-	}
+ out_free:
+	kmem_free(buf);
+	return error;
 }
 
 /* Return 0 on success or positive error */
@@ -217,118 +170,17 @@ xfs_bulkstat_one_fmt(
 	return 0;
 }
 
-/*
- * Return stat information for one inode.
- * Return 0 if ok, else errno.
- */
-int				/* error status */
-xfs_bulkstat_one_int(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* buffer to place output in */
-	int		ubsize,		/* size of buffer */
-	bulkstat_one_fmt_pf formatter,	/* formatter, copy to user */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
-	int		*ubused,	/* bytes used by me */
-	void		*dibuff,	/* on-disk inode buffer */
-	int		*stat)		/* BULKSTAT_RV_... */
-{
-	xfs_bstat_t	*buf;		/* return buffer */
-	int		error = 0;	/* error value */
-	xfs_dinode_t	*dip;		/* dinode inode pointer */
-
-	dip = (xfs_dinode_t *)dibuff;
-	*stat = BULKSTAT_RV_NOTHING;
-
-	if (!buffer || xfs_internal_inum(mp, ino))
-		return XFS_ERROR(EINVAL);
-
-	buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
-
-	if (dip == NULL) {
-		/* We're not being passed a pointer to a dinode.  This happens
-		 * if BULKSTAT_FG_IGET is selected.  Do the iget.
-		 */
-		error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat);
-		if (error)
-			goto out_free;
-	} else {
-		xfs_bulkstat_one_dinode(mp, ino, dip, buf);
-	}
-
-	error = formatter(buffer, ubsize, ubused, buf);
-	if (error)
-		goto out_free;
-
-	*stat = BULKSTAT_RV_DIDONE;
-
- out_free:
-	kmem_free(buf);
-	return error;
-}
-
 int
 xfs_bulkstat_one(
 	xfs_mount_t	*mp,		/* mount point for filesystem */
 	xfs_ino_t	ino,		/* inode number to get data for */
 	void		__user *buffer,	/* buffer to place output in */
 	int		ubsize,		/* size of buffer */
-	void		*private_data,	/* my private data */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
 	int		*ubused,	/* bytes used by me */
-	void		*dibuff,	/* on-disk inode buffer */
 	int		*stat)		/* BULKSTAT_RV_... */
 {
 	return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
-				    xfs_bulkstat_one_fmt, bno,
-				    ubused, dibuff, stat);
-}
-
-/*
- * Test to see whether we can use the ondisk inode directly, based
- * on the given bulkstat flags, filling in dipp accordingly.
- * Returns zero if the inode is dodgey.
- */
-STATIC int
-xfs_bulkstat_use_dinode(
-	xfs_mount_t	*mp,
-	int		flags,
-	xfs_buf_t	*bp,
-	int		clustidx,
-	xfs_dinode_t	**dipp)
-{
-	xfs_dinode_t	*dip;
-	unsigned int	aformat;
-
-	*dipp = NULL;
-	if (!bp || (flags & BULKSTAT_FG_IGET))
-		return 1;
-	dip = (xfs_dinode_t *)
-			xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog);
-	/*
-	 * Check the buffer containing the on-disk inode for di_mode == 0.
-	 * This is to prevent xfs_bulkstat from picking up just reclaimed
-	 * inodes that have their in-core state initialized but not flushed
-	 * to disk yet. This is a temporary hack that would require a proper
-	 * fix in the future.
-	 */
-	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
-	    !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
-	    !dip->di_mode)
-		return 0;
-	if (flags & BULKSTAT_FG_QUICK) {
-		*dipp = dip;
-		return 1;
-	}
-	/* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
-	aformat = dip->di_aformat;
-	if ((XFS_DFORK_Q(dip) == 0) ||
-	    (aformat == XFS_DINODE_FMT_LOCAL) ||
-	    (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
-		*dipp = dip;
-		return 1;
-	}
-	return 1;
+				    xfs_bulkstat_one_fmt, ubused, stat);
 }
 
 #define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
@@ -342,10 +194,8 @@ xfs_bulkstat(
 	xfs_ino_t		*lastinop, /* last inode returned */
 	int			*ubcountp, /* size of buffer/count returned */
 	bulkstat_one_pf		formatter, /* func that'd fill a single buf */
-	void			*private_data,/* private data for formatter */
 	size_t			statstruct_size, /* sizeof struct filling */
 	char			__user *ubuffer, /* buffer with inode stats */
-	int			flags,	/* defined in xfs_itable.h */
 	int			*done)	/* 1 if there are more stats to get */
 {
 	xfs_agblock_t		agbno=0;/* allocation group block number */
@@ -380,14 +230,12 @@ xfs_bulkstat(
 	int			ubelem;	/* spaces used in user's buffer */
 	int			ubused;	/* bytes used by formatter */
 	xfs_buf_t		*bp;	/* ptr to on-disk inode cluster buf */
-	xfs_dinode_t		*dip;	/* ptr into bp for specific inode */
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
 	 */
 	ino = (xfs_ino_t)*lastinop;
 	lastino = ino;
-	dip = NULL;
 	agno = XFS_INO_TO_AGNO(mp, ino);
 	agino = XFS_INO_TO_AGINO(mp, ino);
 	if (agno >= mp->m_sb.sb_agcount ||
@@ -612,37 +460,6 @@ xfs_bulkstat(
 						irbp->ir_startino) +
 						((chunkidx & nimask) >>
 						 mp->m_sb.sb_inopblog);
-
-					if (flags & (BULKSTAT_FG_QUICK |
-						     BULKSTAT_FG_INLINE)) {
-						int offset;
-
-						ino = XFS_AGINO_TO_INO(mp, agno,
-								       agino);
-						bno = XFS_AGB_TO_DADDR(mp, agno,
-								       agbno);
-
-						/*
-						 * Get the inode cluster buffer
-						 */
-						if (bp)
-							xfs_buf_relse(bp);
-
-						error = xfs_inotobp(mp, NULL, ino, &dip,
-								    &bp, &offset,
-								    XFS_IGET_BULKSTAT);
-
-						if (!error)
-							clustidx = offset / mp->m_sb.sb_inodesize;
-						if (XFS_TEST_ERROR(error != 0,
-								   mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
-								   XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
-							bp = NULL;
-							ubleft = 0;
-							rval = error;
-							break;
-						}
-					}
 				}
 				ino = XFS_AGINO_TO_INO(mp, agno, agino);
 				bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
@@ -658,35 +475,13 @@ xfs_bulkstat(
 				 * when the chunk is used up.
 				 */
 				irbp->ir_freecount++;
-				if (!xfs_bulkstat_use_dinode(mp, flags, bp,
-							     clustidx, &dip)) {
-					lastino = ino;
-					continue;
-				}
-				/*
-				 * If we need to do an iget, cannot hold bp.
-				 * Drop it, until starting the next cluster.
-				 */
-				if ((flags & BULKSTAT_FG_INLINE) && !dip) {
-					if (bp)
-						xfs_buf_relse(bp);
-					bp = NULL;
-				}
 
 				/*
 				 * Get the inode and fill in a single buffer.
-				 * BULKSTAT_FG_QUICK uses dip to fill it in.
-				 * BULKSTAT_FG_IGET uses igets.
-				 * BULKSTAT_FG_INLINE uses dip if we have an
-				 * inline attr fork, else igets.
-				 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
-				 * This is also used to count inodes/blks, etc
-				 * in xfs_qm_quotacheck.
 				 */
 				ubused = statstruct_size;
-				error = formatter(mp, ino, ubufp,
-						  ubleft, private_data,
-						  bno, &ubused, dip, &fmterror);
+				error = formatter(mp, ino, ubufp, ubleft,
+						  &ubused, &fmterror);
 				if (fmterror == BULKSTAT_RV_NOTHING) {
 					if (error && error != ENOENT &&
 					    error != EINVAL) {
@@ -778,8 +573,7 @@ xfs_bulkstat_single(
 	 */
 
 	ino = (xfs_ino_t)*lastinop;
-	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
-				 NULL, 0, NULL, NULL, &res);
+	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res);
 	if (error) {
 		/*
 		 * Special case way failed, do it the "long" way
@@ -788,8 +582,7 @@ xfs_bulkstat_single(
 		(*lastinop)--;
 		count = 1;
 		if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
-				 NULL, sizeof(xfs_bstat_t), buffer,
-				 BULKSTAT_FG_IGET, done))
+				 sizeof(xfs_bstat_t), buffer, done))
 			return error;
 		if (count == 0 || (xfs_ino_t)*lastinop != ino)
 			return error == EFSCORRUPTED ?
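
The consolidated xfs_bulkstat_one_int() above allocates its staging buffer with KM_MAYFAIL and funnels every exit through a single out_free label. A stripped-down sketch of that allocate/look-up/format/free flow (user-space stand-ins, not the kernel helpers):

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    struct bstat { long ino; long size; };   /* stand-in for xfs_bstat_t */

    typedef int (*fmt_pf)(void *ubuf, int ubsize, int *ubused,
                          const struct bstat *buf);

    static int bulkstat_one_int(long ino, void *ubuf, int ubsize,
                                fmt_pf formatter, int *ubused, int *done)
    {
            struct bstat    *buf;
            int             error = 0;

            *done = 0;

            buf = malloc(sizeof(*buf));     /* like kmem_alloc(KM_MAYFAIL) */
            if (!buf)
                    return -1;

            /* stand-in for the untrusted inode lookup + stat fill */
            buf->ino = ino;
            buf->size = 4096;

            error = formatter(ubuf, ubsize, ubused, buf);
            if (!error)
                    *done = 1;              /* like BULKSTAT_RV_DIDONE */

            free(buf);                      /* single exit frees the staging buffer */
            return error;
    }

    static int fmt(void *ubuf, int ubsize, int *ubused, const struct bstat *buf)
    {
            if (ubsize < (int)sizeof(*buf))
                    return -1;
            memcpy(ubuf, buf, sizeof(*buf));
            *ubused = sizeof(*buf);
            return 0;
    }

    int main(void)
    {
            struct bstat out; int used, done;

            if (!bulkstat_one_int(128, &out, sizeof(out), fmt, &used, &done))
                    printf("ino=%ld used=%d done=%d\n", out.ino, used, done);
            return 0;
    }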
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 20792bf45946..97295d91d170 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -27,10 +27,7 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
 	xfs_ino_t	ino,
 	void		__user *buffer,
 	int		ubsize,
-	void		*private_data,
-	xfs_daddr_t	bno,
 	int		*ubused,
-	void		*dip,
 	int		*stat);
 
 /*
@@ -41,13 +38,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
 #define BULKSTAT_RV_GIVEUP	2
 
 /*
- * Values for bulkstat flag argument.
- */
-#define BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
-#define BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
-#define BULKSTAT_FG_INLINE	0x4	/* No iget if inline attrs */
-
-/*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
 int					/* error status */
@@ -56,10 +46,8 @@ xfs_bulkstat(
 	xfs_ino_t	*lastino,	/* last inode returned */
 	int		*count,		/* size of buffer/count returned */
 	bulkstat_one_pf formatter,	/* func that'd fill a single buf */
-	void		*private_data,	/* private data for formatter */
 	size_t		statstruct_size,/* sizeof struct that we're filling */
 	char		__user *ubuffer,/* buffer with inode stats */
-	int		flags,		/* flag to control access method */
 	int		*done);		/* 1 if there are more stats to get */
 
 int
@@ -82,9 +70,7 @@ xfs_bulkstat_one_int(
 	void		__user *buffer,
 	int		ubsize,
 	bulkstat_one_fmt_pf formatter,
-	xfs_daddr_t	bno,
 	int		*ubused,
-	void		*dibuff,
 	int		*stat);
 
 int
@@ -93,10 +79,7 @@ xfs_bulkstat_one(
 	xfs_ino_t	ino,
 	void		__user *buffer,
 	int		ubsize,
-	void		*private_data,
-	xfs_daddr_t	bno,
 	int		*ubused,
-	void		*dibuff,
 	int		*stat);
 
 typedef int (*inumbers_fmt_pf)(
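
The trimmed bulkstat_one_pf typedef above loses the private_data, bno and dip parameters, so every formatter now matches one lean shape. A sketch of declaring and calling through such a callback type (simplified user-space types, mp and __user dropped for brevity):

    #include <stdio.h>

    /* shape of the trimmed callback, simplified */
    typedef int (*bulkstat_one_pf)(long ino, void *buffer, int ubsize,
                                   int *ubused, int *stat);

    static int bulkstat_one(long ino, void *buffer, int ubsize,
                            int *ubused, int *stat)
    {
            (void)buffer; (void)ubsize;
            *ubused = 64;
            *stat = 1;                      /* like BULKSTAT_RV_DIDONE */
            printf("stat ino %ld\n", ino);
            return 0;
    }

    /* the walker only needs the function pointer, no private_data cookie */
    static int walk(long first, long last, bulkstat_one_pf formatter)
    {
            char buf[64];
            int used, stat;

            for (long ino = first; ino <= last; ino++)
                    if (formatter(ino, buf, sizeof(buf), &used, &stat))
                            return -1;
            return 0;
    }

    int main(void)
    {
            return walk(100, 102, bulkstat_one);
    }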
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2be019136287..33f718f92a48 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,8 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_error.h" 28#include "xfs_error.h"
31#include "xfs_log_priv.h" 29#include "xfs_log_priv.h"
@@ -35,8 +33,6 @@
35#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
36#include "xfs_log_recover.h" 34#include "xfs_log_recover.h"
37#include "xfs_trans_priv.h" 35#include "xfs_trans_priv.h"
38#include "xfs_dir2_sf.h"
39#include "xfs_attr_sf.h"
40#include "xfs_dinode.h" 36#include "xfs_dinode.h"
41#include "xfs_inode.h" 37#include "xfs_inode.h"
42#include "xfs_rw.h" 38#include "xfs_rw.h"
@@ -44,13 +40,8 @@
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
-#define xlog_write_adv_cnt(ptr, len, off, bytes) \
-	{ (ptr) += (bytes); \
-	  (len) -= (bytes); \
-	  (off) += (bytes);}
-
 /* Local miscellaneous function prototypes */
-STATIC int	 xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
+STATIC int	 xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
 				    xlog_in_core_t **, xfs_lsn_t *);
 STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
 				xfs_buftarg_t	*log_target,
@@ -59,11 +50,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
 STATIC int	 xlog_space_left(xlog_t *log, int cycle, int bytes);
 STATIC int	 xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
 STATIC void	 xlog_dealloc_log(xlog_t *log);
-STATIC int	 xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
-		    int nentries, struct xlog_ticket *tic,
-		    xfs_lsn_t *start_lsn,
-		    xlog_in_core_t **commit_iclog,
-		    uint flags);
 
 /* local state machine functions */
 STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -93,16 +79,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
 STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 				   xlog_ticket_t *ticket);
 
-
-/* local ticket functions */
-STATIC xlog_ticket_t	*xlog_ticket_alloc(xlog_t *log,
-					 int	unit_bytes,
-					 int	count,
-					 char	clientid,
-					 uint	flags);
-
 #if defined(DEBUG)
-STATIC void	xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
+STATIC void	xlog_verify_dest_ptr(xlog_t *log, char *ptr);
 STATIC void	xlog_verify_grant_head(xlog_t *log, int equals);
 STATIC void	xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
 				  int count, boolean_t syncing);
@@ -258,7 +236,7 @@ xfs_log_done(
 	 * If we get an error, just continue and give back the log ticket.
 	 */
 	(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
-	 (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
+	 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
 		lsn = (xfs_lsn_t) -1;
 		if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
 			flags |= XFS_LOG_REL_PERM_RESERV;
@@ -355,7 +333,6 @@ xfs_log_reserve(
 	int		retval = 0;
 
 	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return XFS_ERROR(EIO);
@@ -367,6 +344,15 @@ xfs_log_reserve(
 		ASSERT(flags & XFS_LOG_PERM_RESERV);
 		internal_ticket = *ticket;
 
+		/*
+		 * this is a new transaction on the ticket, so we need to
+		 * change the transaction ID so that the next transaction has a
+		 * different TID in the log. Just add one to the existing tid
+		 * so that we can see chains of rolling transactions in the log
+		 * easily.
+		 */
+		internal_ticket->t_tid++;
+
 		trace_xfs_log_reserve(log, internal_ticket);
 
 		xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -374,7 +360,8 @@ xfs_log_reserve(
 	} else {
 		/* may sleep if need to allocate more tickets */
 		internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
-						  client, flags);
+						  client, flags,
+						  KM_SLEEP|KM_MAYFAIL);
 		if (!internal_ticket)
 			return XFS_ERROR(ENOMEM);
 		internal_ticket->t_trans_type = t_type;
@@ -459,6 +446,13 @@ xfs_log_mount(
 	/* Normal transactions can now occur */
 	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
+	/*
+	 * Now the log has been fully initialised and we know were our
+	 * space grant counters are, we can initialise the permanent ticket
+	 * needed for delayed logging to work.
+	 */
+	xlog_cil_init_post_recovery(mp->m_log);
+
 	return 0;
 
 out_destroy_ail:
@@ -516,18 +510,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 #ifdef DEBUG
 	xlog_in_core_t	 *first_iclog;
 #endif
-	xfs_log_iovec_t  reg[1];
 	xlog_ticket_t	*tic = NULL;
 	xfs_lsn_t	 lsn;
 	int		 error;
 
-	/* the data section must be 32 bit size aligned */
-	struct {
-	    __uint16_t magic;
-	    __uint16_t pad1;
-	    __uint32_t pad2; /* may as well make it 64 bits */
-	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
-
 	/*
 	 * Don't write out unmount record on read-only mounts.
 	 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +535,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	} while (iclog != first_iclog);
 #endif
 	if (! (XLOG_FORCED_SHUTDOWN(log))) {
-		reg[0].i_addr = (void*)&magic;
-		reg[0].i_len  = sizeof(magic);
-		reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
-
 		error = xfs_log_reserve(mp, 600, 1, &tic,
 					XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
 		if (!error) {
+			/* the data section must be 32 bit size aligned */
+			struct {
+			    __uint16_t magic;
+			    __uint16_t pad1;
+			    __uint32_t pad2; /* may as well make it 64 bits */
+			} magic = {
+				.magic = XLOG_UNMOUNT_TYPE,
+			};
+			struct xfs_log_iovec reg = {
+				.i_addr = &magic,
+				.i_len = sizeof(magic),
+				.i_type = XLOG_REG_TYPE_UNMOUNT,
+			};
+			struct xfs_log_vec vec = {
+				.lv_niovecs = 1,
+				.lv_iovecp = &reg,
+			};
+
 			/* remove inited flag */
-			((xlog_ticket_t *)tic)->t_flags = 0;
-			error = xlog_write(mp, reg, 1, tic, &lsn,
+			tic->t_flags = 0;
+			error = xlog_write(log, &vec, tic, &lsn,
 					   NULL, XLOG_UNMOUNT_TRANS);
 			/*
 			 * At this point, we're umounting anyway,
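
The hunk above swaps field-by-field assignment of a reg[1] array for on-stack structs with designated initializers, chaining an xfs_log_iovec into an xfs_log_vec. A tiny standalone illustration of that initializer style (local stand-in types, not the kernel's):

    #include <stdio.h>
    #include <stdint.h>

    struct log_iovec {              /* stand-in for struct xfs_log_iovec */
            void            *i_addr;
            int             i_len;
            int             i_type;
    };

    struct log_vec {                /* stand-in for struct xfs_log_vec */
            int                     lv_niovecs;
            struct log_iovec        *lv_iovecp;
    };

    int main(void)
    {
            /* the data section must be 32 bit size aligned */
            struct {
                    uint16_t magic;
                    uint16_t pad1;
                    uint32_t pad2;
            } magic = {
                    .magic = 0x554e,        /* stand-in for XLOG_UNMOUNT_TYPE */
            };                              /* unnamed fields are zeroed implicitly */
            struct log_iovec reg = {
                    .i_addr = &magic,
                    .i_len = sizeof(magic),
                    .i_type = 1,
            };
            struct log_vec vec = {
                    .lv_niovecs = 1,
                    .lv_iovecp = &reg,
            };

            printf("%d iovec(s), first is %d bytes\n",
                   vec.lv_niovecs, vec.lv_iovecp[0].i_len);
            return 0;
    }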
@@ -648,10 +648,30 @@ xfs_log_unmount(xfs_mount_t *mp)
 	xlog_dealloc_log(mp->m_log);
 }
 
+void
+xfs_log_item_init(
+	struct xfs_mount	*mp,
+	struct xfs_log_item	*item,
+	int			type,
+	struct xfs_item_ops	*ops)
+{
+	item->li_mountp = mp;
+	item->li_ailp = mp->m_ail;
+	item->li_type = type;
+	item->li_ops = ops;
+	item->li_lv = NULL;
+
+	INIT_LIST_HEAD(&item->li_ail);
+	INIT_LIST_HEAD(&item->li_cil);
+}
+
 /*
  * Write region vectors to log. The write happens using the space reservation
  * of the ticket (tic).  It is not a requirement that all writes for a given
- * transaction occur with one call to xfs_log_write().
+ * transaction occur with one call to xfs_log_write(). However, it is important
+ * to note that the transaction reservation code makes an assumption about the
+ * number of log headers a transaction requires that may be violated if you
+ * don't pass all the transaction vectors in one call....
  */
 int
 xfs_log_write(
@@ -663,11 +683,15 @@ xfs_log_write(
 {
 	struct log		*log = mp->m_log;
 	int			error;
+	struct xfs_log_vec	vec = {
+		.lv_niovecs = nentries,
+		.lv_iovecp = reg,
+	};
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return XFS_ERROR(EIO);
 
-	error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
+	error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
 	if (error)
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 	return error;
@@ -1018,8 +1042,8 @@ xlog_alloc_log(xfs_mount_t *mp,
 	xlog_in_core_t		*iclog, *prev_iclog=NULL;
 	xfs_buf_t		*bp;
 	int			i;
-	int			iclogsize;
 	int			error = ENOMEM;
+	uint			log2_size = 0;
 
 	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
 	if (!log) {
@@ -1045,29 +1069,30 @@ xlog_alloc_log(xfs_mount_t *mp,
 
 	error = EFSCORRUPTED;
 	if (xfs_sb_version_hassector(&mp->m_sb)) {
-		log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
-		if (log->l_sectbb_log < 0 ||
-		    log->l_sectbb_log > mp->m_sectbb_log) {
-			xlog_warn("XFS: Log sector size (0x%x) out of range.",
-						log->l_sectbb_log);
+		log2_size = mp->m_sb.sb_logsectlog;
+		if (log2_size < BBSHIFT) {
+			xlog_warn("XFS: Log sector size too small "
+				"(0x%x < 0x%x)", log2_size, BBSHIFT);
 			goto out_free_log;
 		}
 
-		/* for larger sector sizes, must have v2 or external log */
-		if (log->l_sectbb_log != 0 &&
-		    (log->l_logBBstart != 0 &&
-		     !xfs_sb_version_haslogv2(&mp->m_sb))) {
-			xlog_warn("XFS: log sector size (0x%x) invalid "
-				  "for configuration.", log->l_sectbb_log);
+		log2_size -= BBSHIFT;
+		if (log2_size > mp->m_sectbb_log) {
+			xlog_warn("XFS: Log sector size too large "
+				"(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
 			goto out_free_log;
 		}
-		if (mp->m_sb.sb_logsectlog < BBSHIFT) {
-			xlog_warn("XFS: Log sector log (0x%x) too small.",
-						mp->m_sb.sb_logsectlog);
+
+		/* for larger sector sizes, must have v2 or external log */
+		if (log2_size && log->l_logBBstart > 0 &&
+		    !xfs_sb_version_haslogv2(&mp->m_sb)) {
+
+			xlog_warn("XFS: log sector size (0x%x) invalid "
+				  "for configuration.", log2_size);
 			goto out_free_log;
 		}
 	}
-	log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
+	log->l_sectBBsize = 1 << log2_size;
 
 	xlog_get_iclog_buffer_size(mp, log);
 
@@ -1096,7 +1121,6 @@ xlog_alloc_log(xfs_mount_t *mp,
 	 * with different amounts of memory.  See the definition of
 	 * xlog_in_core_t in xfs_log_priv.h for details.
 	 */
-	iclogsize = log->l_iclog_size;
 	ASSERT(log->l_iclog_size >= 4096);
 	for (i=0; i < log->l_iclog_bufs; i++) {
 		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
@@ -1147,6 +1171,9 @@ xlog_alloc_log(xfs_mount_t *mp,
 	*iclogp = log->l_iclog;			/* complete ring */
 	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
 
+	error = xlog_cil_init(log);
+	if (error)
+		goto out_free_iclog;
 	return log;
 
 out_free_iclog:
@@ -1174,26 +1201,31 @@ out:
  * ticket. Return the lsn of the commit record.
  */
 STATIC int
-xlog_commit_record(xfs_mount_t *mp,
-		   xlog_ticket_t *ticket,
-		   xlog_in_core_t **iclog,
-		   xfs_lsn_t *commitlsnp)
+xlog_commit_record(
+	struct log		*log,
+	struct xlog_ticket	*ticket,
+	struct xlog_in_core	**iclog,
+	xfs_lsn_t		*commitlsnp)
 {
-	int		error;
-	xfs_log_iovec_t	reg[1];
-
-	reg[0].i_addr = NULL;
-	reg[0].i_len = 0;
-	reg[0].i_type = XLOG_REG_TYPE_COMMIT;
+	struct xfs_mount *mp = log->l_mp;
+	int	error;
+	struct xfs_log_iovec reg = {
+		.i_addr = NULL,
+		.i_len = 0,
+		.i_type = XLOG_REG_TYPE_COMMIT,
+	};
+	struct xfs_log_vec vec = {
+		.lv_niovecs = 1,
+		.lv_iovecp = &reg,
+	};
 
 	ASSERT_ALWAYS(iclog);
-	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
-			       iclog, XLOG_COMMIT_TRANS))) {
+	error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
+			   XLOG_COMMIT_TRANS);
+	if (error)
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-	}
 	return error;
-} /* xlog_commit_record */
-
+}
 
 /*
  * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1389,11 +1421,8 @@ xlog_sync(xlog_t *log,
 	XFS_BUF_BUSY(bp);
 	XFS_BUF_ASYNC(bp);
 	bp->b_flags |= XBF_LOG_BUFFER;
-	/*
-	 * Do an ordered write for the log block.
-	 * Its unnecessary to flush the first split block in the log wrap case.
-	 */
-	if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER))
+
+	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 		XFS_BUF_ORDERED(bp);
 
 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
@@ -1468,6 +1497,8 @@ xlog_dealloc_log(xlog_t *log)
 	xlog_in_core_t	*iclog, *next_iclog;
 	int		i;
 
+	xlog_cil_destroy(log);
+
 	iclog = log->l_iclog;
 	for (i=0; i<log->l_iclog_bufs; i++) {
 		sv_destroy(&iclog->ic_force_wait);
1473 sv_destroy(&iclog->ic_force_wait); 1504 sv_destroy(&iclog->ic_force_wait);
@@ -1510,8 +1541,10 @@ xlog_state_finish_copy(xlog_t *log,
1510 * print out info relating to regions written which consume 1541 * print out info relating to regions written which consume
1511 * the reservation 1542 * the reservation
1512 */ 1543 */
1513STATIC void 1544void
1514xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1545xlog_print_tic_res(
1546 struct xfs_mount *mp,
1547 struct xlog_ticket *ticket)
1515{ 1548{
1516 uint i; 1549 uint i;
1517 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1550 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1611,6 +1644,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1611 "bad-rtype" : res_type_str[r_type-1]), 1644 "bad-rtype" : res_type_str[r_type-1]),
1612 ticket->t_res_arr[i].r_len); 1645 ticket->t_res_arr[i].r_len);
1613 } 1646 }
1647
1648 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1649 "xfs_log_write: reservation ran out. Need to up reservation");
1650 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1651}
1652
1653/*
1654 * Calculate the potential space needed by the log vector. Each region gets
1655 * its own xlog_op_header_t and may need to be double word aligned.
1656 */
1657static int
1658xlog_write_calc_vec_length(
1659 struct xlog_ticket *ticket,
1660 struct xfs_log_vec *log_vector)
1661{
1662 struct xfs_log_vec *lv;
1663 int headers = 0;
1664 int len = 0;
1665 int i;
1666
1667 /* acct for start rec of xact */
1668 if (ticket->t_flags & XLOG_TIC_INITED)
1669 headers++;
1670
1671 for (lv = log_vector; lv; lv = lv->lv_next) {
1672 headers += lv->lv_niovecs;
1673
1674 for (i = 0; i < lv->lv_niovecs; i++) {
1675 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
1676
1677 len += vecp->i_len;
1678 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
1679 }
1680 }
1681
1682 ticket->t_res_num_ophdrs += headers;
1683 len += headers * sizeof(struct xlog_op_header);
1684
1685 return len;
1686}
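
The helper above replaces an open-coded loop in xlog_write(). A minimal user-space sketch of the same accounting, using simplified stand-in structures and an assumed 12-byte op header (the real sizes come from the private log headers):

#include <stdio.h>

/* simplified stand-ins for the kernel structures */
struct iovec_s { int i_len; };
struct logvec_s {
	struct logvec_s	*lv_next;
	int		lv_niovecs;
	struct iovec_s	*lv_iovecp;
};

#define OPHDR_SIZE 12	/* assumed size of the on-disk op header */

static int calc_vec_length(struct logvec_s *log_vector, int need_start_rec)
{
	int headers = need_start_rec ? 1 : 0;	/* start record ophdr */
	int len = 0;

	for (struct logvec_s *lv = log_vector; lv; lv = lv->lv_next) {
		headers += lv->lv_niovecs;	/* one ophdr per region */
		for (int i = 0; i < lv->lv_niovecs; i++)
			len += lv->lv_iovecp[i].i_len;
	}
	return len + headers * OPHDR_SIZE;
}

int main(void)
{
	struct iovec_s regs[2] = { { .i_len = 128 }, { .i_len = 64 } };
	struct logvec_s lv = { .lv_niovecs = 2, .lv_iovecp = regs };

	/* 192 payload bytes plus 3 op headers (start rec + 2 regions) */
	printf("%d\n", calc_vec_length(&lv, 1));	/* prints 228 */
	return 0;
}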
1687
1688/*
1689 * If this is the first write for the transaction, insert the start record. We
1690 * can't be trying to commit if we are inited. We can't have any "partial_copy" if we are inited.
1691 */
1692static int
1693xlog_write_start_rec(
1694 struct xlog_op_header *ophdr,
1695 struct xlog_ticket *ticket)
1696{
1697 if (!(ticket->t_flags & XLOG_TIC_INITED))
1698 return 0;
1699
1700 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1701 ophdr->oh_clientid = ticket->t_clientid;
1702 ophdr->oh_len = 0;
1703 ophdr->oh_flags = XLOG_START_TRANS;
1704 ophdr->oh_res2 = 0;
1705
1706 ticket->t_flags &= ~XLOG_TIC_INITED;
1707
1708 return sizeof(struct xlog_op_header);
1709}
1710
1711static xlog_op_header_t *
1712xlog_write_setup_ophdr(
1713 struct log *log,
1714 struct xlog_op_header *ophdr,
1715 struct xlog_ticket *ticket,
1716 uint flags)
1717{
1718 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1719 ophdr->oh_clientid = ticket->t_clientid;
1720 ophdr->oh_res2 = 0;
1721
1722 /* are we copying a commit or unmount record? */
1723 ophdr->oh_flags = flags;
1724
1725 /*
1726 * We've seen logs corrupted with bad transaction client ids. This
1727 * makes sure that XFS doesn't generate them on disk. If we spot one,
1728 * turn it into an EIO and shut down the filesystem.
1729 */
1730 switch (ophdr->oh_clientid) {
1731 case XFS_TRANSACTION:
1732 case XFS_VOLUME:
1733 case XFS_LOG:
1734 break;
1735 default:
1736 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1737 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1738 ophdr->oh_clientid, ticket);
1739 return NULL;
1740 }
1741
1742 return ophdr;
1743}
1744
1745/*
1746 * Set up the parameters of the region copy into the log. This has
1747 * to handle writes of a region split across multiple log buffers - this
1748 * state is kept external to this function so that the code can
1749 * be written in an obvious, self-documenting manner.
1750 */
1751static int
1752xlog_write_setup_copy(
1753 struct xlog_ticket *ticket,
1754 struct xlog_op_header *ophdr,
1755 int space_available,
1756 int space_required,
1757 int *copy_off,
1758 int *copy_len,
1759 int *last_was_partial_copy,
1760 int *bytes_consumed)
1761{
1762 int still_to_copy;
1763
1764 still_to_copy = space_required - *bytes_consumed;
1765 *copy_off = *bytes_consumed;
1766
1767 if (still_to_copy <= space_available) {
1768 /* write of region completes here */
1769 *copy_len = still_to_copy;
1770 ophdr->oh_len = cpu_to_be32(*copy_len);
1771 if (*last_was_partial_copy)
1772 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1773 *last_was_partial_copy = 0;
1774 *bytes_consumed = 0;
1775 return 0;
1776 }
1777
1778 /* partial write of region, needs extra log op header reservation */
1779 *copy_len = space_available;
1780 ophdr->oh_len = cpu_to_be32(*copy_len);
1781 ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
1782 if (*last_was_partial_copy)
1783 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
1784 *bytes_consumed += *copy_len;
1785 (*last_was_partial_copy)++;
1786
1787 /* account for new log op header */
1788 ticket->t_curr_res -= sizeof(struct xlog_op_header);
1789 ticket->t_res_num_ophdrs++;
1790
1791 return sizeof(struct xlog_op_header);
1792}
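
Because the split state lives in the caller, the helper above behaves as a small state machine. A hypothetical user-space model of one region driven through two undersized buffers, with the XLOG_* flags replaced by plain bits and a 12-byte op header assumed:

#include <stdio.h>

#define F_CONT 0x1	/* stands in for XLOG_CONTINUE_TRANS */
#define F_WAS  0x2	/* stands in for XLOG_WAS_CONT_TRANS */
#define F_END  0x4	/* stands in for XLOG_END_TRANS */

/* returns extra ophdr bytes consumed, 0 when the region completes */
static int setup_copy(int space_avail, int space_reqd,
		      int *copy_off, int *copy_len,
		      int *was_partial, int *consumed, int *flags)
{
	int still_to_copy = space_reqd - *consumed;

	*copy_off = *consumed;
	if (still_to_copy <= space_avail) {	/* write completes here */
		*copy_len = still_to_copy;
		if (*was_partial)
			*flags |= F_END | F_WAS;
		*was_partial = 0;
		*consumed = 0;
		return 0;
	}
	*copy_len = space_avail;		/* partial write of region */
	*flags |= F_CONT;
	if (*was_partial)
		*flags |= F_WAS;
	*consumed += *copy_len;
	(*was_partial)++;
	return 12;	/* modelled extra op header reservation */
}

int main(void)
{
	int off, len, partial = 0, consumed = 0;

	for (int buf = 0; buf < 2; buf++) {
		int flags = 0;
		setup_copy(100, 150, &off, &len, &partial, &consumed, &flags);
		printf("buf %d: off=%d len=%d flags=%#x\n", buf, off, len, flags);
	}
	/* buf 0: off=0 len=100 flags=0x1 ; buf 1: off=100 len=50 flags=0x6 */
	return 0;
}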
1793
1794static int
1795xlog_write_copy_finish(
1796 struct log *log,
1797 struct xlog_in_core *iclog,
1798 uint flags,
1799 int *record_cnt,
1800 int *data_cnt,
1801 int *partial_copy,
1802 int *partial_copy_len,
1803 int log_offset,
1804 struct xlog_in_core **commit_iclog)
1805{
1806 if (*partial_copy) {
1807 /*
1808 * This iclog has already been marked WANT_SYNC by
1809 * xlog_state_get_iclog_space.
1810 */
1811 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1812 *record_cnt = 0;
1813 *data_cnt = 0;
1814 return xlog_state_release_iclog(log, iclog);
1815 }
1816
1817 *partial_copy = 0;
1818 *partial_copy_len = 0;
1819
1820 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1821 /* no more space in this iclog - push it. */
1822 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1823 *record_cnt = 0;
1824 *data_cnt = 0;
1825
1826 spin_lock(&log->l_icloglock);
1827 xlog_state_want_sync(log, iclog);
1828 spin_unlock(&log->l_icloglock);
1829
1830 if (!commit_iclog)
1831 return xlog_state_release_iclog(log, iclog);
1832 ASSERT(flags & XLOG_COMMIT_TRANS);
1833 *commit_iclog = iclog;
1834 }
1835
1836 return 0;
1614} 1837}
1615 1838
1616/* 1839/*
@@ -1653,211 +1876,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1653 * we don't update ic_offset until the end when we know exactly how many 1876 * we don't update ic_offset until the end when we know exactly how many
1654 * bytes have been written out. 1877 * bytes have been written out.
1655 */ 1878 */
1656STATIC int 1879int
1657xlog_write( 1880xlog_write(
1658 struct xfs_mount *mp, 1881 struct log *log,
1659 struct xfs_log_iovec reg[], 1882 struct xfs_log_vec *log_vector,
1660 int nentries,
1661 struct xlog_ticket *ticket, 1883 struct xlog_ticket *ticket,
1662 xfs_lsn_t *start_lsn, 1884 xfs_lsn_t *start_lsn,
1663 struct xlog_in_core **commit_iclog, 1885 struct xlog_in_core **commit_iclog,
1664 uint flags) 1886 uint flags)
1665{ 1887{
1666 xlog_t *log = mp->m_log; 1888 struct xlog_in_core *iclog = NULL;
1667 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1889 struct xfs_log_iovec *vecp;
1668 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1890 struct xfs_log_vec *lv;
1669 __psint_t ptr; /* copy address into data region */ 1891 int len;
1670 int len; /* # xlog_write() bytes 2 still copy */ 1892 int index;
1671 int index; /* region index currently copying */ 1893 int partial_copy = 0;
1672 int log_offset; /* offset (from 0) into data region */ 1894 int partial_copy_len = 0;
1673 int start_rec_copy; /* # bytes to copy for start record */ 1895 int contwr = 0;
1674 int partial_copy; /* did we split a region? */ 1896 int record_cnt = 0;
1675 int partial_copy_len;/* # bytes copied if split region */ 1897 int data_cnt = 0;
1676 int need_copy; /* # bytes need to memcpy this region */ 1898 int error;
1677 int copy_len; /* # bytes actually memcpy'ing */
1678 int copy_off; /* # bytes from entry start */
1679 int contwr; /* continued write of in-core log? */
1680 int error;
1681 int record_cnt = 0, data_cnt = 0;
1682
1683 partial_copy_len = partial_copy = 0;
1684
1685 /* Calculate potential maximum space. Each region gets its own
1686 * xlog_op_header_t and may need to be double word aligned.
1687 */
1688 len = 0;
1689 if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */
1690 len += sizeof(xlog_op_header_t);
1691 ticket->t_res_num_ophdrs++;
1692 }
1693
1694 for (index = 0; index < nentries; index++) {
1695 len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
1696 ticket->t_res_num_ophdrs++;
1697 len += reg[index].i_len;
1698 xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
1699 }
1700 contwr = *start_lsn = 0;
1701 1899
1702 if (ticket->t_curr_res < len) { 1900 *start_lsn = 0;
1703 xlog_print_tic_res(mp, ticket);
1704#ifdef DEBUG
1705 xlog_panic(
1706 "xfs_log_write: reservation ran out. Need to up reservation");
1707#else
1708 /* Customer configurable panic */
1709 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1710 "xfs_log_write: reservation ran out. Need to up reservation");
1711 /* If we did not panic, shutdown the filesystem */
1712 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1713#endif
1714 } else
1715 ticket->t_curr_res -= len;
1716 1901
1717 for (index = 0; index < nentries; ) { 1902 len = xlog_write_calc_vec_length(ticket, log_vector);
1718 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 1903 if (log->l_cilp) {
1719 &contwr, &log_offset))) 1904 /*
1720 return error; 1905 * Region headers and bytes are already accounted for.
1906 * We only need to take into account start records and
1907 * split regions in this function.
1908 */
1909 if (ticket->t_flags & XLOG_TIC_INITED)
1910 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1721 1911
1722 ASSERT(log_offset <= iclog->ic_size - 1); 1912 /*
1723 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); 1913 * Commit record headers need to be accounted for. These
1914 * come in as separate writes so are easy to detect.
1915 */
1916 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1917 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1918 } else
1919 ticket->t_curr_res -= len;
1920
1921 if (ticket->t_curr_res < 0)
1922 xlog_print_tic_res(log->l_mp, ticket);
1923
1924 index = 0;
1925 lv = log_vector;
1926 vecp = lv->lv_iovecp;
1927 while (lv && index < lv->lv_niovecs) {
1928 void *ptr;
1929 int log_offset;
1930
1931 error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1932 &contwr, &log_offset);
1933 if (error)
1934 return error;
1724 1935
1725 /* start_lsn is the first lsn written to. That's all we need. */ 1936 ASSERT(log_offset <= iclog->ic_size - 1);
1726 if (! *start_lsn) 1937 ptr = iclog->ic_datap + log_offset;
1727 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1728 1938
1729 /* This loop writes out as many regions as can fit in the amount 1939 /* start_lsn is the first lsn written to. That's all we need. */
1730 * of space which was allocated by xlog_state_get_iclog_space(). 1940 if (!*start_lsn)
1731 */ 1941 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1732 while (index < nentries) {
1733 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
1734 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
1735 start_rec_copy = 0;
1736
1737 /* If first write for transaction, insert start record.
1738 * We can't be trying to commit if we are inited. We can't
1739 * have any "partial_copy" if we are inited.
1740 */
1741 if (ticket->t_flags & XLOG_TIC_INITED) {
1742 logop_head = (xlog_op_header_t *)ptr;
1743 logop_head->oh_tid = cpu_to_be32(ticket->t_tid);
1744 logop_head->oh_clientid = ticket->t_clientid;
1745 logop_head->oh_len = 0;
1746 logop_head->oh_flags = XLOG_START_TRANS;
1747 logop_head->oh_res2 = 0;
1748 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
1749 record_cnt++;
1750
1751 start_rec_copy = sizeof(xlog_op_header_t);
1752 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
1753 }
1754 1942
1755 /* Copy log operation header directly into data section */ 1943 /*
1756 logop_head = (xlog_op_header_t *)ptr; 1944 * This loop writes out as many regions as can fit in the amount
1757 logop_head->oh_tid = cpu_to_be32(ticket->t_tid); 1945 * of space which was allocated by xlog_state_get_iclog_space().
1758 logop_head->oh_clientid = ticket->t_clientid; 1946 */
1759 logop_head->oh_res2 = 0; 1947 while (lv && index < lv->lv_niovecs) {
1948 struct xfs_log_iovec *reg = &vecp[index];
1949 struct xlog_op_header *ophdr;
1950 int start_rec_copy;
1951 int copy_len;
1952 int copy_off;
1953
1954 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
1955 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
1956
1957 start_rec_copy = xlog_write_start_rec(ptr, ticket);
1958 if (start_rec_copy) {
1959 record_cnt++;
1960 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1961 start_rec_copy);
1962 }
1760 1963
1761 /* header copied directly */ 1964 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
1762 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); 1965 if (!ophdr)
1966 return XFS_ERROR(EIO);
1763 1967
1764 /* are we copying a commit or unmount record? */ 1968 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1765 logop_head->oh_flags = flags; 1969 sizeof(struct xlog_op_header));
1970
1971 len += xlog_write_setup_copy(ticket, ophdr,
1972 iclog->ic_size-log_offset,
1973 reg->i_len,
1974 &copy_off, &copy_len,
1975 &partial_copy,
1976 &partial_copy_len);
1977 xlog_verify_dest_ptr(log, ptr);
1978
1979 /* copy region */
1980 ASSERT(copy_len >= 0);
1981 memcpy(ptr, reg->i_addr + copy_off, copy_len);
1982 xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
1983
1984 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1985 record_cnt++;
1986 data_cnt += contwr ? copy_len : 0;
1987
1988 error = xlog_write_copy_finish(log, iclog, flags,
1989 &record_cnt, &data_cnt,
1990 &partial_copy,
1991 &partial_copy_len,
1992 log_offset,
1993 commit_iclog);
1994 if (error)
1995 return error;
1766 1996
1767 /* 1997 /*
1768 * We've seen logs corrupted with bad transaction client 1998 * if we had a partial copy, we need to get more iclog
1769 * ids. This makes sure that XFS doesn't generate them on. 1999 * space but we don't want to increment the region
1770 * Turn this into an EIO and shut down the filesystem. 2000 * index because there is still more in this region to
1771 */ 2001 * write.
1772 switch (logop_head->oh_clientid) { 2002 *
1773 case XFS_TRANSACTION: 2003 * If we completed writing this region, and we flushed
1774 case XFS_VOLUME: 2004 * the iclog (indicated by resetting of the record
1775 case XFS_LOG: 2005 * count), then we also need to get more log space. If
1776 break; 2006 * this was the last record, though, we are done and
1777 default: 2007 * can just return.
1778 xfs_fs_cmn_err(CE_WARN, mp, 2008 */
1779 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 2009 if (partial_copy)
1780 logop_head->oh_clientid, ticket); 2010 break;
1781 return XFS_ERROR(EIO);
1782 }
1783 2011
1784 /* Partial write last time? => (partial_copy != 0) 2012 if (++index == lv->lv_niovecs) {
1785 * need_copy is the amount we'd like to copy if everything could 2013 lv = lv->lv_next;
1786 * fit in the current memcpy. 2014 index = 0;
1787 */ 2015 if (lv)
1788 need_copy = reg[index].i_len - partial_copy_len; 2016 vecp = lv->lv_iovecp;
1789 2017 }
1790 copy_off = partial_copy_len; 2018 if (record_cnt == 0) {
1791 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ 2019 if (!lv)
1792 copy_len = need_copy; 2020 return 0;
1793 logop_head->oh_len = cpu_to_be32(copy_len); 2021 break;
1794 if (partial_copy) 2022 }
1795 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1796 partial_copy_len = partial_copy = 0;
1797 } else { /* partial write */
1798 copy_len = iclog->ic_size - log_offset;
1799 logop_head->oh_len = cpu_to_be32(copy_len);
1800 logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
1801 if (partial_copy)
1802 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
1803 partial_copy_len += copy_len;
1804 partial_copy++;
1805 len += sizeof(xlog_op_header_t); /* from splitting of region */
1806 /* account for new log op header */
1807 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1808 ticket->t_res_num_ophdrs++;
1809 }
1810 xlog_verify_dest_ptr(log, ptr);
1811
1812 /* copy region */
1813 ASSERT(copy_len >= 0);
1814 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
1815 xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
1816
1817 /* make copy_len total bytes copied, including headers */
1818 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1819 record_cnt++;
1820 data_cnt += contwr ? copy_len : 0;
1821 if (partial_copy) { /* copied partial region */
1822 /* already marked WANT_SYNC by xlog_state_get_iclog_space */
1823 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1824 record_cnt = data_cnt = 0;
1825 if ((error = xlog_state_release_iclog(log, iclog)))
1826 return error;
1827 break; /* don't increment index */
1828 } else { /* copied entire region */
1829 index++;
1830 partial_copy_len = partial_copy = 0;
1831
1832 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1833 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1834 record_cnt = data_cnt = 0;
1835 spin_lock(&log->l_icloglock);
1836 xlog_state_want_sync(log, iclog);
1837 spin_unlock(&log->l_icloglock);
1838 if (commit_iclog) {
1839 ASSERT(flags & XLOG_COMMIT_TRANS);
1840 *commit_iclog = iclog;
1841 } else if ((error = xlog_state_release_iclog(log, iclog)))
1842 return error;
1843 if (index == nentries)
1844 return 0; /* we are done */
1845 else
1846 break;
1847 } 2023 }
1848 } /* if (partial_copy) */ 2024 }
1849 } /* while (index < nentries) */ 2025
1850 } /* for (index = 0; index < nentries; ) */ 2026 ASSERT(len == 0);
1851 ASSERT(len == 0); 2027
2028 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
2029 if (!commit_iclog)
2030 return xlog_state_release_iclog(log, iclog);
1852 2031
1853 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1854 if (commit_iclog) {
1855 ASSERT(flags & XLOG_COMMIT_TRANS); 2032 ASSERT(flags & XLOG_COMMIT_TRANS);
1856 *commit_iclog = iclog; 2033 *commit_iclog = iclog;
1857 return 0; 2034 return 0;
1858 } 2035}
1859 return xlog_state_release_iclog(log, iclog);
1860} /* xlog_write */
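
For callers, the visible change is that xlog_write() now takes a chain of struct xfs_log_vec rather than a flat iovec array plus a count. A sketch of the new calling convention, mirroring the converted xlog_commit_record() at the top of this hunk (not standalone; it relies on the declarations this patch adds to xfs_log.h):

	/* sketch only - mirrors the converted xlog_commit_record() above */
	struct xfs_log_iovec reg = {
		.i_addr = NULL,
		.i_len  = 0,
		.i_type = XLOG_REG_TYPE_COMMIT,
	};
	struct xfs_log_vec vec = {
		.lv_niovecs = 1,
		.lv_iovecp  = &reg,
	};

	error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
			   XLOG_COMMIT_TRANS);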
1861 2036
1862 2037
1863/***************************************************************************** 2038/*****************************************************************************
@@ -2840,6 +3015,9 @@ _xfs_log_force(
2840 3015
2841 XFS_STATS_INC(xs_log_force); 3016 XFS_STATS_INC(xs_log_force);
2842 3017
3018 if (log->l_cilp)
3019 xlog_cil_force(log);
3020
2843 spin_lock(&log->l_icloglock); 3021 spin_lock(&log->l_icloglock);
2844 3022
2845 iclog = log->l_iclog; 3023 iclog = log->l_iclog;
@@ -2989,6 +3167,12 @@ _xfs_log_force_lsn(
2989 3167
2990 XFS_STATS_INC(xs_log_force); 3168 XFS_STATS_INC(xs_log_force);
2991 3169
3170 if (log->l_cilp) {
3171 lsn = xlog_cil_force_lsn(log, lsn);
3172 if (lsn == NULLCOMMITLSN)
3173 return 0;
3174 }
3175
2992try_again: 3176try_again:
2993 spin_lock(&log->l_icloglock); 3177 spin_lock(&log->l_icloglock);
2994 iclog = log->l_iclog; 3178 iclog = log->l_iclog;
@@ -3153,20 +3337,30 @@ xfs_log_ticket_get(
3153 return ticket; 3337 return ticket;
3154} 3338}
3155 3339
3340xlog_tid_t
3341xfs_log_get_trans_ident(
3342 struct xfs_trans *tp)
3343{
3344 return tp->t_ticket->t_tid;
3345}
3346
3156/* 3347/*
3157 * Allocate and initialise a new log ticket. 3348 * Allocate and initialise a new log ticket.
3158 */ 3349 */
3159STATIC xlog_ticket_t * 3350xlog_ticket_t *
3160xlog_ticket_alloc(xlog_t *log, 3351xlog_ticket_alloc(
3161 int unit_bytes, 3352 struct log *log,
3162 int cnt, 3353 int unit_bytes,
3163 char client, 3354 int cnt,
3164 uint xflags) 3355 char client,
3356 uint xflags,
3357 int alloc_flags)
3165{ 3358{
3166 xlog_ticket_t *tic; 3359 struct xlog_ticket *tic;
3167 uint num_headers; 3360 uint num_headers;
3361 int iclog_space;
3168 3362
3169 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3363 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3170 if (!tic) 3364 if (!tic)
3171 return NULL; 3365 return NULL;
3172 3366
@@ -3208,16 +3402,40 @@ xlog_ticket_alloc(xlog_t *log,
3208 /* for start-rec */ 3402 /* for start-rec */
3209 unit_bytes += sizeof(xlog_op_header_t); 3403 unit_bytes += sizeof(xlog_op_header_t);
3210 3404
3211 /* for LR headers */ 3405 /*
3212 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); 3406 * for LR headers - the space for data in an iclog is the size minus
3407 * the space used for the headers. If we use the iclog size, then we
3408 * undercalculate the number of headers required.
3409 *
3410 * Furthermore - the addition of op headers for split-recs might
3411 * increase the space required enough to require more log and op
3412 * headers, so take that into account too.
3413 *
3414 * IMPORTANT: This reservation makes the assumption that if this
3415 * transaction is the first in an iclog and hence has the LR headers
3416 * accounted to it, then the remaining space in the iclog is
3417 * exclusively for this transaction. i.e. if the transaction is larger
3418 * than the iclog, it will be the only thing in that iclog.
3419 * Fundamentally, this means we must pass the entire log vector to
3420 * xlog_write to guarantee this.
3421 */
3422 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3423 num_headers = howmany(unit_bytes, iclog_space);
3424
3425 /* for split-recs - ophdrs added when data split over LRs */
3426 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3427
3428 /* add extra header reservations if we overrun */
3429 while (!num_headers ||
3430 howmany(unit_bytes, iclog_space) > num_headers) {
3431 unit_bytes += sizeof(xlog_op_header_t);
3432 num_headers++;
3433 }
3213 unit_bytes += log->l_iclog_hsize * num_headers; 3434 unit_bytes += log->l_iclog_hsize * num_headers;
3214 3435
3215 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3436 /* for commit-rec LR header - note: padding will subsume the ophdr */
3216 unit_bytes += log->l_iclog_hsize; 3437 unit_bytes += log->l_iclog_hsize;
3217 3438
3218 /* for split-recs - ophdrs added when data split over LRs */
3219 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3220
3221 /* for roundoff padding for transaction data and one for commit record */ 3439 /* for roundoff padding for transaction data and one for commit record */
3222 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3440 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
3223 log->l_mp->m_sb.sb_logsunit > 1) { 3441 log->l_mp->m_sb.sb_logsunit > 1) {
@@ -3233,13 +3451,13 @@ xlog_ticket_alloc(xlog_t *log,
3233 tic->t_curr_res = unit_bytes; 3451 tic->t_curr_res = unit_bytes;
3234 tic->t_cnt = cnt; 3452 tic->t_cnt = cnt;
3235 tic->t_ocnt = cnt; 3453 tic->t_ocnt = cnt;
3236 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); 3454 tic->t_tid = random32();
3237 tic->t_clientid = client; 3455 tic->t_clientid = client;
3238 tic->t_flags = XLOG_TIC_INITED; 3456 tic->t_flags = XLOG_TIC_INITED;
3239 tic->t_trans_type = 0; 3457 tic->t_trans_type = 0;
3240 if (xflags & XFS_LOG_PERM_RESERV) 3458 if (xflags & XFS_LOG_PERM_RESERV)
3241 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3459 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3242 sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); 3460 sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
3243 3461
3244 xlog_tic_reset_res(tic); 3462 xlog_tic_reset_res(tic);
3245 3463
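
The reworked LR-header estimate above is easier to follow with numbers. A standalone model of just that calculation, for a hypothetical 256 KB reservation against 32 KB iclogs with 512-byte headers and an assumed 12-byte op header:

#include <stdio.h>

#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

int main(void)
{
	int iclog_size = 32768, hsize = 512, ophdr = 12;
	int unit_bytes = 262144;		/* transaction reservation */
	int iclog_space = iclog_size - hsize;	/* usable bytes per iclog */
	int num_headers = howmany(unit_bytes, iclog_space);

	/* split-rec ophdrs, then re-check in case they overflowed an iclog */
	unit_bytes += ophdr * num_headers;
	while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) {
		unit_bytes += ophdr;
		num_headers++;
	}
	unit_bytes += hsize * num_headers;	/* the LR headers themselves */

	printf("num_headers=%d unit_bytes=%d\n", num_headers, unit_bytes);
	return 0;
}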
@@ -3260,20 +3478,22 @@ xlog_ticket_alloc(xlog_t *log,
3260 * part of the log in case we trash the log structure. 3478 * part of the log in case we trash the log structure.
3261 */ 3479 */
3262void 3480void
3263xlog_verify_dest_ptr(xlog_t *log, 3481xlog_verify_dest_ptr(
3264 __psint_t ptr) 3482 struct log *log,
3483 char *ptr)
3265{ 3484{
3266 int i; 3485 int i;
3267 int good_ptr = 0; 3486 int good_ptr = 0;
3268 3487
3269 for (i=0; i < log->l_iclog_bufs; i++) { 3488 for (i = 0; i < log->l_iclog_bufs; i++) {
3270 if (ptr >= (__psint_t)log->l_iclog_bak[i] && 3489 if (ptr >= log->l_iclog_bak[i] &&
3271 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) 3490 ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
3272 good_ptr++; 3491 good_ptr++;
3273 } 3492 }
3274 if (! good_ptr) 3493
3494 if (!good_ptr)
3275 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3495 xlog_panic("xlog_verify_dest_ptr: invalid ptr");
3276} /* xlog_verify_dest_ptr */ 3496}
3277 3497
3278STATIC void 3498STATIC void
3279xlog_verify_grant_head(xlog_t *log, int equals) 3499xlog_verify_grant_head(xlog_t *log, int equals)
@@ -3459,6 +3679,11 @@ xlog_state_ioerror(
3459 * c. nothing new gets queued up after (a) and (b) are done. 3679 * c. nothing new gets queued up after (a) and (b) are done.
3460 * d. if !logerror, flush the iclogs to disk, then seal them off 3680 * d. if !logerror, flush the iclogs to disk, then seal them off
3461 * for business. 3681 * for business.
3682 *
3683 * Note: for delayed logging the !logerror case needs to flush the regions
3684 * held in memory out to the iclogs before flushing them to disk. This needs
3685 * to be done before the log is marked as shutdown, otherwise the flush to the
3686 * iclogs will fail.
3462 */ 3687 */
3463int 3688int
3464xfs_log_force_umount( 3689xfs_log_force_umount(
@@ -3492,6 +3717,16 @@ xfs_log_force_umount(
3492 return 1; 3717 return 1;
3493 } 3718 }
3494 retval = 0; 3719 retval = 0;
3720
3721 /*
3722 * Flush the in memory commit item list before marking the log as
3723 * being shut down. We need to do it in this order to ensure all the
3724 * completed transactions are flushed to disk with the xfs_log_force()
3725 * call below.
3726 */
3727 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3728 xlog_cil_force(log);
3729
3495 /* 3730 /*
3496 * We must hold both the GRANT lock and the LOG lock, 3731 * We must hold both the GRANT lock and the LOG lock,
3497 * before we mark the filesystem SHUTDOWN and wake 3732 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7795a4..916eb7db14d9 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21/* get lsn fields */
22
23#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) 22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
24#define BLOCK_LSN(lsn) ((uint)(lsn)) 23#define BLOCK_LSN(lsn) ((uint)(lsn))
25 24
@@ -56,14 +55,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
56/* 55/*
57 * Flags to xfs_log_reserve() 56 * Flags to xfs_log_reserve()
58 * 57 *
59 * XFS_LOG_SLEEP: If space is not available, sleep (default)
60 * XFS_LOG_NOSLEEP: If space is not available, return error
61 * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are 58 * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
62 * performed against this type of reservation, the reservation 59 * performed against this type of reservation, the reservation
63 * is not decreased. Long running transactions should use this. 60 * is not decreased. Long running transactions should use this.
64 */ 61 */
65#define XFS_LOG_SLEEP 0x0
66#define XFS_LOG_NOSLEEP 0x1
67#define XFS_LOG_PERM_RESERV 0x2 62#define XFS_LOG_PERM_RESERV 0x2
68 63
69/* 64/*
@@ -105,11 +100,20 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
105#define XLOG_REG_TYPE_MAX 19 100#define XLOG_REG_TYPE_MAX 19
106 101
107typedef struct xfs_log_iovec { 102typedef struct xfs_log_iovec {
108 xfs_caddr_t i_addr; /* beginning address of region */ 103 void *i_addr; /* beginning address of region */
109 int i_len; /* length in bytes of region */ 104 int i_len; /* length in bytes of region */
110 uint i_type; /* type of region */ 105 uint i_type; /* type of region */
111} xfs_log_iovec_t; 106} xfs_log_iovec_t;
112 107
108struct xfs_log_vec {
109 struct xfs_log_vec *lv_next; /* next lv in build list */
110 int lv_niovecs; /* number of iovecs in lv */
111 struct xfs_log_iovec *lv_iovecp; /* iovec array */
112 struct xfs_log_item *lv_item; /* owner */
113 char *lv_buf; /* formatted buffer */
114 int lv_buf_len; /* size of formatted buffer */
115};
116
113/* 117/*
114 * Structure used to pass callback function and the function's argument 118 * Structure used to pass callback function and the function's argument
115 * to the log manager. 119 * to the log manager.
@@ -126,6 +130,14 @@ typedef struct xfs_log_callback {
126struct xfs_mount; 130struct xfs_mount;
127struct xlog_in_core; 131struct xlog_in_core;
128struct xlog_ticket; 132struct xlog_ticket;
133struct xfs_log_item;
134struct xfs_item_ops;
135struct xfs_trans;
136
137void xfs_log_item_init(struct xfs_mount *mp,
138 struct xfs_log_item *item,
139 int type,
140 struct xfs_item_ops *ops);
129 141
130xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 142xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
131 struct xlog_ticket *ticket, 143 struct xlog_ticket *ticket,
@@ -174,13 +186,15 @@ int xfs_log_need_covered(struct xfs_mount *mp);
174 186
175void xlog_iodone(struct xfs_buf *); 187void xlog_iodone(struct xfs_buf *);
176 188
177struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 189struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
178void xfs_log_ticket_put(struct xlog_ticket *ticket); 190void xfs_log_ticket_put(struct xlog_ticket *ticket);
179 191
180#endif 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
181
182
183extern int xlog_debug; /* set to 1 to enable real log */
184 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
185 198
199#endif
186#endif /* __XFS_LOG_H__ */ 200#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..7e206fc1fa36
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,780 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_mount.h"
30#include "xfs_error.h"
31#include "xfs_alloc.h"
32
33/*
34 * Perform initial CIL structure initialisation. If the CIL is not
35 * enabled in this filesystem, ensure the log->l_cilp is null so
36 * we can check this conditional to determine if we are doing delayed
37 * logging or not.
38 */
39int
40xlog_cil_init(
41 struct log *log)
42{
43 struct xfs_cil *cil;
44 struct xfs_cil_ctx *ctx;
45
46 log->l_cilp = NULL;
47 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
48 return 0;
49
50 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
51 if (!cil)
52 return ENOMEM;
53
54 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
55 if (!ctx) {
56 kmem_free(cil);
57 return ENOMEM;
58 }
59
60 INIT_LIST_HEAD(&cil->xc_cil);
61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
65
66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents);
68 ctx->sequence = 1;
69 ctx->cil = cil;
70 cil->xc_ctx = ctx;
71 cil->xc_current_sequence = ctx->sequence;
72
73 cil->xc_log = log;
74 log->l_cilp = cil;
75 return 0;
76}
77
78void
79xlog_cil_destroy(
80 struct log *log)
81{
82 if (!log->l_cilp)
83 return;
84
85 if (log->l_cilp->xc_ctx) {
86 if (log->l_cilp->xc_ctx->ticket)
87 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
88 kmem_free(log->l_cilp->xc_ctx);
89 }
90
91 ASSERT(list_empty(&log->l_cilp->xc_cil));
92 kmem_free(log->l_cilp);
93}
94
95/*
96 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
97 * recover, so we don't allow failure here. Also, we allocate in a context that
98 * we don't want to be issuing transactions from, so we need to tell the
99 * allocation code this as well.
100 *
101 * We don't reserve any space for the ticket - we are going to steal whatever
102 * space we require from transactions as they commit. To ensure we reserve all
103 * the space required, we need to set the current reservation of the ticket to
104 * zero so that we know to steal the initial transaction overhead from the
105 * first transaction commit.
106 */
107static struct xlog_ticket *
108xlog_cil_ticket_alloc(
109 struct log *log)
110{
111 struct xlog_ticket *tic;
112
113 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
114 KM_SLEEP|KM_NOFS);
115 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
116
117 /*
118 * set the current reservation to zero so we know to steal the basic
119 * transaction overhead reservation from the first transaction commit.
120 */
121 tic->t_curr_res = 0;
122 return tic;
123}
124
125/*
126 * After the first stage of log recovery is done, we know where the head and
127 * tail of the log are. We need this log initialisation done before we can
128 * initialise the first CIL checkpoint context.
129 *
130 * Here we allocate a log ticket to track space usage during a CIL push. This
131 * ticket is passed to xlog_write() directly so that we don't slowly leak log
132 * space by failing to account for space used by log headers and additional
133 * region headers for split regions.
134 */
135void
136xlog_cil_init_post_recovery(
137 struct log *log)
138{
139 if (!log->l_cilp)
140 return;
141
142 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
143 log->l_cilp->xc_ctx->sequence = 1;
144 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
145 log->l_curr_block);
146}
147
148/*
149 * Insert the log item into the CIL and calculate the difference in space
150 * consumed by the item. Add the space to the checkpoint ticket and calculate
151 * if the change requires additional log metadata. If it does, take that space
152 * as well. Remove the amount of space we added to the checkpoint ticket from
153 * the current transaction ticket so that the accounting works out correctly.
154 *
155 * If this is the first time the item is being placed into the CIL in this
156 * context, pin it so it can't be written to disk until the CIL is flushed to
157 * the iclog and the iclog written to disk.
158 */
159static void
160xlog_cil_insert(
161 struct log *log,
162 struct xlog_ticket *ticket,
163 struct xfs_log_item *item,
164 struct xfs_log_vec *lv)
165{
166 struct xfs_cil *cil = log->l_cilp;
167 struct xfs_log_vec *old = lv->lv_item->li_lv;
168 struct xfs_cil_ctx *ctx = cil->xc_ctx;
169 int len;
170 int diff_iovecs;
171 int iclog_space;
172
173 if (old) {
174 /* existing lv on log item, space used is a delta */
175 ASSERT(!list_empty(&item->li_cil));
176 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
177
178 len = lv->lv_buf_len - old->lv_buf_len;
179 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
180 kmem_free(old->lv_buf);
181 kmem_free(old);
182 } else {
183 /* new lv, must pin the log item */
184 ASSERT(!lv->lv_item->li_lv);
185 ASSERT(list_empty(&item->li_cil));
186
187 len = lv->lv_buf_len;
188 diff_iovecs = lv->lv_niovecs;
189 IOP_PIN(lv->lv_item);
190
191 }
192 len += diff_iovecs * sizeof(xlog_op_header_t);
193
194 /* attach new log vector to log item */
195 lv->lv_item->li_lv = lv;
196
197 spin_lock(&cil->xc_cil_lock);
198 list_move_tail(&item->li_cil, &cil->xc_cil);
199 ctx->nvecs += diff_iovecs;
200
201 /*
202 * If this is the first time the item is being committed to the CIL,
203 * store the sequence number on the log item so we can tell
204 * in future commits whether this is the first checkpoint the item is
205 * being committed into.
206 */
207 if (!item->li_seq)
208 item->li_seq = ctx->sequence;
209
210 /*
211 * Now transfer enough transaction reservation to the context ticket
212 * for the checkpoint. The context ticket is special - the unit
213 * reservation has to grow as well as the current reservation as we
214 * steal from tickets so we can correctly determine the space used
215 * during the transaction commit.
216 */
217 if (ctx->ticket->t_curr_res == 0) {
218 /* first commit in checkpoint, steal the header reservation */
219 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
220 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
221 ticket->t_curr_res -= ctx->ticket->t_unit_res;
222 }
223
224 /* do we need space for more log record headers? */
225 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
226 if (len > 0 && (ctx->space_used / iclog_space !=
227 (ctx->space_used + len) / iclog_space)) {
228 int hdrs;
229
230 hdrs = (len + iclog_space - 1) / iclog_space;
231 /* need to take into account split region headers, too */
232 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
233 ctx->ticket->t_unit_res += hdrs;
234 ctx->ticket->t_curr_res += hdrs;
235 ticket->t_curr_res -= hdrs;
236 ASSERT(ticket->t_curr_res >= len);
237 }
238 ticket->t_curr_res -= len;
239 ctx->space_used += len;
240
241 spin_unlock(&cil->xc_cil_lock);
242}
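
Reservation stealing is the heart of delayed logging: the checkpoint ticket is allocated with t_curr_res == 0 (see xlog_cil_ticket_alloc() above), and the first transaction to commit donates the checkpoint's unit reservation before paying for its own bytes. A standalone model of that transfer with hypothetical values; the split-header top-ups are omitted:

#include <stdio.h>
#include <assert.h>

struct ticket { int unit_res, curr_res; };

/* move enough reservation from a committing transaction to the context */
static void steal(struct ticket *ctx_tic, struct ticket *trans_tic, int len)
{
	if (ctx_tic->curr_res == 0) {
		/* first commit in checkpoint: steal the header overhead */
		assert(trans_tic->curr_res >= ctx_tic->unit_res + len);
		ctx_tic->curr_res = ctx_tic->unit_res;
		trans_tic->curr_res -= ctx_tic->unit_res;
	}
	trans_tic->curr_res -= len;	/* every commit pays for its bytes */
}

int main(void)
{
	struct ticket ctx = { .unit_res = 4096, .curr_res = 0 };
	struct ticket t1 = { .curr_res = 8192 }, t2 = { .curr_res = 8192 };

	steal(&ctx, &t1, 1024);	/* pays 4096 overhead + 1024 */
	steal(&ctx, &t2, 1024);	/* pays only its own 1024 */
	printf("ctx=%d t1=%d t2=%d\n", ctx.curr_res, t1.curr_res, t2.curr_res);
	/* ctx=4096 t1=3072 t2=7168 */
	return 0;
}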
243
244/*
245 * Format log items into flat buffers
246 *
247 * For delayed logging, we need to hold a formatted buffer containing all the
248 * changes on the log item. This enables us to relog the item in memory and
249 * write it out asynchronously without needing to relock the object that was
250 * modified at the time it gets written into the iclog.
251 *
252 * This function builds a vector for the changes in each log item in the
253 * transaction. It then works out the length of the buffer needed for each log
254 * item, allocates them and formats the vector for the item into the buffer.
255 * The buffer is then attached to the log item and inserted into the
256 * Committed Item List for tracking until the next checkpoint is written out.
257 *
258 * We don't set up region headers during this process; we simply copy the
259 * regions into the flat buffer. We can do this because we still have to do a
260 * formatting step to write the regions into the iclog buffer. Writing the
261 * ophdrs during the iclog write means that we can support splitting large
262 * regions across iclog boundaries without needing a change in the format of the
263 * item/region encapsulation.
264 *
265 * Hence what we need to do now is rewrite the vector array to point
266 * to the copied region inside the buffer we just allocated. This allows us to
267 * format the regions into the iclog as though they are being formatted
268 * directly out of the objects themselves.
269 */
270static void
271xlog_cil_format_items(
272 struct log *log,
273 struct xfs_log_vec *log_vector)
274{
275 struct xfs_log_vec *lv;
276
277 ASSERT(log_vector);
278 for (lv = log_vector; lv; lv = lv->lv_next) {
279 void *ptr;
280 int index;
281 int len = 0;
282
283 /* build the vector array and calculate its length */
284 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
285 for (index = 0; index < lv->lv_niovecs; index++)
286 len += lv->lv_iovecp[index].i_len;
287
288 lv->lv_buf_len = len;
289 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
290 ptr = lv->lv_buf;
291
292 for (index = 0; index < lv->lv_niovecs; index++) {
293 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
294
295 memcpy(ptr, vec->i_addr, vec->i_len);
296 vec->i_addr = ptr;
297 ptr += vec->i_len;
298 }
299 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
300 }
301}
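
The formatting step models cleanly in user space: copy each region into one flat allocation and repoint the iovec at its copy, after which the original objects can be relogged without holding their locks. A hedged sketch with malloc standing in for kmem_zalloc and two hypothetical regions:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct iovec_s { void *i_addr; int i_len; };

/* repoint each iovec at its copy inside one flat buffer */
static char *flatten(struct iovec_s *vec, int nvecs, int *buf_len)
{
	int len = 0;
	for (int i = 0; i < nvecs; i++)
		len += vec[i].i_len;

	char *buf = calloc(1, len);
	char *ptr = buf;
	for (int i = 0; i < nvecs; i++) {
		memcpy(ptr, vec[i].i_addr, vec[i].i_len);
		vec[i].i_addr = ptr;		/* now points into buf */
		ptr += vec[i].i_len;
	}
	*buf_len = len;
	return buf;
}

int main(void)
{
	char a[] = "inode-core", b[] = "data-fork";
	struct iovec_s vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	int len;

	char *buf = flatten(vec, 2, &len);
	/* the originals can now be modified/relogged safely */
	printf("%d bytes, region 1 = \"%s\"\n", len, (char *)vec[1].i_addr);
	free(buf);
	return 0;
}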
302
303static void
304xlog_cil_insert_items(
305 struct log *log,
306 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket,
308 xfs_lsn_t *start_lsn)
309{
310 struct xfs_log_vec *lv;
311
312 if (start_lsn)
313 *start_lsn = log->l_cilp->xc_ctx->sequence;
314
315 ASSERT(log_vector);
316 for (lv = log_vector; lv; lv = lv->lv_next)
317 xlog_cil_insert(log, ticket, lv->lv_item, lv);
318}
319
320static void
321xlog_cil_free_logvec(
322 struct xfs_log_vec *log_vector)
323{
324 struct xfs_log_vec *lv;
325
326 for (lv = log_vector; lv; ) {
327 struct xfs_log_vec *next = lv->lv_next;
328 kmem_free(lv->lv_buf);
329 kmem_free(lv);
330 lv = next;
331 }
332}
333
334/*
335 * Mark all items committed and clear busy extents. We free the log vector
336 * chains in a separate pass so that we unpin the log items as quickly as
337 * possible.
338 */
339static void
340xlog_cil_committed(
341 void *args,
342 int abort)
343{
344 struct xfs_cil_ctx *ctx = args;
345 struct xfs_log_vec *lv;
346 int abortflag = abort ? XFS_LI_ABORTED : 0;
347 struct xfs_busy_extent *busyp, *n;
348
349 /* unpin all the log items */
350 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
351 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
352 abortflag);
353 }
354
355 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
356 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
357
358 spin_lock(&ctx->cil->xc_cil_lock);
359 list_del(&ctx->committing);
360 spin_unlock(&ctx->cil->xc_cil_lock);
361
362 xlog_cil_free_logvec(ctx->lv_chain);
363 kmem_free(ctx);
364}
365
366/*
367 * Push the Committed Item List to the log. If @push_seq is zero, then it
368 * is a background flush and so we can choose to ignore it. Otherwise, if the
369 * current sequence is the same as @push_seq we need to do a flush. If
370 * @push_seq is less than the current sequence, then it has already been
371 * flushed and we don't need to do anything - the caller will wait for it to
372 * complete if necessary.
373 *
374 * @push_seq is a value rather than a flag because that allows us to do an
375 * unlocked check of the sequence number for a match. Hence we can allow log
376 * forces to run racily and not issue pushes for the same sequence twice. If we
377 * get a race between multiple pushes for the same sequence they will block on
378 * the first one and then abort, hence avoiding needless pushes.
379 */
380STATIC int
381xlog_cil_push(
382 struct log *log,
383 xfs_lsn_t push_seq)
384{
385 struct xfs_cil *cil = log->l_cilp;
386 struct xfs_log_vec *lv;
387 struct xfs_cil_ctx *ctx;
388 struct xfs_cil_ctx *new_ctx;
389 struct xlog_in_core *commit_iclog;
390 struct xlog_ticket *tic;
391 int num_lv;
392 int num_iovecs;
393 int len;
394 int error = 0;
395 struct xfs_trans_header thdr;
396 struct xfs_log_iovec lhdr;
397 struct xfs_log_vec lvhdr = { NULL };
398 xfs_lsn_t commit_lsn;
399
400 if (!cil)
401 return 0;
402
403 ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
404
405 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
406 new_ctx->ticket = xlog_cil_ticket_alloc(log);
407
408 /*
409 * Lock out transaction commit, but don't block for background pushes
410 * unless we are well over the CIL space limit. See the definition of
411 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
412 * used here.
413 */
414 if (!down_write_trylock(&cil->xc_ctx_lock)) {
415 if (!push_seq &&
416 cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
417 goto out_free_ticket;
418 down_write(&cil->xc_ctx_lock);
419 }
420 ctx = cil->xc_ctx;
421
422 /* check if we have anything to push */
423 if (list_empty(&cil->xc_cil))
424 goto out_skip;
425
426 /* check for spurious background flush */
427 if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
428 goto out_skip;
429
430 /* check for a previously pushed sequence */
431 if (push_seq && push_seq < cil->xc_ctx->sequence)
432 goto out_skip;
433
434 /*
435 * pull all the log vectors off the items in the CIL, and
436 * remove the items from the CIL. We don't need the CIL lock
437 * here because it's only needed on the transaction commit
438 * side which is currently locked out by the flush lock.
439 */
440 lv = NULL;
441 num_lv = 0;
442 num_iovecs = 0;
443 len = 0;
444 while (!list_empty(&cil->xc_cil)) {
445 struct xfs_log_item *item;
446 int i;
447
448 item = list_first_entry(&cil->xc_cil,
449 struct xfs_log_item, li_cil);
450 list_del_init(&item->li_cil);
451 if (!ctx->lv_chain)
452 ctx->lv_chain = item->li_lv;
453 else
454 lv->lv_next = item->li_lv;
455 lv = item->li_lv;
456 item->li_lv = NULL;
457
458 num_lv++;
459 num_iovecs += lv->lv_niovecs;
460 for (i = 0; i < lv->lv_niovecs; i++)
461 len += lv->lv_iovecp[i].i_len;
462 }
463
464 /*
465 * initialise the new context and attach it to the CIL. Then attach
466 * the current context to the CIL committing list so it can be found
467 * during log forces to extract the commit lsn of the sequence that
468 * needs to be forced.
469 */
470 INIT_LIST_HEAD(&new_ctx->committing);
471 INIT_LIST_HEAD(&new_ctx->busy_extents);
472 new_ctx->sequence = ctx->sequence + 1;
473 new_ctx->cil = cil;
474 cil->xc_ctx = new_ctx;
475
476 /*
477 * mirror the new sequence into the cil structure so that we can do
478 * unlocked checks against the current sequence in log forces without
479 * risking dereferencing a freed context pointer.
480 */
481 cil->xc_current_sequence = new_ctx->sequence;
482
483 /*
484 * The switch is now done, so we can drop the context lock and move out
485 * of a shared context. We can't just go straight to the commit record,
486 * though - we need to synchronise with previous and future commits so
487 * that the commit records are correctly ordered in the log to ensure
488 * that we process items during log IO completion in the correct order.
489 *
490 * For example, if we get an EFI in one checkpoint and the EFD in the
491 * next (e.g. due to log forces), we do not want the checkpoint with
492 * the EFD to be committed before the checkpoint with the EFI. Hence
493 * we must strictly order the commit records of the checkpoints so
494 * that: a) the checkpoint callbacks are attached to the iclogs in the
495 * correct order; and b) the checkpoints are replayed in correct order
496 * in log recovery.
497 *
498 * Hence we need to add this context to the committing context list so
499 * that higher sequences will wait for us to write out a commit record
500 * before they do.
501 */
502 spin_lock(&cil->xc_cil_lock);
503 list_add(&ctx->committing, &cil->xc_committing);
504 spin_unlock(&cil->xc_cil_lock);
505 up_write(&cil->xc_ctx_lock);
506
507 /*
508 * Build a checkpoint transaction header and write it to the log to
509 * begin the transaction. We need to account for the space used by the
510 * transaction header here as it is not accounted for in xlog_write().
511 *
512 * The LSN we need to pass to the log items on transaction commit is
513 * the LSN reported by the first log vector write. If we use the commit
514 * record lsn then we can move the tail beyond the grant write head.
515 */
516 tic = ctx->ticket;
517 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
518 thdr.th_type = XFS_TRANS_CHECKPOINT;
519 thdr.th_tid = tic->t_tid;
520 thdr.th_num_items = num_iovecs;
521 lhdr.i_addr = &thdr;
522 lhdr.i_len = sizeof(xfs_trans_header_t);
523 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
524 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
525
526 lvhdr.lv_niovecs = 1;
527 lvhdr.lv_iovecp = &lhdr;
528 lvhdr.lv_next = ctx->lv_chain;
529
530 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
531 if (error)
532 goto out_abort;
533
534 /*
535 * now that we've written the checkpoint into the log, strictly
536 * order the commit records so replay will get them in the right order.
537 */
538restart:
539 spin_lock(&cil->xc_cil_lock);
540 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
541 /*
542 * Higher sequences will wait for this one so skip them.
543 * Don't wait for our own sequence, either.
544 */
545 if (new_ctx->sequence >= ctx->sequence)
546 continue;
547 if (!new_ctx->commit_lsn) {
548 /*
549 * It is still being pushed! Wait for the push to
550 * complete, then start again from the beginning.
551 */
552 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
553 goto restart;
554 }
555 }
556 spin_unlock(&cil->xc_cil_lock);
557
558 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
559 if (error || commit_lsn == -1)
560 goto out_abort;
561
562 /* attach all the transactions w/ busy extents to iclog */
563 ctx->log_cb.cb_func = xlog_cil_committed;
564 ctx->log_cb.cb_arg = ctx;
565 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
566 if (error)
567 goto out_abort;
568
569 /*
570 * now the checkpoint commit is complete and we've attached the
571 * callbacks to the iclog we can assign the commit LSN to the context
572 * and wake up anyone who is waiting for the commit to complete.
573 */
574 spin_lock(&cil->xc_cil_lock);
575 ctx->commit_lsn = commit_lsn;
576 sv_broadcast(&cil->xc_commit_wait);
577 spin_unlock(&cil->xc_cil_lock);
578
579 /* release the hounds! */
580 return xfs_log_release_iclog(log->l_mp, commit_iclog);
581
582out_skip:
583 up_write(&cil->xc_ctx_lock);
584out_free_ticket:
585 xfs_log_ticket_put(new_ctx->ticket);
586 kmem_free(new_ctx);
587 return 0;
588
589out_abort:
590 xlog_cil_committed(ctx, XFS_LI_ABORTED);
591 return XFS_ERROR(EIO);
592}
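
The early-exit checks at the top of xlog_cil_push() encode the @push_seq semantics described in the comment above the function. A standalone model of just that decision logic, with hypothetical sequence numbers and byte counts:

#include <stdio.h>

/* model of the push_seq checks at the top of xlog_cil_push() */
static const char *push_decision(long push_seq, long current_seq,
				 int cil_empty, int space_used, int limit)
{
	if (cil_empty)
		return "skip: nothing to push";
	if (!push_seq && space_used < limit)
		return "skip: spurious background flush";
	if (push_seq && push_seq < current_seq)
		return "skip: sequence already pushed";
	return "push";
}

int main(void)
{
	printf("%s\n", push_decision(0, 5, 0, 100, 4096)); /* small background */
	printf("%s\n", push_decision(4, 5, 0, 100, 4096)); /* already pushed */
	printf("%s\n", push_decision(5, 5, 0, 100, 4096)); /* forced: push */
	return 0;
}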
593
594/*
595 * Commit a transaction with the given vector to the Committed Item List.
596 *
597 * To do this, we need to format the item, pin it in memory if required and
598 * account for the space used by the transaction. Once we have done that we
599 * need to release the unused reservation for the transaction, attach the
600 * transaction to the checkpoint context so we carry the busy extents through
601 * to checkpoint completion, and then unlock all the items in the transaction.
602 *
603 * For more specific information about the order of operations in
604 * xfs_log_commit_cil() please refer to the comments in
605 * xfs_trans_commit_iclog().
606 *
607 * Called with the context lock already held in read mode to lock out
608 * background commit, returns without it held once background commits are
609 * allowed again.
610 */
611int
612xfs_log_commit_cil(
613 struct xfs_mount *mp,
614 struct xfs_trans *tp,
615 struct xfs_log_vec *log_vector,
616 xfs_lsn_t *commit_lsn,
617 int flags)
618{
619 struct log *log = mp->m_log;
620 int log_flags = 0;
621 int push = 0;
622
623 if (flags & XFS_TRANS_RELEASE_LOG_RES)
624 log_flags = XFS_LOG_REL_PERM_RESERV;
625
626 if (XLOG_FORCED_SHUTDOWN(log)) {
627 xlog_cil_free_logvec(log_vector);
628 return XFS_ERROR(EIO);
629 }
630
631 /*
632 * do all the hard work of formatting items (including memory
633 * allocation) outside the CIL context lock. This prevents stalling CIL
634 * pushes when we are low on memory and a transaction commit spends a
635 * lot of time in memory reclaim.
636 */
637 xlog_cil_format_items(log, log_vector);
638
639 /* lock out background commit */
640 down_read(&log->l_cilp->xc_ctx_lock);
641 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
642
643 /* check we didn't blow the reservation */
644 if (tp->t_ticket->t_curr_res < 0)
645 xlog_print_tic_res(log->l_mp, tp->t_ticket);
646
647 /* attach the transaction to the CIL if it has any busy extents */
648 if (!list_empty(&tp->t_busy)) {
649 spin_lock(&log->l_cilp->xc_cil_lock);
650 list_splice_init(&tp->t_busy,
651 &log->l_cilp->xc_ctx->busy_extents);
652 spin_unlock(&log->l_cilp->xc_cil_lock);
653 }
654
655 tp->t_commit_lsn = *commit_lsn;
656 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
657 xfs_trans_unreserve_and_mod_sb(tp);
658
659 /*
660 * Once all the items of the transaction have been copied to the CIL,
661 * the items can be unlocked and freed.
662 *
663 * This needs to be done before we drop the CIL context lock because we
664 * have to update state in the log items and unlock them before they go
665 * to disk. If we don't, then the CIL checkpoint can race with us and
666 * we can run checkpoint completion before we've updated and unlocked
667 * the log items. This affects (at least) processing of stale buffers,
668 * inodes and EFIs.
669 */
670 xfs_trans_free_items(tp, *commit_lsn, 0);
671
672 /* check for background commit before unlock */
673 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
674 push = 1;
675
676 up_read(&log->l_cilp->xc_ctx_lock);
677
678 /*
679 * We need to push CIL every so often so we don't cache more than we
680 * can fit in the log. The limit really is that a checkpoint can't be
681 * more than half the log (the current checkpoint is not allowed to
682 * overwrite the previous checkpoint), but commit latency and memory
683 * usage limit this to a smaller size in most cases.
684 */
685 if (push)
686 xlog_cil_push(log, 0);
687 return 0;
688}
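
The ordering in xfs_log_commit_cil() (format unlocked, insert under the shared context lock, push only after dropping it) maps directly onto a reader/writer lock. A user-space skeleton of that ordering, with a pthread rwlock standing in for the kernel rwsem and byte counts standing in for real log items:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t ctx_lock = PTHREAD_RWLOCK_INITIALIZER;
static int space_used, space_limit = 4096;

static void commit_cil(int len)
{
	/* 1. format items: slow, allocates memory - done unlocked */
	int push = 0;

	pthread_rwlock_rdlock(&ctx_lock);	/* lock out background push */
	space_used += len;			/* 2. insert into the CIL */
	if (space_used > space_limit)		/* 3. check before unlock */
		push = 1;
	pthread_rwlock_unlock(&ctx_lock);

	if (push)				/* 4. push takes the lock itself */
		printf("background push triggered at %d bytes\n", space_used);
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		commit_cil(1024);
	return 0;
}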
689
690/*
691 * Conditionally push the CIL based on the sequence passed in.
692 *
693 * We only need to push if we haven't already pushed the sequence
694 * number given. Hence the only time we will trigger a push here is
695 * if the push sequence is the same as the current context.
696 *
697 * We return the current commit lsn to allow the callers to determine if an
698 * iclog flush is necessary following this call.
699 *
700 * XXX: Initially, just push the CIL unconditionally and return whatever
701 * commit lsn is there. It'll be empty, so this is broken for now.
702 */
703xfs_lsn_t
704xlog_cil_force_lsn(
705 struct log *log,
706 xfs_lsn_t sequence)
707{
708 struct xfs_cil *cil = log->l_cilp;
709 struct xfs_cil_ctx *ctx;
710 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
711
712 ASSERT(sequence <= cil->xc_current_sequence);
713
714 /*
715 * check to see if we need to force out the current context.
716 * xlog_cil_push() handles racing pushes for the same sequence,
717 * so no need to deal with it here.
718 */
719 if (sequence == cil->xc_current_sequence)
720 xlog_cil_push(log, sequence);
721
722 /*
723 * See if we can find a previous sequence still committing.
724 * We need to wait for all previous sequence commits to complete
725 * before allowing the force of push_seq to go ahead. Hence block
726 * on commits for those as well.
727 */
728restart:
729 spin_lock(&cil->xc_cil_lock);
730 list_for_each_entry(ctx, &cil->xc_committing, committing) {
731 if (ctx->sequence > sequence)
732 continue;
733 if (!ctx->commit_lsn) {
734 /*
735 * It is still being pushed! Wait for the push to
736 * complete, then start again from the beginning.
737 */
738 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
739 goto restart;
740 }
741 if (ctx->sequence != sequence)
742 continue;
743 /* found it! */
744 commit_lsn = ctx->commit_lsn;
745 }
746 spin_unlock(&cil->xc_cil_lock);
747 return commit_lsn;
748}
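
The lookup loop above reduces to a scan of the committing list for a matching sequence. A simplified single-threaded model in which every context already has its commit LSN, so the sv_wait() retry never triggers (NULLCOMMITLSN is given a stand-in value here):

#include <stdio.h>

#define NULLCOMMITLSN 0	/* stand-in; the kernel uses its own sentinel */

struct ctx { long sequence; long commit_lsn; };

static long force_lsn(struct ctx *committing, int n, long sequence)
{
	long commit_lsn = NULLCOMMITLSN;

	for (int i = 0; i < n; i++) {
		if (committing[i].sequence != sequence)
			continue;
		commit_lsn = committing[i].commit_lsn;	/* found it */
	}
	return commit_lsn;
}

int main(void)
{
	struct ctx list[] = { { 3, 0x300 }, { 4, 0x400 } };

	printf("%#lx\n", force_lsn(list, 2, 4));	/* 0x400 */
	printf("%#lx\n", force_lsn(list, 2, 9));	/* 0: already on disk */
	return 0;
}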
749
750/*
751 * Check if the current log item was first committed in this sequence.
752 * We can't rely on just the log item being in the CIL, we have to check
753 * the recorded commit sequence number.
754 *
755 * Note: for this to be used in a non-racy manner, it has to be called with
756 * CIL flushing locked out. As a result, it should only be used during the
757 * transaction commit process when deciding what to format into the item.
758 */
759bool
760xfs_log_item_in_current_chkpt(
761 struct xfs_log_item *lip)
762{
763 struct xfs_cil_ctx *ctx;
764
765 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
766 return false;
767 if (list_empty(&lip->li_cil))
768 return false;
769
770 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
771
772 /*
773 * li_seq is written on the first commit of a log item to record the
774 * first checkpoint it is written to. Hence if it is different to the
775 * current sequence, we're in a new checkpoint.
776 */
777 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
778 return false;
779 return true;
780}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18facd5..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 154 shutdown */
155typedef __uint32_t xlog_tid_t;
156
157 155
158#ifdef __KERNEL__ 156#ifdef __KERNEL__
159/* 157/*
@@ -379,6 +377,105 @@ typedef struct xlog_in_core {
379} xlog_in_core_t; 377} xlog_in_core_t;
380 378
381/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well as to be
381 * passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
388 struct xfs_cil *cil;
389 xfs_lsn_t sequence; /* chkpt sequence # */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
392 struct xlog_ticket *ticket; /* chkpt ticket */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
400
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_force_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log;
419 struct list_head xc_cil;
420 spinlock_t xc_cil_lock;
421 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing;
424 sv_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence;
426};
427
428/*
429 * The amount of log space we allow the CIL to aggregate is difficult to size.
430 * Whatever we choose, we have to make sure we can get a reservation for the
431 * log space effectively, that it is large enough to capture sufficient
432 * relogging to reduce log buffer IO significantly, but it is not too large for
433 * the log or induces too much latency when writing out through the iclogs. We
434 * track both space consumed and the number of vectors in the checkpoint
435 * context, so we need to decide which to use for limiting.
436 *
437 * Every log buffer we write out during a push needs a header reserved, which
438 * is at least one sector and more for v2 logs. Hence we need a reservation of
439 * at least 512 bytes per 32k of log space just for the LR headers. That means
440 * 16KB of reservation per megabyte of delayed logging space we will consume,
 441 * plus various headers. The number of headers will vary based on the number
 442 * of I/O vectors, so limiting on a specific number of vectors is going to result
443 * in transactions of varying size. IOWs, it is more consistent to track and
444 * limit space consumed in the log rather than by the number of objects being
445 * logged in order to prevent checkpoint ticket overruns.
446 *
447 * Further, use of static reservations through the log grant mechanism is
448 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
449 * grant) and a significant deadlock potential because regranting write space
450 * can block on log pushes. Hence if we have to regrant log space during a log
451 * push, we can deadlock.
452 *
453 * However, we can avoid this by use of a dynamic "reservation stealing"
454 * technique during transaction commit whereby unused reservation space in the
455 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
456 * space needed by the checkpoint transaction. This means that we never need to
457 * specifically reserve space for the CIL checkpoint transaction, nor do we
458 * need to regrant space once the checkpoint completes. This also means the
459 * checkpoint transaction ticket is specific to the checkpoint context, rather
460 * than the CIL itself.
461 *
462 * With dynamic reservations, we can effectively make up arbitrary limits for
463 * the checkpoint size so long as they don't violate any other size rules.
464 * Recovery imposes a rule that no transaction exceed half the log, so we are
465 * limited by that. Furthermore, the log transaction reservation subsystem
466 * tries to keep 25% of the log free, so we need to keep below that limit or we
467 * risk running out of free log space to start any new transactions.
468 *
469 * In order to keep background CIL push efficient, we will set a lower
470 * threshold at which background pushing is attempted without blocking current
 471 * transaction commits. A separate, higher bound defines when CIL pushes are
 472 * enforced to ensure we stay within our maximum checkpoint size bounds,
 473 * yet gives us plenty of space for aggregation on large logs.
474 */
475#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
476#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
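
A quick evaluation of these two limits for a hypothetical 128MB log (the size is an assumption; the ratios are what matter): the soft limit is 1/8 of the log and the hard limit 3/16, keeping the largest checkpoint well below the half-log recovery rule and under the 25% free-space target.

#include <stdio.h>

/* Same arithmetic as the macros above, on an assumed log size. */
int main(void)
{
        long l_logsize = 128L << 20;        /* assume a 128MB log */
        long soft = l_logsize >> 3;         /* XLOG_CIL_SPACE_LIMIT */
        long hard = 3 * (l_logsize >> 4);   /* XLOG_CIL_HARD_SPACE_LIMIT */

        /* 16MB triggers a background push; 24MB blocks committers. */
        printf("soft=%ldMB hard=%ldMB\n", soft >> 20, hard >> 20);
        return 0;
}
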
477
478/*
382 * The reservation head lsn is not made up of a cycle number and block number. 479 * The reservation head lsn is not made up of a cycle number and block number.
383 * Instead, it uses a cycle number and byte number. Logs don't expect to 480 * Instead, it uses a cycle number and byte number. Logs don't expect to
384 * overflow 31 bits worth of byte offset, so using a byte number will mean 481 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +485,7 @@ typedef struct log {
388 /* The following fields don't need locking */ 485 /* The following fields don't need locking */
389 struct xfs_mount *l_mp; /* mount point */ 486 struct xfs_mount *l_mp; /* mount point */
390 struct xfs_ail *l_ailp; /* AIL log is working with */ 487 struct xfs_ail *l_ailp; /* AIL log is working with */
488 struct xfs_cil *l_cilp; /* CIL log is working with */
391 struct xfs_buf *l_xbuf; /* extra buffer for log 489 struct xfs_buf *l_xbuf; /* extra buffer for log
392 * wrapping */ 490 * wrapping */
393 struct xfs_buftarg *l_targ; /* buftarg of log */ 491 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -396,9 +494,7 @@ typedef struct log {
396 struct xfs_buf_cancel **l_buf_cancel_table; 494 struct xfs_buf_cancel **l_buf_cancel_table;
397 int l_iclog_hsize; /* size of iclog header */ 495 int l_iclog_hsize; /* size of iclog header */
398 int l_iclog_heads; /* # of iclog header sectors */ 496 int l_iclog_heads; /* # of iclog header sectors */
399 uint l_sectbb_log; /* log2 of sector size in BBs */ 497 uint l_sectBBsize; /* sector size in BBs (2^n) */
400 uint l_sectbb_mask; /* sector size (in BBs)
401 * alignment mask */
402 int l_iclog_size; /* size of log in bytes */ 498 int l_iclog_size; /* size of log in bytes */
403 int l_iclog_size_log; /* log power size of log */ 499 int l_iclog_size_log; /* log power size of log */
404 int l_iclog_bufs; /* number of iclog buffers */ 500 int l_iclog_bufs; /* number of iclog buffers */
@@ -440,14 +536,48 @@ typedef struct log {
440 536
441#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
442 538
443
444/* common routines */ 539/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 540extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_recover(xlog_t *log); 541extern int xlog_recover(xlog_t *log);
447extern int xlog_recover_finish(xlog_t *log); 542extern int xlog_recover_finish(xlog_t *log);
448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 543extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
449 544
450extern kmem_zone_t *xfs_log_ticket_zone; 545extern kmem_zone_t *xfs_log_ticket_zone;
546struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
547 int count, char client, uint xflags,
548 int alloc_flags);
549
550
551static inline void
552xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
553{
554 *ptr += bytes;
555 *len -= bytes;
556 *off += bytes;
557}
558
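
xlog_write_adv_cnt() just keeps three cursors in lock-step while regions are copied into an iclog: the write pointer, the space remaining, and the byte count charged to the ticket. A small usage sketch with plain char buffers standing in for real log vectors:

#include <stdio.h>
#include <string.h>

/* Same bookkeeping as xlog_write_adv_cnt(), on a char cursor. */
static void adv_cnt(char **ptr, int *len, int *off, size_t bytes)
{
        *ptr += bytes;          /* advance the write cursor */
        *len -= bytes;          /* shrink the space remaining */
        *off += bytes;          /* grow the ticket's byte count */
}

int main(void)
{
        char iclog[64], *ptr = iclog;
        int len = sizeof(iclog), off = 0;
        const char *regions[] = { "hdr", "payload" };

        for (int i = 0; i < 2; i++) {
                size_t n = strlen(regions[i]);
                memcpy(ptr, regions[i], n);
                adv_cnt(&ptr, &len, &off, n);
        }
        printf("wrote %d bytes, %d left\n", off, len);  /* 10, 54 */
        return 0;
}
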
559void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
560int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
561 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
562 xlog_in_core_t **commit_iclog, uint flags);
563
564/*
565 * Committed Item List interfaces
566 */
567int xlog_cil_init(struct log *log);
568void xlog_cil_init_post_recovery(struct log *log);
569void xlog_cil_destroy(struct log *log);
570
571/*
572 * CIL force routines
573 */
574xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
575
576static inline void
577xlog_cil_force(struct log *log)
578{
579 xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
580}
451 581
452/* 582/*
453 * Unmount record type is used as a pseudo transaction type for the ticket. 583 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efdc17ea..6f3f5fa37acf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,15 +24,11 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_error.h" 28#include "xfs_error.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
@@ -56,33 +52,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
56#define xlog_recover_check_summary(log) 52#define xlog_recover_check_summary(log)
57#endif 53#endif
58 54
59
60/* 55/*
61 * Sector aligned buffer routines for buffer create/read/write/access 56 * Sector aligned buffer routines for buffer create/read/write/access
62 */ 57 */
63 58
64#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 59/*
 65 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 60 * Verify the given count of basic blocks is a valid number of blocks
66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 61 * to specify for an operation involving the given XFS log buffer.
67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 62 * Returns nonzero if the count is valid, 0 otherwise.
63 */
68 64
65static inline int
66xlog_buf_bbcount_valid(
67 xlog_t *log,
68 int bbcount)
69{
70 return bbcount > 0 && bbcount <= log->l_logBBsize;
71}
72
73/*
74 * Allocate a buffer to hold log data. The buffer needs to be able
75 * to map to a range of nbblks basic blocks at any valid (basic
76 * block) offset within the log.
77 */
69STATIC xfs_buf_t * 78STATIC xfs_buf_t *
70xlog_get_bp( 79xlog_get_bp(
71 xlog_t *log, 80 xlog_t *log,
72 int nbblks) 81 int nbblks)
73{ 82{
74 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 83 if (!xlog_buf_bbcount_valid(log, nbblks)) {
75 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 84 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
76 XFS_ERROR_REPORT("xlog_get_bp(1)", 85 nbblks);
77 XFS_ERRLEVEL_HIGH, log->l_mp); 86 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
78 return NULL; 87 return NULL;
79 } 88 }
80 89
81 if (log->l_sectbb_log) { 90 /*
82 if (nbblks > 1) 91 * We do log I/O in units of log sectors (a power-of-2
83 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 92 * multiple of the basic block size), so we round up the
 84 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 93 * requested size to accommodate the basic blocks required
85 } 94 * for complete log sectors.
95 *
96 * In addition, the buffer may be used for a non-sector-
97 * aligned block offset, in which case an I/O of the
98 * requested size could extend beyond the end of the
99 * buffer. If the requested size is only 1 basic block it
100 * will never straddle a sector boundary, so this won't be
101 * an issue. Nor will this be a problem if the log I/O is
102 * done in basic blocks (sector size 1). But otherwise we
103 * extend the buffer by one extra log sector to ensure
 104 * there's space to accommodate this possibility.
105 */
106 if (nbblks > 1 && log->l_sectBBsize > 1)
107 nbblks += log->l_sectBBsize;
108 nbblks = round_up(nbblks, log->l_sectBBsize);
109
86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 110 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
87} 111}
88 112
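
The rounding above, traced with assumed numbers (a log sector of 8 basic blocks, i.e. 4k sectors over 512-byte basic blocks): a 5-block request grows by one sector to cover a possibly misaligned start, then rounds up to whole sectors.

#include <stdio.h>

/* round_up() as in the kernel, for power-of-two y. */
#define round_up(x, y)   ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
        int sectBBsize = 8;     /* assumed sector size in basic blocks */
        int nbblks = 5;         /* requested basic blocks */

        if (nbblks > 1 && sectBBsize > 1)
                nbblks += sectBBsize;           /* 5 -> 13: misaligned start */
        nbblks = round_up(nbblks, sectBBsize);  /* 13 -> 16: whole sectors */

        printf("allocate %d basic blocks\n", nbblks);
        return 0;
}
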
@@ -93,6 +117,10 @@ xlog_put_bp(
93 xfs_buf_free(bp); 117 xfs_buf_free(bp);
94} 118}
95 119
120/*
121 * Return the address of the start of the given block number's data
122 * in a log buffer. The buffer covers a log sector-aligned region.
123 */
96STATIC xfs_caddr_t 124STATIC xfs_caddr_t
97xlog_align( 125xlog_align(
98 xlog_t *log, 126 xlog_t *log,
@@ -100,15 +128,10 @@ xlog_align(
100 int nbblks, 128 int nbblks,
101 xfs_buf_t *bp) 129 xfs_buf_t *bp)
102{ 130{
103 xfs_caddr_t ptr; 131 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
104 132
105 if (!log->l_sectbb_log) 133 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
106 return XFS_BUF_PTR(bp); 134 return XFS_BUF_PTR(bp) + BBTOB(offset);
107
108 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
109 ASSERT(XFS_BUF_SIZE(bp) >=
110 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
111 return ptr;
112} 135}
113 136
114 137
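
The matching offset arithmetic in xlog_align(), under the same assumed 8-basic-block sector: a read of block 13 comes back in a buffer that starts at block 8, so the caller's data sits 5 basic blocks (2560 bytes) in.

#include <stdio.h>

#define BBSHIFT 9                       /* 512-byte basic blocks */
#define BBTOB(bbs) ((bbs) << BBSHIFT)

int main(void)
{
        long sectBBsize = 8;            /* assumed sector size in BBs */
        long blk_no = 13;               /* block the caller asked for */

        /* Buffer starts at round_down(13, 8) == block 8, so the
         * requested block sits (13 & 7) == 5 basic blocks in. */
        long offset = blk_no & (sectBBsize - 1);
        printf("data at buffer + %ld bytes\n", BBTOB(offset)); /* 2560 */
        return 0;
}
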
@@ -124,21 +147,18 @@ xlog_bread_noalign(
124{ 147{
125 int error; 148 int error;
126 149
127 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 150 if (!xlog_buf_bbcount_valid(log, nbblks)) {
128 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 151 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
129 XFS_ERROR_REPORT("xlog_bread(1)", 152 nbblks);
130 XFS_ERRLEVEL_HIGH, log->l_mp); 153 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
131 return EFSCORRUPTED; 154 return EFSCORRUPTED;
132 } 155 }
133 156
134 if (log->l_sectbb_log) { 157 blk_no = round_down(blk_no, log->l_sectBBsize);
135 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 158 nbblks = round_up(nbblks, log->l_sectBBsize);
136 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
137 }
138 159
139 ASSERT(nbblks > 0); 160 ASSERT(nbblks > 0);
140 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 161 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
141 ASSERT(bp);
142 162
143 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 163 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
144 XFS_BUF_READ(bp); 164 XFS_BUF_READ(bp);
@@ -186,17 +206,15 @@ xlog_bwrite(
186{ 206{
187 int error; 207 int error;
188 208
189 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 209 if (!xlog_buf_bbcount_valid(log, nbblks)) {
190 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 210 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
191 XFS_ERROR_REPORT("xlog_bwrite(1)", 211 nbblks);
192 XFS_ERRLEVEL_HIGH, log->l_mp); 212 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
193 return EFSCORRUPTED; 213 return EFSCORRUPTED;
194 } 214 }
195 215
196 if (log->l_sectbb_log) { 216 blk_no = round_down(blk_no, log->l_sectBBsize);
197 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 217 nbblks = round_up(nbblks, log->l_sectBBsize);
198 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
199 }
200 218
201 ASSERT(nbblks > 0); 219 ASSERT(nbblks > 0);
202 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 220 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +345,38 @@ xlog_find_cycle_start(
327{ 345{
328 xfs_caddr_t offset; 346 xfs_caddr_t offset;
329 xfs_daddr_t mid_blk; 347 xfs_daddr_t mid_blk;
348 xfs_daddr_t end_blk;
330 uint mid_cycle; 349 uint mid_cycle;
331 int error; 350 int error;
332 351
333 mid_blk = BLK_AVG(first_blk, *last_blk); 352 end_blk = *last_blk;
334 while (mid_blk != first_blk && mid_blk != *last_blk) { 353 mid_blk = BLK_AVG(first_blk, end_blk);
354 while (mid_blk != first_blk && mid_blk != end_blk) {
335 error = xlog_bread(log, mid_blk, 1, bp, &offset); 355 error = xlog_bread(log, mid_blk, 1, bp, &offset);
336 if (error) 356 if (error)
337 return error; 357 return error;
338 mid_cycle = xlog_get_cycle(offset); 358 mid_cycle = xlog_get_cycle(offset);
339 if (mid_cycle == cycle) { 359 if (mid_cycle == cycle)
340 *last_blk = mid_blk; 360 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
341 /* last_half_cycle == mid_cycle */ 361 else
342 } else { 362 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
343 first_blk = mid_blk; 363 mid_blk = BLK_AVG(first_blk, end_blk);
344 /* first_half_cycle == mid_cycle */
345 }
346 mid_blk = BLK_AVG(first_blk, *last_blk);
347 } 364 }
348 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 365 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
349 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 366 (mid_blk == end_blk && mid_blk-1 == first_blk));
367
368 *last_blk = end_blk;
350 369
351 return 0; 370 return 0;
352} 371}
353 372
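
The loop above is a classic binary search over a single cycle-number transition. A standalone sketch with an array standing in for xlog_bread(), illustrative only:

#include <stdio.h>

/* Find the first index in [first, end] whose cycle number equals
 * 'cycle', assuming the values switch over exactly once - the same
 * invariant xlog_find_cycle_start() relies on. */
static int find_cycle_start(const unsigned *cycles, int first, int end,
                            unsigned cycle)
{
        int mid = (first + end) / 2;

        while (mid != first && mid != end) {
                if (cycles[mid] == cycle)
                        end = mid;      /* last_half_cycle == mid_cycle */
                else
                        first = mid;    /* first_half_cycle == mid_cycle */
                mid = (first + end) / 2;
        }
        return end;
}

int main(void)
{
        unsigned cycles[] = { 3, 3, 3, 3, 2, 2, 2, 2 };

        printf("%d\n", find_cycle_start(cycles, 0, 7, 2));      /* 4 */
        return 0;
}
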
354/* 373/*
355 * Check that the range of blocks does not contain the cycle number 374 * Check that a range of blocks does not contain stop_on_cycle_no.
356 * given. The scan needs to occur from front to back and the ptr into the 375 * Fill in *new_blk with the block offset where such a block is
357 * region must be updated since a later routine will need to perform another 376 * found, or with -1 (an invalid block number) if there is no such
358 * test. If the region is completely good, we end up returning the same 377 * block in the range. The scan needs to occur from front to back
359 * last block number. 378 * and the pointer into the region must be updated since a later
360 * 379 * routine will need to perform another test.
361 * Set blkno to -1 if we encounter no errors. This is an invalid block number
362 * since we don't ever expect logs to get this large.
363 */ 380 */
364STATIC int 381STATIC int
365xlog_find_verify_cycle( 382xlog_find_verify_cycle(
@@ -376,12 +393,16 @@ xlog_find_verify_cycle(
376 xfs_caddr_t buf = NULL; 393 xfs_caddr_t buf = NULL;
377 int error = 0; 394 int error = 0;
378 395
396 /*
397 * Greedily allocate a buffer big enough to handle the full
398 * range of basic blocks we'll be examining. If that fails,
399 * try a smaller size. We need to be able to read at least
400 * a log sector, or we're out of luck.
401 */
379 bufblks = 1 << ffs(nbblks); 402 bufblks = 1 << ffs(nbblks);
380
381 while (!(bp = xlog_get_bp(log, bufblks))) { 403 while (!(bp = xlog_get_bp(log, bufblks))) {
382 /* can't get enough memory to do everything in one big buffer */
383 bufblks >>= 1; 404 bufblks >>= 1;
384 if (bufblks <= log->l_sectbb_log) 405 if (bufblks < log->l_sectBBsize)
385 return ENOMEM; 406 return ENOMEM;
386 } 407 }
387 408
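
The allocation pattern here (and again in xlog_write_log_records() below) is a greedy retry: ask for the whole range, halve on failure, and give up once a single log sector no longer fits. A sketch with malloc standing in for xlog_get_bp() and the initial size picked by the caller:

#include <stdio.h>
#include <stdlib.h>

/* Halve the request until the allocation succeeds; give up once a
 * single log sector no longer fits (the kernel returns ENOMEM). */
static void *get_log_buf(int *bufblks, int sectBBsize)
{
        void *bp;

        while (!(bp = malloc((size_t)*bufblks * 512))) {
                *bufblks >>= 1;
                if (*bufblks < sectBBsize)
                        return NULL;
        }
        return bp;
}

int main(void)
{
        int bufblks = 1024;             /* start with the full range */
        void *bp = get_log_buf(&bufblks, 8);

        if (bp) {
                printf("got %d basic blocks\n", bufblks);
                free(bp);
        }
        return 0;
}
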
@@ -629,7 +650,7 @@ xlog_find_head(
629 * In this case we want to find the first block with cycle 650 * In this case we want to find the first block with cycle
630 * number matching last_half_cycle. We expect the log to be 651 * number matching last_half_cycle. We expect the log to be
631 * some variation on 652 * some variation on
632 * x + 1 ... | x ... 653 * x + 1 ... | x ... | x
633 * The first block with cycle number x (last_half_cycle) will 654 * The first block with cycle number x (last_half_cycle) will
634 * be where the new head belongs. First we do a binary search 655 * be where the new head belongs. First we do a binary search
635 * for the first occurrence of last_half_cycle. The binary 656 * for the first occurrence of last_half_cycle. The binary
@@ -639,11 +660,13 @@ xlog_find_head(
639 * the log, then we look for occurrences of last_half_cycle - 1 660 * the log, then we look for occurrences of last_half_cycle - 1
640 * at the end of the log. The cases we're looking for look 661 * at the end of the log. The cases we're looking for look
641 * like 662 * like
642 * x + 1 ... | x | x + 1 | x ... 663 * v binary search stopped here
643 * ^ binary search stopped here 664 * x + 1 ... | x | x + 1 | x ... | x
665 * ^ but we want to locate this spot
644 * or 666 * or
645 * x + 1 ... | x ... | x - 1 | x
646 * <---------> less than scan distance 667 * <---------> less than scan distance
668 * x + 1 ... | x ... | x - 1 | x
669 * ^ we want to locate this spot
647 */ 670 */
648 stop_on_cycle = last_half_cycle; 671 stop_on_cycle = last_half_cycle;
649 if ((error = xlog_find_cycle_start(log, bp, first_blk, 672 if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +722,16 @@ xlog_find_head(
699 * certainly not the head of the log. By searching for 722 * certainly not the head of the log. By searching for
700 * last_half_cycle-1 we accomplish that. 723 * last_half_cycle-1 we accomplish that.
701 */ 724 */
702 start_blk = log_bbnum - num_scan_bblks + head_blk;
703 ASSERT(head_blk <= INT_MAX && 725 ASSERT(head_blk <= INT_MAX &&
704 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 726 (xfs_daddr_t) num_scan_bblks >= head_blk);
727 start_blk = log_bbnum - (num_scan_bblks - head_blk);
705 if ((error = xlog_find_verify_cycle(log, start_blk, 728 if ((error = xlog_find_verify_cycle(log, start_blk,
706 num_scan_bblks - (int)head_blk, 729 num_scan_bblks - (int)head_blk,
707 (stop_on_cycle - 1), &new_blk))) 730 (stop_on_cycle - 1), &new_blk)))
708 goto bp_err; 731 goto bp_err;
709 if (new_blk != -1) { 732 if (new_blk != -1) {
710 head_blk = new_blk; 733 head_blk = new_blk;
711 goto bad_blk; 734 goto validate_head;
712 } 735 }
713 736
714 /* 737 /*
@@ -726,7 +749,7 @@ xlog_find_head(
726 head_blk = new_blk; 749 head_blk = new_blk;
727 } 750 }
728 751
729 bad_blk: 752validate_head:
730 /* 753 /*
731 * Now we need to make sure head_blk is not pointing to a block in 754 * Now we need to make sure head_blk is not pointing to a block in
732 * the middle of a log record. 755 * the middle of a log record.
@@ -748,7 +771,7 @@ xlog_find_head(
748 if ((error = xlog_find_verify_log_record(log, start_blk, 771 if ((error = xlog_find_verify_log_record(log, start_blk,
749 &head_blk, 0)) == -1) { 772 &head_blk, 0)) == -1) {
750 /* We hit the beginning of the log during our search */ 773 /* We hit the beginning of the log during our search */
751 start_blk = log_bbnum - num_scan_bblks + head_blk; 774 start_blk = log_bbnum - (num_scan_bblks - head_blk);
752 new_blk = log_bbnum; 775 new_blk = log_bbnum;
753 ASSERT(start_blk <= INT_MAX && 776 ASSERT(start_blk <= INT_MAX &&
754 (xfs_daddr_t) log_bbnum-start_blk >= 0); 777 (xfs_daddr_t) log_bbnum-start_blk >= 0);
@@ -833,12 +856,12 @@ xlog_find_tail(
833 if (*head_blk == 0) { /* special case */ 856 if (*head_blk == 0) { /* special case */
834 error = xlog_bread(log, 0, 1, bp, &offset); 857 error = xlog_bread(log, 0, 1, bp, &offset);
835 if (error) 858 if (error)
836 goto bread_err; 859 goto done;
837 860
838 if (xlog_get_cycle(offset) == 0) { 861 if (xlog_get_cycle(offset) == 0) {
839 *tail_blk = 0; 862 *tail_blk = 0;
840 /* leave all other log inited values alone */ 863 /* leave all other log inited values alone */
841 goto exit; 864 goto done;
842 } 865 }
843 } 866 }
844 867
@@ -849,7 +872,7 @@ xlog_find_tail(
849 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 872 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
850 error = xlog_bread(log, i, 1, bp, &offset); 873 error = xlog_bread(log, i, 1, bp, &offset);
851 if (error) 874 if (error)
852 goto bread_err; 875 goto done;
853 876
854 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 877 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
855 found = 1; 878 found = 1;
@@ -866,7 +889,7 @@ xlog_find_tail(
866 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 889 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
867 error = xlog_bread(log, i, 1, bp, &offset); 890 error = xlog_bread(log, i, 1, bp, &offset);
868 if (error) 891 if (error)
869 goto bread_err; 892 goto done;
870 893
871 if (XLOG_HEADER_MAGIC_NUM == 894 if (XLOG_HEADER_MAGIC_NUM ==
872 be32_to_cpu(*(__be32 *)offset)) { 895 be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +964,7 @@ xlog_find_tail(
941 umount_data_blk = (i + hblks) % log->l_logBBsize; 964 umount_data_blk = (i + hblks) % log->l_logBBsize;
942 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 965 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
943 if (error) 966 if (error)
944 goto bread_err; 967 goto done;
945 968
946 op_head = (xlog_op_header_t *)offset; 969 op_head = (xlog_op_header_t *)offset;
947 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 970 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1010,10 @@ xlog_find_tail(
987 * But... if the -device- itself is readonly, just skip this. 1010 * But... if the -device- itself is readonly, just skip this.
988 * We can't recover this device anyway, so it won't matter. 1011 * We can't recover this device anyway, so it won't matter.
989 */ 1012 */
990 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 1013 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
991 error = xlog_clear_stale_blocks(log, tail_lsn); 1014 error = xlog_clear_stale_blocks(log, tail_lsn);
992 }
993 1015
994bread_err: 1016done:
995exit:
996 xlog_put_bp(bp); 1017 xlog_put_bp(bp);
997 1018
998 if (error) 1019 if (error)
@@ -1152,16 +1173,22 @@ xlog_write_log_records(
1152 xfs_caddr_t offset; 1173 xfs_caddr_t offset;
1153 xfs_buf_t *bp; 1174 xfs_buf_t *bp;
1154 int balign, ealign; 1175 int balign, ealign;
1155 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1176 int sectbb = log->l_sectBBsize;
1156 int end_block = start_block + blocks; 1177 int end_block = start_block + blocks;
1157 int bufblks; 1178 int bufblks;
1158 int error = 0; 1179 int error = 0;
1159 int i, j = 0; 1180 int i, j = 0;
1160 1181
1182 /*
1183 * Greedily allocate a buffer big enough to handle the full
1184 * range of basic blocks to be written. If that fails, try
1185 * a smaller size. We need to be able to write at least a
1186 * log sector, or we're out of luck.
1187 */
1161 bufblks = 1 << ffs(blocks); 1188 bufblks = 1 << ffs(blocks);
1162 while (!(bp = xlog_get_bp(log, bufblks))) { 1189 while (!(bp = xlog_get_bp(log, bufblks))) {
1163 bufblks >>= 1; 1190 bufblks >>= 1;
1164 if (bufblks <= log->l_sectbb_log) 1191 if (bufblks < sectbb)
1165 return ENOMEM; 1192 return ENOMEM;
1166 } 1193 }
1167 1194
@@ -1169,7 +1196,7 @@ xlog_write_log_records(
1169 * the buffer in the starting sector not covered by the first 1196 * the buffer in the starting sector not covered by the first
1170 * write below. 1197 * write below.
1171 */ 1198 */
1172 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1199 balign = round_down(start_block, sectbb);
1173 if (balign != start_block) { 1200 if (balign != start_block) {
1174 error = xlog_bread_noalign(log, start_block, 1, bp); 1201 error = xlog_bread_noalign(log, start_block, 1, bp);
1175 if (error) 1202 if (error)
@@ -1188,7 +1215,7 @@ xlog_write_log_records(
1188 * the buffer in the final sector not covered by the write. 1215 * the buffer in the final sector not covered by the write.
1189 * If this is the same sector as the above read, skip it. 1216 * If this is the same sector as the above read, skip it.
1190 */ 1217 */
1191 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1218 ealign = round_down(end_block, sectbb);
1192 if (j == 0 && (start_block + endcount > ealign)) { 1219 if (j == 0 && (start_block + endcount > ealign)) {
1193 offset = XFS_BUF_PTR(bp); 1220 offset = XFS_BUF_PTR(bp);
1194 balign = BBTOB(ealign - start_block); 1221 balign = BBTOB(ealign - start_block);
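
The balign/ealign arithmetic above, worked through with assumed numbers (sector of 8 basic blocks, simplified to a single write): writing blocks 13..22 must not clobber the unlogged parts of the boundary sectors, so both are pre-read before the sector-aligned write.

#include <stdio.h>

#define round_down(x, y) ((x) & ~((y) - 1))     /* power-of-two y */

int main(void)
{
        int sectbb = 8;                         /* assumed sector size in BBs */
        int start_block = 13, blocks = 10;
        int end_block = start_block + blocks;   /* 23 */

        int balign = round_down(start_block, sectbb);   /* 8 */
        int ealign = round_down(end_block, sectbb);     /* 16 */

        if (balign != start_block)
                printf("pre-read head sector at block %d\n", balign);
        if (start_block + blocks > ealign)
                printf("pre-read tail sector at block %d\n", ealign);
        return 0;
}
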
@@ -1408,6 +1435,7 @@ xlog_recover_add_item(
1408 1435
1409STATIC int 1436STATIC int
1410xlog_recover_add_to_cont_trans( 1437xlog_recover_add_to_cont_trans(
1438 struct log *log,
1411 xlog_recover_t *trans, 1439 xlog_recover_t *trans,
1412 xfs_caddr_t dp, 1440 xfs_caddr_t dp,
1413 int len) 1441 int len)
@@ -1434,6 +1462,7 @@ xlog_recover_add_to_cont_trans(
1434 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1462 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1435 item->ri_buf[item->ri_cnt-1].i_len += len; 1463 item->ri_buf[item->ri_cnt-1].i_len += len;
1436 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1464 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1465 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1437 return 0; 1466 return 0;
1438} 1467}
1439 1468
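
Continuation records are reassembled by growing the item's last region and appending, the same realloc-and-memcpy pattern as above. A standalone sketch with a hypothetical region struct:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct region { char *addr; int len; };

/* Append 'len' bytes of continuation data to the last region, as
 * xlog_recover_add_to_cont_trans() does with its reallocation. */
static int add_cont(struct region *r, const char *dp, int len)
{
        char *ptr = realloc(r->addr, r->len + len);

        if (!ptr)
                return -1;
        memcpy(ptr + r->len, dp, len);  /* splice new bytes on the end */
        r->addr = ptr;
        r->len += len;
        return 0;
}

int main(void)
{
        struct region r = { 0 };

        add_cont(&r, "first-", 6);
        add_cont(&r, "second", 6);
        printf("%.*s (%d bytes)\n", r.len, r.addr, r.len);
        free(r.addr);
        return 0;
}
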
@@ -1452,6 +1481,7 @@ xlog_recover_add_to_cont_trans(
1452 */ 1481 */
1453STATIC int 1482STATIC int
1454xlog_recover_add_to_trans( 1483xlog_recover_add_to_trans(
1484 struct log *log,
1455 xlog_recover_t *trans, 1485 xlog_recover_t *trans,
1456 xfs_caddr_t dp, 1486 xfs_caddr_t dp,
1457 int len) 1487 int len)
@@ -1510,6 +1540,7 @@ xlog_recover_add_to_trans(
1510 item->ri_buf[item->ri_cnt].i_addr = ptr; 1540 item->ri_buf[item->ri_cnt].i_addr = ptr;
1511 item->ri_buf[item->ri_cnt].i_len = len; 1541 item->ri_buf[item->ri_cnt].i_len = len;
1512 item->ri_cnt++; 1542 item->ri_cnt++;
1543 trace_xfs_log_recover_item_add(log, trans, item, 0);
1513 return 0; 1544 return 0;
1514} 1545}
1515 1546
@@ -1521,20 +1552,22 @@ xlog_recover_add_to_trans(
1521 */ 1552 */
1522STATIC int 1553STATIC int
1523xlog_recover_reorder_trans( 1554xlog_recover_reorder_trans(
1524 xlog_recover_t *trans) 1555 struct log *log,
1556 xlog_recover_t *trans,
1557 int pass)
1525{ 1558{
1526 xlog_recover_item_t *item, *n; 1559 xlog_recover_item_t *item, *n;
1527 LIST_HEAD(sort_list); 1560 LIST_HEAD(sort_list);
1528 1561
1529 list_splice_init(&trans->r_itemq, &sort_list); 1562 list_splice_init(&trans->r_itemq, &sort_list);
1530 list_for_each_entry_safe(item, n, &sort_list, ri_list) { 1563 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1531 xfs_buf_log_format_t *buf_f; 1564 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1532
1533 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1534 1565
1535 switch (ITEM_TYPE(item)) { 1566 switch (ITEM_TYPE(item)) {
1536 case XFS_LI_BUF: 1567 case XFS_LI_BUF:
1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1568 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1569 trace_xfs_log_recover_item_reorder_head(log,
1570 trans, item, pass);
1538 list_move(&item->ri_list, &trans->r_itemq); 1571 list_move(&item->ri_list, &trans->r_itemq);
1539 break; 1572 break;
1540 } 1573 }
@@ -1543,6 +1576,8 @@ xlog_recover_reorder_trans(
1543 case XFS_LI_QUOTAOFF: 1576 case XFS_LI_QUOTAOFF:
1544 case XFS_LI_EFD: 1577 case XFS_LI_EFD:
1545 case XFS_LI_EFI: 1578 case XFS_LI_EFI:
1579 trace_xfs_log_recover_item_reorder_tail(log,
1580 trans, item, pass);
1546 list_move_tail(&item->ri_list, &trans->r_itemq); 1581 list_move_tail(&item->ri_list, &trans->r_itemq);
1547 break; 1582 break;
1548 default: 1583 default:
@@ -1592,8 +1627,10 @@ xlog_recover_do_buffer_pass1(
1592 /* 1627 /*
1593 * If this isn't a cancel buffer item, then just return. 1628 * If this isn't a cancel buffer item, then just return.
1594 */ 1629 */
1595 if (!(flags & XFS_BLI_CANCEL)) 1630 if (!(flags & XFS_BLF_CANCEL)) {
1631 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1596 return; 1632 return;
1633 }
1597 1634
1598 /* 1635 /*
1599 * Insert an xfs_buf_cancel record into the hash table of 1636 * Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1664,7 @@ xlog_recover_do_buffer_pass1(
1627 while (nextp != NULL) { 1664 while (nextp != NULL) {
1628 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1665 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1629 nextp->bc_refcount++; 1666 nextp->bc_refcount++;
1667 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1630 return; 1668 return;
1631 } 1669 }
1632 prevp = nextp; 1670 prevp = nextp;
@@ -1640,13 +1678,14 @@ xlog_recover_do_buffer_pass1(
1640 bcp->bc_refcount = 1; 1678 bcp->bc_refcount = 1;
1641 bcp->bc_next = NULL; 1679 bcp->bc_next = NULL;
1642 prevp->bc_next = bcp; 1680 prevp->bc_next = bcp;
1681 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1643} 1682}
1644 1683
1645/* 1684/*
1646 * Check to see whether the buffer being recovered has a corresponding 1685 * Check to see whether the buffer being recovered has a corresponding
1647 * entry in the buffer cancel record table. If it does then return 1 1686 * entry in the buffer cancel record table. If it does then return 1
1648 * so that it will be cancelled, otherwise return 0. If the buffer is 1687 * so that it will be cancelled, otherwise return 0. If the buffer is
1649 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1688 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1650 * the refcount on the entry in the table and remove it from the table 1689 * the refcount on the entry in the table and remove it from the table
1651 * if this is the last reference. 1690 * if this is the last reference.
1652 * 1691 *
@@ -1671,7 +1710,7 @@ xlog_check_buffer_cancelled(
1671 * There is nothing in the table built in pass one, 1710 * There is nothing in the table built in pass one,
1672 * so this buffer must not be cancelled. 1711 * so this buffer must not be cancelled.
1673 */ 1712 */
1674 ASSERT(!(flags & XFS_BLI_CANCEL)); 1713 ASSERT(!(flags & XFS_BLF_CANCEL));
1675 return 0; 1714 return 0;
1676 } 1715 }
1677 1716
@@ -1683,7 +1722,7 @@ xlog_check_buffer_cancelled(
1683 * There is no corresponding entry in the table built 1722 * There is no corresponding entry in the table built
1684 * in pass one, so this buffer has not been cancelled. 1723 * in pass one, so this buffer has not been cancelled.
1685 */ 1724 */
1686 ASSERT(!(flags & XFS_BLI_CANCEL)); 1725 ASSERT(!(flags & XFS_BLF_CANCEL));
1687 return 0; 1726 return 0;
1688 } 1727 }
1689 1728
@@ -1702,7 +1741,7 @@ xlog_check_buffer_cancelled(
1702 * one in the table and remove it if this is the 1741 * one in the table and remove it if this is the
1703 * last reference. 1742 * last reference.
1704 */ 1743 */
1705 if (flags & XFS_BLI_CANCEL) { 1744 if (flags & XFS_BLF_CANCEL) {
1706 bcp->bc_refcount--; 1745 bcp->bc_refcount--;
1707 if (bcp->bc_refcount == 0) { 1746 if (bcp->bc_refcount == 0) {
1708 if (prevp == NULL) { 1747 if (prevp == NULL) {
@@ -1722,7 +1761,7 @@ xlog_check_buffer_cancelled(
1722 * We didn't find a corresponding entry in the table, so 1761 * We didn't find a corresponding entry in the table, so
1723 * return 0 so that the buffer is NOT cancelled. 1762 * return 0 so that the buffer is NOT cancelled.
1724 */ 1763 */
1725 ASSERT(!(flags & XFS_BLI_CANCEL)); 1764 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0; 1765 return 0;
1727} 1766}
1728 1767
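
Pass 1 and pass 2 cooperate through a refcounted table keyed by (blkno, len). A compact sketch of that protocol, with a flat array standing in for the kernel's hash chains:

#include <stdio.h>

struct cancel { long blkno; int len; int refcount; };

static struct cancel table[64];
static int ncancel;

/* Pass 1: record every XFS_BLF_CANCEL buffer, bumping refcounts. */
static void cancel_add(long blkno, int len)
{
        for (int i = 0; i < ncancel; i++) {
                if (table[i].blkno == blkno && table[i].len == len) {
                        table[i].refcount++;
                        return;
                }
        }
        table[ncancel++] = (struct cancel){ blkno, len, 1 };
}

/* Pass 2: a matching entry means "do not replay"; a cancel item
 * itself also drops one reference as it streams past. */
static int buffer_cancelled(long blkno, int len, int is_cancel_item)
{
        for (int i = 0; i < ncancel; i++) {
                if (table[i].blkno == blkno && table[i].len == len) {
                        if (is_cancel_item && --table[i].refcount == 0)
                                table[i] = table[--ncancel];
                        return 1;
                }
        }
        return 0;
}

int main(void)
{
        cancel_add(100, 8);
        printf("%d\n", buffer_cancelled(100, 8, 0));    /* 1: skip replay */
        printf("%d\n", buffer_cancelled(200, 8, 0));    /* 0: replay */
        return 0;
}
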
@@ -1779,6 +1818,8 @@ xlog_recover_do_inode_buffer(
1779 unsigned int *data_map = NULL; 1818 unsigned int *data_map = NULL;
1780 unsigned int map_size = 0; 1819 unsigned int map_size = 0;
1781 1820
1821 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1822
1782 switch (buf_f->blf_type) { 1823 switch (buf_f->blf_type) {
1783 case XFS_LI_BUF: 1824 case XFS_LI_BUF:
1784 data_map = buf_f->blf_data_map; 1825 data_map = buf_f->blf_data_map;
@@ -1822,8 +1863,8 @@ xlog_recover_do_inode_buffer(
1822 nbits = xfs_contig_bits(data_map, map_size, 1863 nbits = xfs_contig_bits(data_map, map_size,
1823 bit); 1864 bit);
1824 ASSERT(nbits > 0); 1865 ASSERT(nbits > 0);
1825 reg_buf_offset = bit << XFS_BLI_SHIFT; 1866 reg_buf_offset = bit << XFS_BLF_SHIFT;
1826 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1867 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1827 item_index++; 1868 item_index++;
1828 } 1869 }
1829 1870
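
The shift arithmetic above maps the dirty bitmap back to byte ranges: XFS_BLF_CHUNK is 128 bytes, so XFS_BLF_SHIFT is 7, and a run of 2 set bits starting at bit 3 describes bytes 384..639 of the buffer. A quick check of that conversion:

#include <stdio.h>

#define XFS_BLF_SHIFT 7         /* log2 of the 128-byte logging chunk */

int main(void)
{
        int bit = 3, nbits = 2;         /* a run found by xfs_contig_bits() */

        int reg_buf_offset = bit << XFS_BLF_SHIFT;      /* 384 */
        int reg_buf_bytes  = nbits << XFS_BLF_SHIFT;    /* 256 */

        printf("bytes [%d, %d)\n", reg_buf_offset,
               reg_buf_offset + reg_buf_bytes);         /* [384, 640) */
        return 0;
}
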
@@ -1837,7 +1878,7 @@ xlog_recover_do_inode_buffer(
1837 } 1878 }
1838 1879
1839 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1880 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1840 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1881 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1841 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1882 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1842 1883
1843 /* 1884 /*
@@ -1845,9 +1886,8 @@ xlog_recover_do_inode_buffer(
1845 * current di_next_unlinked field. Extract its value 1886 * current di_next_unlinked field. Extract its value
1846 * and copy it to the buffer copy. 1887 * and copy it to the buffer copy.
1847 */ 1888 */
1848 logged_nextp = (xfs_agino_t *) 1889 logged_nextp = item->ri_buf[item_index].i_addr +
1849 ((char *)(item->ri_buf[item_index].i_addr) + 1890 next_unlinked_offset - reg_buf_offset;
1850 (next_unlinked_offset - reg_buf_offset));
1851 if (unlikely(*logged_nextp == 0)) { 1891 if (unlikely(*logged_nextp == 0)) {
1852 xfs_fs_cmn_err(CE_ALERT, mp, 1892 xfs_fs_cmn_err(CE_ALERT, mp,
1853 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1893 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
@@ -1874,6 +1914,7 @@ xlog_recover_do_inode_buffer(
1874/*ARGSUSED*/ 1914/*ARGSUSED*/
1875STATIC void 1915STATIC void
1876xlog_recover_do_reg_buffer( 1916xlog_recover_do_reg_buffer(
1917 struct xfs_mount *mp,
1877 xlog_recover_item_t *item, 1918 xlog_recover_item_t *item,
1878 xfs_buf_t *bp, 1919 xfs_buf_t *bp,
1879 xfs_buf_log_format_t *buf_f) 1920 xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1926,8 @@ xlog_recover_do_reg_buffer(
1885 unsigned int map_size = 0; 1926 unsigned int map_size = 0;
1886 int error; 1927 int error;
1887 1928
1929 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1930
1888 switch (buf_f->blf_type) { 1931 switch (buf_f->blf_type) {
1889 case XFS_LI_BUF: 1932 case XFS_LI_BUF:
1890 data_map = buf_f->blf_data_map; 1933 data_map = buf_f->blf_data_map;
@@ -1900,9 +1943,9 @@ xlog_recover_do_reg_buffer(
1900 nbits = xfs_contig_bits(data_map, map_size, bit); 1943 nbits = xfs_contig_bits(data_map, map_size, bit);
1901 ASSERT(nbits > 0); 1944 ASSERT(nbits > 0);
1902 ASSERT(item->ri_buf[i].i_addr != NULL); 1945 ASSERT(item->ri_buf[i].i_addr != NULL);
1903 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1946 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1904 ASSERT(XFS_BUF_COUNT(bp) >= 1947 ASSERT(XFS_BUF_COUNT(bp) >=
1905 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1948 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1906 1949
1907 /* 1950 /*
1908 * Do a sanity check if this is a dquot buffer. Just checking 1951 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1911,7 +1954,7 @@ xlog_recover_do_reg_buffer(
1911 */ 1954 */
1912 error = 0; 1955 error = 0;
1913 if (buf_f->blf_flags & 1956 if (buf_f->blf_flags &
1914 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1957 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1915 if (item->ri_buf[i].i_addr == NULL) { 1958 if (item->ri_buf[i].i_addr == NULL) {
1916 cmn_err(CE_ALERT, 1959 cmn_err(CE_ALERT,
1917 "XFS: NULL dquot in %s.", __func__); 1960 "XFS: NULL dquot in %s.", __func__);
@@ -1923,8 +1966,7 @@ xlog_recover_do_reg_buffer(
1923 item->ri_buf[i].i_len, __func__); 1966 item->ri_buf[i].i_len, __func__);
1924 goto next; 1967 goto next;
1925 } 1968 }
1926 error = xfs_qm_dqcheck((xfs_disk_dquot_t *) 1969 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr,
1927 item->ri_buf[i].i_addr,
1928 -1, 0, XFS_QMOPT_DOWARN, 1970 -1, 0, XFS_QMOPT_DOWARN,
1929 "dquot_buf_recover"); 1971 "dquot_buf_recover");
1930 if (error) 1972 if (error)
@@ -1932,9 +1974,9 @@ xlog_recover_do_reg_buffer(
1932 } 1974 }
1933 1975
1934 memcpy(xfs_buf_offset(bp, 1976 memcpy(xfs_buf_offset(bp,
1935 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1977 (uint)bit << XFS_BLF_SHIFT), /* dest */
1936 item->ri_buf[i].i_addr, /* source */ 1978 item->ri_buf[i].i_addr, /* source */
1937 nbits<<XFS_BLI_SHIFT); /* length */ 1979 nbits<<XFS_BLF_SHIFT); /* length */
1938 next: 1980 next:
1939 i++; 1981 i++;
1940 bit += nbits; 1982 bit += nbits;
@@ -2083,6 +2125,8 @@ xlog_recover_do_dquot_buffer(
2083{ 2125{
2084 uint type; 2126 uint type;
2085 2127
2128 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2129
2086 /* 2130 /*
2087 * Filesystems are required to send in quota flags at mount time. 2131 * Filesystems are required to send in quota flags at mount time.
2088 */ 2132 */
@@ -2091,11 +2135,11 @@ xlog_recover_do_dquot_buffer(
2091 } 2135 }
2092 2136
2093 type = 0; 2137 type = 0;
2094 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2138 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2095 type |= XFS_DQ_USER; 2139 type |= XFS_DQ_USER;
2096 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2140 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2097 type |= XFS_DQ_PROJ; 2141 type |= XFS_DQ_PROJ;
2098 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2142 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2099 type |= XFS_DQ_GROUP; 2143 type |= XFS_DQ_GROUP;
2100 /* 2144 /*
2101 * This type of quotas was turned off, so ignore this buffer 2145 * This type of quotas was turned off, so ignore this buffer
@@ -2103,7 +2147,7 @@ xlog_recover_do_dquot_buffer(
2103 if (log->l_quotaoffs_flag & type) 2147 if (log->l_quotaoffs_flag & type)
2104 return; 2148 return;
2105 2149
2106 xlog_recover_do_reg_buffer(item, bp, buf_f); 2150 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2107} 2151}
2108 2152
2109/* 2153/*
@@ -2116,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
2116 * here which overlaps that may be stale. 2160 * here which overlaps that may be stale.
2117 * 2161 *
2118 * When meta-data buffers are freed at run time we log a buffer item 2162 * When meta-data buffers are freed at run time we log a buffer item
2119 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2163 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2120 * of the buffer in the log should not be replayed at recovery time. 2164 * of the buffer in the log should not be replayed at recovery time.
2121 * This is so that if the blocks covered by the buffer are reused for 2165 * This is so that if the blocks covered by the buffer are reused for
2122 * file data before we crash we don't end up replaying old, freed 2166 * file data before we crash we don't end up replaying old, freed
@@ -2135,7 +2179,7 @@ xlog_recover_do_buffer_trans(
2135 xlog_recover_item_t *item, 2179 xlog_recover_item_t *item,
2136 int pass) 2180 int pass)
2137{ 2181{
2138 xfs_buf_log_format_t *buf_f; 2182 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2139 xfs_mount_t *mp; 2183 xfs_mount_t *mp;
2140 xfs_buf_t *bp; 2184 xfs_buf_t *bp;
2141 int error; 2185 int error;
@@ -2145,12 +2189,10 @@ xlog_recover_do_buffer_trans(
2145 ushort flags; 2189 ushort flags;
2146 uint buf_flags; 2190 uint buf_flags;
2147 2191
2148 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2149
2150 if (pass == XLOG_RECOVER_PASS1) { 2192 if (pass == XLOG_RECOVER_PASS1) {
2151 /* 2193 /*
2152 * In this pass we're only looking for buf items 2194 * In this pass we're only looking for buf items
2153 * with the XFS_BLI_CANCEL bit set. 2195 * with the XFS_BLF_CANCEL bit set.
2154 */ 2196 */
2155 xlog_recover_do_buffer_pass1(log, buf_f); 2197 xlog_recover_do_buffer_pass1(log, buf_f);
2156 return 0; 2198 return 0;
@@ -2164,9 +2206,11 @@ xlog_recover_do_buffer_trans(
2164 */ 2206 */
2165 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2207 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2166 if (cancel) { 2208 if (cancel) {
2209 trace_xfs_log_recover_buf_cancel(log, buf_f);
2167 return 0; 2210 return 0;
2168 } 2211 }
2169 } 2212 }
2213 trace_xfs_log_recover_buf_recover(log, buf_f);
2170 switch (buf_f->blf_type) { 2214 switch (buf_f->blf_type) {
2171 case XFS_LI_BUF: 2215 case XFS_LI_BUF:
2172 blkno = buf_f->blf_blkno; 2216 blkno = buf_f->blf_blkno;
@@ -2185,7 +2229,7 @@ xlog_recover_do_buffer_trans(
2185 2229
2186 mp = log->l_mp; 2230 mp = log->l_mp;
2187 buf_flags = XBF_LOCK; 2231 buf_flags = XBF_LOCK;
2188 if (!(flags & XFS_BLI_INODE_BUF)) 2232 if (!(flags & XFS_BLF_INODE_BUF))
2189 buf_flags |= XBF_MAPPED; 2233 buf_flags |= XBF_MAPPED;
2190 2234
2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2235 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2198,13 +2242,13 @@ xlog_recover_do_buffer_trans(
2198 } 2242 }
2199 2243
2200 error = 0; 2244 error = 0;
2201 if (flags & XFS_BLI_INODE_BUF) { 2245 if (flags & XFS_BLF_INODE_BUF) {
2202 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2246 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2203 } else if (flags & 2247 } else if (flags &
2204 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2248 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2205 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2249 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2206 } else { 2250 } else {
2207 xlog_recover_do_reg_buffer(item, bp, buf_f); 2251 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2208 } 2252 }
2209 if (error) 2253 if (error)
2210 return XFS_ERROR(error); 2254 return XFS_ERROR(error);
@@ -2265,10 +2309,9 @@ xlog_recover_do_inode_trans(
2265 } 2309 }
2266 2310
2267 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2311 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2268 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; 2312 in_f = item->ri_buf[0].i_addr;
2269 } else { 2313 } else {
2270 in_f = (xfs_inode_log_format_t *)kmem_alloc( 2314 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2271 sizeof(xfs_inode_log_format_t), KM_SLEEP);
2272 need_free = 1; 2315 need_free = 1;
2273 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2316 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2274 if (error) 2317 if (error)
@@ -2284,8 +2327,10 @@ xlog_recover_do_inode_trans(
2284 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2327 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2285 in_f->ilf_len, 0)) { 2328 in_f->ilf_len, 0)) {
2286 error = 0; 2329 error = 0;
2330 trace_xfs_log_recover_inode_cancel(log, in_f);
2287 goto error; 2331 goto error;
2288 } 2332 }
2333 trace_xfs_log_recover_inode_recover(log, in_f);
2289 2334
2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2335 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2291 XBF_LOCK); 2336 XBF_LOCK);
@@ -2314,7 +2359,7 @@ xlog_recover_do_inode_trans(
2314 error = EFSCORRUPTED; 2359 error = EFSCORRUPTED;
2315 goto error; 2360 goto error;
2316 } 2361 }
2317 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); 2362 dicp = item->ri_buf[1].i_addr;
2318 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2363 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2319 xfs_buf_relse(bp); 2364 xfs_buf_relse(bp);
2320 xfs_fs_cmn_err(CE_ALERT, mp, 2365 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2337,6 +2382,7 @@ xlog_recover_do_inode_trans(
2337 /* do nothing */ 2382 /* do nothing */
2338 } else { 2383 } else {
2339 xfs_buf_relse(bp); 2384 xfs_buf_relse(bp);
2385 trace_xfs_log_recover_inode_skip(log, in_f);
2340 error = 0; 2386 error = 0;
2341 goto error; 2387 goto error;
2342 } 2388 }
@@ -2404,7 +2450,7 @@ xlog_recover_do_inode_trans(
2404 } 2450 }
2405 2451
2406 /* The core is in in-core format */ 2452 /* The core is in in-core format */
2407 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); 2453 xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
2408 2454
2409 /* the rest is in on-disk format */ 2455 /* the rest is in on-disk format */
2410 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { 2456 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
@@ -2521,7 +2567,7 @@ xlog_recover_do_quotaoff_trans(
2521 return (0); 2567 return (0);
2522 } 2568 }
2523 2569
2524 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; 2570 qoff_f = item->ri_buf[0].i_addr;
2525 ASSERT(qoff_f); 2571 ASSERT(qoff_f);
2526 2572
2527 /* 2573 /*
@@ -2565,9 +2611,8 @@ xlog_recover_do_dquot_trans(
2565 if (mp->m_qflags == 0) 2611 if (mp->m_qflags == 0)
2566 return (0); 2612 return (0);
2567 2613
2568 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; 2614 recddq = item->ri_buf[1].i_addr;
2569 2615 if (recddq == NULL) {
2570 if (item->ri_buf[1].i_addr == NULL) {
2571 cmn_err(CE_ALERT, 2616 cmn_err(CE_ALERT,
2572 "XFS: NULL dquot in %s.", __func__); 2617 "XFS: NULL dquot in %s.", __func__);
2573 return XFS_ERROR(EIO); 2618 return XFS_ERROR(EIO);
@@ -2597,7 +2642,7 @@ xlog_recover_do_dquot_trans(
2597 * The other possibility, of course, is that the quota subsystem was 2642 * The other possibility, of course, is that the quota subsystem was
2598 * removed since the last mount - ENOSYS. 2643 * removed since the last mount - ENOSYS.
2599 */ 2644 */
2600 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; 2645 dq_f = item->ri_buf[0].i_addr;
2601 ASSERT(dq_f); 2646 ASSERT(dq_f);
2602 if ((error = xfs_qm_dqcheck(recddq, 2647 if ((error = xfs_qm_dqcheck(recddq,
2603 dq_f->qlf_id, 2648 dq_f->qlf_id,
@@ -2664,7 +2709,7 @@ xlog_recover_do_efi_trans(
2664 return 0; 2709 return 0;
2665 } 2710 }
2666 2711
2667 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; 2712 efi_formatp = item->ri_buf[0].i_addr;
2668 2713
2669 mp = log->l_mp; 2714 mp = log->l_mp;
2670 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2715 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
@@ -2710,7 +2755,7 @@ xlog_recover_do_efd_trans(
2710 return; 2755 return;
2711 } 2756 }
2712 2757
2713 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; 2758 efd_formatp = item->ri_buf[0].i_addr;
2714 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2759 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2715 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2760 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2716 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 2761 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
@@ -2758,11 +2803,12 @@ xlog_recover_do_trans(
2758 int error = 0; 2803 int error = 0;
2759 xlog_recover_item_t *item; 2804 xlog_recover_item_t *item;
2760 2805
2761 error = xlog_recover_reorder_trans(trans); 2806 error = xlog_recover_reorder_trans(log, trans, pass);
2762 if (error) 2807 if (error)
2763 return error; 2808 return error;
2764 2809
2765 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2810 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2811 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2766 switch (ITEM_TYPE(item)) { 2812 switch (ITEM_TYPE(item)) {
2767 case XFS_LI_BUF: 2813 case XFS_LI_BUF:
2768 error = xlog_recover_do_buffer_trans(log, item, pass); 2814 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2965,9 @@ xlog_recover_process_data(
2919 error = xlog_recover_unmount_trans(trans); 2965 error = xlog_recover_unmount_trans(trans);
2920 break; 2966 break;
2921 case XLOG_WAS_CONT_TRANS: 2967 case XLOG_WAS_CONT_TRANS:
2922 error = xlog_recover_add_to_cont_trans(trans, 2968 error = xlog_recover_add_to_cont_trans(log,
2923 dp, be32_to_cpu(ohead->oh_len)); 2969 trans, dp,
2970 be32_to_cpu(ohead->oh_len));
2924 break; 2971 break;
2925 case XLOG_START_TRANS: 2972 case XLOG_START_TRANS:
2926 xlog_warn( 2973 xlog_warn(
@@ -2930,7 +2977,7 @@ xlog_recover_process_data(
2930 break; 2977 break;
2931 case 0: 2978 case 0:
2932 case XLOG_CONTINUE_TRANS: 2979 case XLOG_CONTINUE_TRANS:
2933 error = xlog_recover_add_to_trans(trans, 2980 error = xlog_recover_add_to_trans(log, trans,
2934 dp, be32_to_cpu(ohead->oh_len)); 2981 dp, be32_to_cpu(ohead->oh_len));
2935 break; 2982 break;
2936 default: 2983 default:
@@ -3139,7 +3186,7 @@ xlog_recover_process_one_iunlink(
3139 int error; 3186 int error;
3140 3187
3141 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3188 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3142 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); 3189 error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
3143 if (error) 3190 if (error)
3144 goto fail; 3191 goto fail;
3145 3192
@@ -3331,42 +3378,6 @@ xlog_pack_data(
3331 } 3378 }
3332} 3379}
3333 3380
3334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3335STATIC void
3336xlog_unpack_data_checksum(
3337 xlog_rec_header_t *rhead,
3338 xfs_caddr_t dp,
3339 xlog_t *log)
3340{
3341 __be32 *up = (__be32 *)dp;
3342 uint chksum = 0;
3343 int i;
3344
3345 /* divide length by 4 to get # words */
3346 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3347 chksum ^= be32_to_cpu(*up);
3348 up++;
3349 }
3350 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3351 if (rhead->h_chksum ||
3352 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3353 cmn_err(CE_DEBUG,
3354 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3355 be32_to_cpu(rhead->h_chksum), chksum);
3356 cmn_err(CE_DEBUG,
3357"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3358 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3359 cmn_err(CE_DEBUG,
3360 "XFS: LogR this is a LogV2 filesystem\n");
3361 }
3362 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3363 }
3364 }
3365}
3366#else
3367#define xlog_unpack_data_checksum(rhead, dp, log)
3368#endif
3369
3370STATIC void 3381STATIC void
3371xlog_unpack_data( 3382xlog_unpack_data(
3372 xlog_rec_header_t *rhead, 3383 xlog_rec_header_t *rhead,
@@ -3390,8 +3401,6 @@ xlog_unpack_data(
3390 dp += BBSIZE; 3401 dp += BBSIZE;
3391 } 3402 }
3392 } 3403 }
3393
3394 xlog_unpack_data_checksum(rhead, dp, log);
3395} 3404}
3396 3405
3397STATIC int 3406STATIC int
@@ -3490,7 +3499,7 @@ xlog_do_recovery_pass(
3490 hblks = 1; 3499 hblks = 1;
3491 } 3500 }
3492 } else { 3501 } else {
3493 ASSERT(log->l_sectbb_log == 0); 3502 ASSERT(log->l_sectBBsize == 1);
3494 hblks = 1; 3503 hblks = 1;
3495 hbp = xlog_get_bp(log, 1); 3504 hbp = xlog_get_bp(log, 1);
3496 h_size = XLOG_BIG_RECORD_BSIZE; 3505 h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3955,6 @@ xlog_recover_check_summary(
3946 xfs_agf_t *agfp; 3955 xfs_agf_t *agfp;
3947 xfs_buf_t *agfbp; 3956 xfs_buf_t *agfbp;
3948 xfs_buf_t *agibp; 3957 xfs_buf_t *agibp;
3949 xfs_buf_t *sbbp;
3950#ifdef XFS_LOUD_RECOVERY
3951 xfs_sb_t *sbp;
3952#endif
3953 xfs_agnumber_t agno; 3958 xfs_agnumber_t agno;
3954 __uint64_t freeblks; 3959 __uint64_t freeblks;
3955 __uint64_t itotal; 3960 __uint64_t itotal;
@@ -3984,30 +3989,5 @@ xlog_recover_check_summary(
3984 xfs_buf_relse(agibp); 3989 xfs_buf_relse(agibp);
3985 } 3990 }
3986 } 3991 }
3987
3988 sbbp = xfs_getsb(mp, 0);
3989#ifdef XFS_LOUD_RECOVERY
3990 sbp = &mp->m_sb;
3991 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3992 cmn_err(CE_NOTE,
3993 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3994 sbp->sb_icount, itotal);
3995 cmn_err(CE_NOTE,
3996 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3997 sbp->sb_ifree, ifree);
3998 cmn_err(CE_NOTE,
3999 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4000 sbp->sb_fdblocks, freeblks);
4001#if 0
4002 /*
4003 * This is turned off until I account for the allocation
4004 * btree blocks which live in free space.
4005 */
4006 ASSERT(sbp->sb_icount == itotal);
4007 ASSERT(sbp->sb_ifree == ifree);
4008 ASSERT(sbp->sb_fdblocks == freeblks);
4009#endif
4010#endif
4011 xfs_buf_relse(sbbp);
4012} 3992}
4013#endif /* DEBUG */ 3993#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b4bca6..aeb9d72ebf6e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -25,13 +25,10 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 32#include "xfs_dinode.h"
36#include "xfs_inode.h" 33#include "xfs_inode.h"
37#include "xfs_btree.h" 34#include "xfs_btree.h"
@@ -268,10 +265,10 @@ xfs_sb_validate_fsb_count(
268 265
269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ 266#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 267 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
271 return E2BIG; 268 return EFBIG;
272#else /* Limited by UINT_MAX of sectors */ 269#else /* Limited by UINT_MAX of sectors */
273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) 270 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
274 return E2BIG; 271 return EFBIG;
275#endif 272#endif
276 return 0; 273 return 0;
277} 274}
@@ -393,7 +390,7 @@ xfs_mount_validate_sb(
393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 390 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
394 xfs_fs_mount_cmn_err(flags, 391 xfs_fs_mount_cmn_err(flags,
395 "file system too large to be mounted on this system."); 392 "file system too large to be mounted on this system.");
396 return XFS_ERROR(E2BIG); 393 return XFS_ERROR(EFBIG);
397 } 394 }
398 395
399 if (unlikely(sbp->sb_inprogress)) { 396 if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +410,6 @@ xfs_mount_validate_sb(
413 return 0; 410 return 0;
414} 411}
415 412
416STATIC void
417xfs_initialize_perag_icache(
418 xfs_perag_t *pag)
419{
420 if (!pag->pag_ici_init) {
421 rwlock_init(&pag->pag_ici_lock);
422 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
423 pag->pag_ici_init = 1;
424 }
425}
426
427int 413int
428xfs_initialize_perag( 414xfs_initialize_perag(
429 xfs_mount_t *mp, 415 xfs_mount_t *mp,
@@ -436,13 +422,8 @@ xfs_initialize_perag(
436 xfs_agino_t agino; 422 xfs_agino_t agino;
437 xfs_ino_t ino; 423 xfs_ino_t ino;
438 xfs_sb_t *sbp = &mp->m_sb; 424 xfs_sb_t *sbp = &mp->m_sb;
439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM; 425 int error = -ENOMEM;
441 426
442 /* Check to see if the filesystem can overflow 32 bit inodes */
443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
445
446 /* 427 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs 428 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Allocate and insert all the 429 * that already exist (growfs case). Allocate and insert all the
@@ -456,11 +437,18 @@ xfs_initialize_perag(
456 } 437 }
457 if (!first_initialised) 438 if (!first_initialised)
458 first_initialised = index; 439 first_initialised = index;
440
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); 441 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag) 442 if (!pag)
461 goto out_unwind; 443 goto out_unwind;
444 pag->pag_agno = index;
445 pag->pag_mount = mp;
446 rwlock_init(&pag->pag_ici_lock);
447 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
448
462 if (radix_tree_preload(GFP_NOFS)) 449 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind; 450 goto out_unwind;
451
464 spin_lock(&mp->m_perag_lock); 452 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 453 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG(); 454 BUG();
@@ -469,25 +457,26 @@ xfs_initialize_perag(
469 error = -EEXIST; 457 error = -EEXIST;
470 goto out_unwind; 458 goto out_unwind;
471 } 459 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock); 460 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end(); 461 radix_tree_preload_end();
476 } 462 }
477 463
478 /* Clear the mount flag if no inode can overflow 32 bits 464 /*
479 * on this filesystem, or if specifically requested.. 465 * If we mount with the inode64 option, or no inode overflows
466 * the legacy 32-bit address space clear the inode32 option.
480 */ 467 */
481 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { 468 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
469 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
470
471 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
482 mp->m_flags |= XFS_MOUNT_32BITINODES; 472 mp->m_flags |= XFS_MOUNT_32BITINODES;
483 } else { 473 else
484 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 474 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
485 }
486 475
487 /* If we can overflow then setup the ag headers accordingly */
488 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 476 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
489 /* Calculate how much should be reserved for inodes to 477 /*
490 * meet the max inode percentage. 478 * Calculate how much should be reserved for inodes to meet
479 * the max inode percentage.
491 */ 480 */
492 if (mp->m_maxicount) { 481 if (mp->m_maxicount) {
493 __uint64_t icount; 482 __uint64_t icount;
@@ -500,30 +489,28 @@ xfs_initialize_perag(
500 } else { 489 } else {
501 max_metadata = agcount; 490 max_metadata = agcount;
502 } 491 }
492
503 for (index = 0; index < agcount; index++) { 493 for (index = 0; index < agcount; index++) {
504 ino = XFS_AGINO_TO_INO(mp, index, agino); 494 ino = XFS_AGINO_TO_INO(mp, index, agino);
505 if (ino > max_inum) { 495 if (ino > XFS_MAXINUMBER_32) {
506 index++; 496 index++;
507 break; 497 break;
508 } 498 }
509 499
510 /* This ag is preferred for inodes */
511 pag = xfs_perag_get(mp, index); 500 pag = xfs_perag_get(mp, index);
512 pag->pagi_inodeok = 1; 501 pag->pagi_inodeok = 1;
513 if (index < max_metadata) 502 if (index < max_metadata)
514 pag->pagf_metadata = 1; 503 pag->pagf_metadata = 1;
515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag); 504 xfs_perag_put(pag);
517 } 505 }
518 } else { 506 } else {
519 /* Setup default behavior for smaller filesystems */
520 for (index = 0; index < agcount; index++) { 507 for (index = 0; index < agcount; index++) {
521 pag = xfs_perag_get(mp, index); 508 pag = xfs_perag_get(mp, index);
522 pag->pagi_inodeok = 1; 509 pag->pagi_inodeok = 1;
523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag); 510 xfs_perag_put(pag);
525 } 511 }
526 } 512 }
513
527 if (maxagi) 514 if (maxagi)
528 *maxagi = index; 515 *maxagi = index;
529 return 0; 516 return 0;
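The restructuring above folds the old xfs_initialize_perag_icache() into the main loop and, more importantly, finishes initialising each pag (pag_agno, pag_mount, the inode-cache lock and radix tree) before it is inserted into m_perag_tree, so a concurrent lookup can never observe a half-constructed structure. The insertion itself follows the canonical radix-tree preload pattern; a generic sketch:

    /* sketch: preload may sleep, so do it before taking the spinlock */
    if (radix_tree_preload(GFP_NOFS))
        return -ENOMEM;
    spin_lock(&mp->m_perag_lock);
    error = radix_tree_insert(&mp->m_perag_tree, index, pag);
    spin_unlock(&mp->m_perag_lock);
    radix_tree_preload_end();       /* re-enables preemption */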
@@ -1009,7 +996,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 996 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 997 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1011 cmn_err(CE_WARN, "XFS: size check 1 failed"); 998 cmn_err(CE_WARN, "XFS: size check 1 failed");
1012 return XFS_ERROR(E2BIG); 999 return XFS_ERROR(EFBIG);
1013 } 1000 }
1014 error = xfs_read_buf(mp, mp->m_ddev_targp, 1001 error = xfs_read_buf(mp, mp->m_ddev_targp,
1015 d - XFS_FSS_TO_BB(mp, 1), 1002 d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1006,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1019 } else { 1006 } else {
1020 cmn_err(CE_WARN, "XFS: size check 2 failed"); 1007 cmn_err(CE_WARN, "XFS: size check 2 failed");
1021 if (error == ENOSPC) 1008 if (error == ENOSPC)
1022 error = XFS_ERROR(E2BIG); 1009 error = XFS_ERROR(EFBIG);
1023 return error; 1010 return error;
1024 } 1011 }
1025 1012
@@ -1027,7 +1014,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1014 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1015 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1029 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1016 cmn_err(CE_WARN, "XFS: size check 3 failed");
1030 return XFS_ERROR(E2BIG); 1017 return XFS_ERROR(EFBIG);
1031 } 1018 }
1032 error = xfs_read_buf(mp, mp->m_logdev_targp, 1019 error = xfs_read_buf(mp, mp->m_logdev_targp,
1033 d - XFS_FSB_TO_BB(mp, 1), 1020 d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1024,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1037 } else { 1024 } else {
1038 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1025 cmn_err(CE_WARN, "XFS: size check 3 failed");
1039 if (error == ENOSPC) 1026 if (error == ENOSPC)
1040 error = XFS_ERROR(E2BIG); 1027 error = XFS_ERROR(EFBIG);
1041 return error; 1028 return error;
1042 } 1029 }
1043 } 1030 }
@@ -1254,7 +1241,7 @@ xfs_mountfs(
1254 * Allocate and initialize the per-ag data. 1241 * Allocate and initialize the per-ag data.
1255 */ 1242 */
1256 spin_lock_init(&mp->m_perag_lock); 1243 spin_lock_init(&mp->m_perag_lock);
1257 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); 1244 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1258 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1245 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1259 if (error) { 1246 if (error) {
1260 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1247 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
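The GFP_NOFS to GFP_ATOMIC change for m_perag_tree matters because the gfp mask passed to INIT_RADIX_TREE is what the tree uses for node allocations at insertion time, and insertions into this tree happen under the m_perag_lock spinlock (see xfs_initialize_perag above), where sleeping is not allowed. In practice radix_tree_preload() keeps nodes pre-allocated, so GFP_ATOMIC is only the non-sleeping fallback:

    INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);  /* inserts occur under a spinlock */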
@@ -1310,7 +1297,7 @@ xfs_mountfs(
1310 * Get and sanity-check the root inode. 1297 * Get and sanity-check the root inode.
1311 * Save the pointer to it in the mount structure. 1298 * Save the pointer to it in the mount structure.
1312 */ 1299 */
1313 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); 1300 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1314 if (error) { 1301 if (error) {
1315 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1302 cmn_err(CE_WARN, "XFS: failed to read root inode");
1316 goto out_log_dealloc; 1303 goto out_log_dealloc;
@@ -1405,13 +1392,6 @@ xfs_mountfs(
1405 xfs_qm_mount_quotas(mp); 1392 xfs_qm_mount_quotas(mp);
1406 } 1393 }
1407 1394
1408#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1409 if (XFS_IS_QUOTA_ON(mp))
1410 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
1411 else
1412 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
1413#endif
1414
1415 /* 1395 /*
1416 * Now we are mounted, reserve a small amount of unused space for 1396 * Now we are mounted, reserve a small amount of unused space for
1417 * privileged transactions. This is needed so that transaction 1397 * privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..622da2179a57 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,65 +66,6 @@ struct xfs_nameops;
66struct xfs_ail; 66struct xfs_ail;
67struct xfs_quotainfo; 67struct xfs_quotainfo;
68 68
69
70/*
71 * Prototypes and functions for the Data Migration subsystem.
72 */
73
74typedef int (*xfs_send_data_t)(int, struct xfs_inode *,
75 xfs_off_t, size_t, int, int *);
76typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
77typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
79 struct xfs_inode *, dm_right_t,
80 struct xfs_inode *, dm_right_t,
81 const unsigned char *, const unsigned char *,
82 mode_t, int, int);
83typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
84 char *, char *);
85typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
86 dm_right_t, mode_t, int, int);
87
88typedef struct xfs_dmops {
89 xfs_send_data_t xfs_send_data;
90 xfs_send_mmap_t xfs_send_mmap;
91 xfs_send_destroy_t xfs_send_destroy;
92 xfs_send_namesp_t xfs_send_namesp;
93 xfs_send_mount_t xfs_send_mount;
94 xfs_send_unmount_t xfs_send_unmount;
95} xfs_dmops_t;
96
97#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
98 (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
99
100#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
101 (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
102#define XFS_SEND_MMAP(mp, vma,fl) \
103 (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
104#define XFS_SEND_DESTROY(mp, ip,right) \
105 (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
106#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
107 (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
108#define XFS_SEND_MOUNT(mp,right,path,name) \
109 (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
110#define XFS_SEND_PREUNMOUNT(mp) \
111do { \
112 if (mp->m_flags & XFS_MOUNT_DMAPI) { \
113 (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
114 (mp)->m_rootip, DM_RIGHT_NULL, \
115 (mp)->m_rootip, DM_RIGHT_NULL, \
116 NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
117 } \
118} while (0)
119#define XFS_SEND_UNMOUNT(mp) \
120do { \
121 if (mp->m_flags & XFS_MOUNT_DMAPI) { \
122 (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
123 DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
124 } \
125} while (0)
126
127
128#ifdef HAVE_PERCPU_SB 69#ifdef HAVE_PERCPU_SB
129 70
130/* 71/*
@@ -241,8 +182,6 @@ typedef struct xfs_mount {
241 uint m_chsize; /* size of next field */ 182 uint m_chsize; /* size of next field */
242 struct xfs_chash *m_chash; /* fs private inode per-cluster 183 struct xfs_chash *m_chash; /* fs private inode per-cluster
243 * hash table */ 184 * hash table */
244 struct xfs_dmops *m_dm_ops; /* vector of DMI ops */
245 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */
246 atomic_t m_active_trans; /* number trans frozen */ 185 atomic_t m_active_trans; /* number trans frozen */
247#ifdef HAVE_PERCPU_SB 186#ifdef HAVE_PERCPU_SB
248 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ 187 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -259,7 +198,7 @@ typedef struct xfs_mount {
259 wait_queue_head_t m_wait_single_sync_task; 198 wait_queue_head_t m_wait_single_sync_task;
260 __int64_t m_update_flags; /* sb flags we need to update 199 __int64_t m_update_flags; /* sb flags we need to update
261 on the next remount,rw */ 200 on the next remount,rw */
262 struct list_head m_mplist; /* inode shrinker mount list */ 201 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
263} xfs_mount_t; 202} xfs_mount_t;
264 203
265/* 204/*
@@ -268,7 +207,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 207#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 208 must be synchronous except
270 for space allocations */ 209 for space allocations */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 210#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 211#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 212#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
274 operations, typically for 213 operations, typically for
@@ -281,8 +220,6 @@ typedef struct xfs_mount {
281#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ 220#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
282#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ 221#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
283#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ 222#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
284#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
285 /* osyncisdsync is now default*/
286#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above 223#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above
287 * 32 bits in size */ 224 * 32 bits in size */
 288#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* user wants 32bit inodes */ 225#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* user wants 32bit inodes */
@@ -439,11 +376,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
439 376
440extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 377extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
441 378
442extern int xfs_dmops_get(struct xfs_mount *);
443extern void xfs_dmops_put(struct xfs_mount *);
444
445extern struct xfs_dmops xfs_dmcore_xfs;
446
447#endif /* __KERNEL__ */ 379#endif /* __KERNEL__ */
448 380
449extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 381extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f81dde..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ 202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
204#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
205#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
206#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
207#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 204#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
208#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 205#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
209#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 206#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index fc1cda23b817..8fca957200df 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -24,12 +24,9 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 30#include "xfs_dinode.h"
34#include "xfs_inode.h" 31#include "xfs_inode.h"
35#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
@@ -116,20 +113,7 @@ xfs_rename(
116 int spaceres; 113 int spaceres;
117 int num_inodes; 114 int num_inodes;
118 115
119 xfs_itrace_entry(src_dp); 116 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
120 xfs_itrace_entry(target_dp);
121
122 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
123 DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
124 error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
125 src_dp, DM_RIGHT_NULL,
126 target_dp, DM_RIGHT_NULL,
127 src_name->name, target_name->name,
128 0, 0, 0);
129 if (error)
130 return error;
131 }
132 /* Return through std_return after this point. */
133 117
134 new_parent = (src_dp != target_dp); 118 new_parent = (src_dp != target_dp);
135 src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); 119 src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
@@ -184,26 +168,14 @@ xfs_rename(
184 /* 168 /*
185 * Join all the inodes to the transaction. From this point on, 169 * Join all the inodes to the transaction. From this point on,
186 * we can rely on either trans_commit or trans_cancel to unlock 170 * we can rely on either trans_commit or trans_cancel to unlock
187 * them. Note that we need to add a vnode reference to the 171 * them.
188 * directories since trans_commit & trans_cancel will decrement
189 * them when they unlock the inodes. Also, we need to be careful
190 * not to add an inode to the transaction more than once.
191 */ 172 */
192 IHOLD(src_dp); 173 xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
193 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 174 if (new_parent)
194 175 xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
195 if (new_parent) { 176 xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
196 IHOLD(target_dp); 177 if (target_ip)
197 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 178 xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
198 }
199
200 IHOLD(src_ip);
201 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
202
203 if (target_ip) {
204 IHOLD(target_ip);
205 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
206 }
207 179
208 /* 180 /*
209 * If we are using project inheritance, we only allow renames 181 * If we are using project inheritance, we only allow renames
@@ -369,26 +341,13 @@ xfs_rename(
369 * trans_commit will unlock src_ip, target_ip & decrement 341 * trans_commit will unlock src_ip, target_ip & decrement
370 * the vnode references. 342 * the vnode references.
371 */ 343 */
372 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 344 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
373
374 /* Fall through to std_return with error = 0 or errno from
375 * xfs_trans_commit */
376std_return:
377 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
378 DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
379 (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
380 src_dp, DM_RIGHT_NULL,
381 target_dp, DM_RIGHT_NULL,
382 src_name->name, target_name->name,
383 0, error, 0);
384 }
385 return error;
386 345
387 abort_return: 346 abort_return:
388 cancel_flags |= XFS_TRANS_ABORT; 347 cancel_flags |= XFS_TRANS_ABORT;
389 /* FALLTHROUGH */
390 error_return: 348 error_return:
391 xfs_bmap_cancel(&free_list); 349 xfs_bmap_cancel(&free_list);
392 xfs_trans_cancel(tp, cancel_flags); 350 xfs_trans_cancel(tp, cancel_flags);
393 goto std_return; 351 std_return:
352 return error;
394} 353}
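xfs_trans_ijoin_ref() collapses the old IHOLD() + xfs_trans_ijoin() pair: it joins the locked inode to the transaction and takes the reference that trans_commit/trans_cancel will later drop, so the caller no longer balances vnode references by hand and the tail of xfs_rename() shrinks to a single std_return. A hedged sketch of the resulting caller pattern (helper names as used elsewhere in XFS):

    xfs_lock_two_inodes(src_dp, src_ip, XFS_ILOCK_EXCL);
    xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);  /* trans now owns ref + lock */
    xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
    ...
    return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);  /* unlocks, drops refs */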
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..891260fea11e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,17 +25,10 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 30#include "xfs_dinode.h"
36#include "xfs_inode.h" 31#include "xfs_inode.h"
37#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 32#include "xfs_alloc.h"
40#include "xfs_bmap.h" 33#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 34#include "xfs_rtalloc.h"
@@ -129,7 +122,7 @@ xfs_growfs_rt_alloc(
129 cancelflags |= XFS_TRANS_ABORT; 122 cancelflags |= XFS_TRANS_ABORT;
130 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, 123 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
131 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, 124 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
132 resblks, &map, &nmap, &flist, NULL); 125 resblks, &map, &nmap, &flist);
133 if (!error && nmap < 1) 126 if (!error && nmap < 1)
134 error = XFS_ERROR(ENOSPC); 127 error = XFS_ERROR(ENOSPC);
135 if (error) 128 if (error)
@@ -2247,7 +2240,7 @@ xfs_rtmount_init(
2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
2248 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2241 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2249 (unsigned long long) mp->m_sb.sb_rblocks); 2242 (unsigned long long) mp->m_sb.sb_rblocks);
2250 return XFS_ERROR(E2BIG); 2243 return XFS_ERROR(EFBIG);
2251 } 2244 }
2252 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2245 error = xfs_read_buf(mp, mp->m_rtdev_targp,
2253 d - XFS_FSB_TO_BB(mp, 1), 2246 d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2249,7 @@ xfs_rtmount_init(
2256 cmn_err(CE_WARN, 2249 cmn_err(CE_WARN,
2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2250 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
2258 if (error == ENOSPC) 2251 if (error == ENOSPC)
2259 return XFS_ERROR(E2BIG); 2252 return XFS_ERROR(EFBIG);
2260 return error; 2253 return error;
2261 } 2254 }
2262 xfs_buf_relse(bp); 2255 xfs_buf_relse(bp);
@@ -2277,12 +2270,12 @@ xfs_rtmount_inodes(
2277 sbp = &mp->m_sb; 2270 sbp = &mp->m_sb;
2278 if (sbp->sb_rbmino == NULLFSINO) 2271 if (sbp->sb_rbmino == NULLFSINO)
2279 return 0; 2272 return 0;
2280 error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0); 2273 error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
2281 if (error) 2274 if (error)
2282 return error; 2275 return error;
2283 ASSERT(mp->m_rbmip != NULL); 2276 ASSERT(mp->m_rbmip != NULL);
2284 ASSERT(sbp->sb_rsumino != NULLFSINO); 2277 ASSERT(sbp->sb_rsumino != NULLFSINO);
2285 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); 2278 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
2286 if (error) { 2279 if (error) {
2287 IRELE(mp->m_rbmip); 2280 IRELE(mp->m_rbmip);
2288 return error; 2281 return error;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
147# define xfs_rtfree_extent(t,b,l) (ENOSYS) 147# define xfs_rtfree_extent(t,b,l) (ENOSYS)
148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) 148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
149# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150static inline int /* error */
151xfs_rtmount_init(
152 xfs_mount_t *mp) /* file system mount structure */
153{
154 if (mp->m_sb.sb_rblocks == 0)
155 return 0;
156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
158 return ENOSYS;
159}
151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m) 161# define xfs_rtunmount_inodes(m)
153#endif /* CONFIG_XFS_RT */ 162#endif /* CONFIG_XFS_RT */
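Turning xfs_rtmount_init() into a static inline buys argument type checking and an explicit warning instead of a silent ENOSYS, at no cost when CONFIG_XFS_RT is off. Note that the remaining xfs_rtmount_inodes(m) macro takes a parameter named m but expands to ((mp)->...), so it only compiles because every caller happens to pass a variable called mp; the same inline treatment would fix that too. A sketch, not part of this patch:

    static inline int
    xfs_rtmount_inodes(
        xfs_mount_t     *mp)
    {
        if (mp->m_sb.sb_rblocks == 0)
            return 0;
        cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
        return ENOSYS;
    }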
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index e336742a58a4..56861d5daaef 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -24,27 +24,12 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 29#include "xfs_dinode.h"
36#include "xfs_inode.h" 30#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_itable.h"
39#include "xfs_btree.h"
40#include "xfs_alloc.h"
41#include "xfs_ialloc.h"
42#include "xfs_attr.h"
43#include "xfs_bmap.h"
44#include "xfs_error.h" 31#include "xfs_error.h"
45#include "xfs_buf_item.h"
46#include "xfs_rw.h" 32#include "xfs_rw.h"
47#include "xfs_trace.h"
48 33
49/* 34/*
50 * Force a shutdown of the filesystem instantly while keeping 35 * Force a shutdown of the filesystem instantly while keeping
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358bae8d..1c47edaea0d2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -24,16 +25,12 @@
24#include "xfs_trans.h" 25#include "xfs_trans.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_error.h" 29#include "xfs_error.h"
31#include "xfs_da_btree.h" 30#include "xfs_da_btree.h"
32#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h" 32#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 34#include "xfs_dinode.h"
38#include "xfs_inode.h" 35#include "xfs_inode.h"
39#include "xfs_btree.h" 36#include "xfs_btree.h"
@@ -44,148 +41,494 @@
44#include "xfs_trans_priv.h" 41#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 42#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 43#include "xfs_inode_item.h"
47 44#include "xfs_trace.h"
48
49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
50STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
51STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
52STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
53STATIC void xfs_trans_committed(xfs_trans_t *, int);
54STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
55STATIC void xfs_trans_free(xfs_trans_t *);
56 45
57kmem_zone_t *xfs_trans_zone; 46kmem_zone_t *xfs_trans_zone;
47kmem_zone_t *xfs_log_item_desc_zone;
58 48
59 49
60/* 50/*
61 * Reservation functions here avoid a huge stack in xfs_trans_init 51 * Various log reservation values.
62 * due to register overflow from temporaries in the calculations. 52 *
53 * These are based on the size of the file system block because that is what
54 * most transactions manipulate. Each adds in an additional 128 bytes per
55 * item logged to try to account for the overhead of the transaction mechanism.
56 *
57 * Note: Most of the reservations underestimate the number of allocation
58 * groups into which they could free extents in the xfs_bmap_finish() call.
59 * This is because the number in the worst case is quite high and quite
60 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
61 * extents in only a single AG at a time. This will require changes to the
62 * EFI code as well, however, so that the EFI for the extents not freed is
63 * logged again in each transaction. See SGI PV #261917.
64 *
65 * Reservation functions here avoid a huge stack in xfs_trans_init due to
66 * register overflow from temporaries in the calculations.
63 */ 67 */
64 68
69
70/*
71 * In a write transaction we can allocate a maximum of 2
72 * extents. This gives:
73 * the inode getting the new extents: inode size
74 * the inode's bmap btree: max depth * block size
75 * the agfs of the ags from which the extents are allocated: 2 * sector
76 * the superblock free block counter: sector size
77 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
78 * And the bmap_finish transaction can free bmap blocks in a join:
79 * the agfs of the ags containing the blocks: 2 * sector size
80 * the agfls of the ags containing the blocks: 2 * sector size
81 * the super block free block counter: sector size
82 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
83 */
65STATIC uint 84STATIC uint
66xfs_calc_write_reservation(xfs_mount_t *mp) 85xfs_calc_write_reservation(
86 struct xfs_mount *mp)
67{ 87{
68 return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 88 return XFS_DQUOT_LOGRES(mp) +
89 MAX((mp->m_sb.sb_inodesize +
90 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
91 2 * mp->m_sb.sb_sectsize +
92 mp->m_sb.sb_sectsize +
93 XFS_ALLOCFREE_LOG_RES(mp, 2) +
94 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
95 XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
96 (2 * mp->m_sb.sb_sectsize +
97 2 * mp->m_sb.sb_sectsize +
98 mp->m_sb.sb_sectsize +
99 XFS_ALLOCFREE_LOG_RES(mp, 2) +
100 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
69} 101}
70 102
103/*
104 * In truncating a file we free up to two extents at once. We can modify:
105 * the inode being truncated: inode size
106 * the inode's bmap btree: (max depth + 1) * block size
107 * And the bmap_finish transaction can free the blocks and bmap blocks:
108 * the agf for each of the ags: 4 * sector size
109 * the agfl for each of the ags: 4 * sector size
110 * the super block to reflect the freed blocks: sector size
111 * worst case split in allocation btrees per extent assuming 4 extents:
112 * 4 exts * 2 trees * (2 * max depth - 1) * block size
113 * the inode btree: max depth * blocksize
114 * the allocation btrees: 2 trees * (max depth - 1) * block size
115 */
71STATIC uint 116STATIC uint
72xfs_calc_itruncate_reservation(xfs_mount_t *mp) 117xfs_calc_itruncate_reservation(
118 struct xfs_mount *mp)
73{ 119{
74 return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 120 return XFS_DQUOT_LOGRES(mp) +
121 MAX((mp->m_sb.sb_inodesize +
122 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
123 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
124 (4 * mp->m_sb.sb_sectsize +
125 4 * mp->m_sb.sb_sectsize +
126 mp->m_sb.sb_sectsize +
127 XFS_ALLOCFREE_LOG_RES(mp, 4) +
128 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
129 128 * 5 +
130 XFS_ALLOCFREE_LOG_RES(mp, 1) +
131 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
132 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
75} 133}
76 134
135/*
 136 * In renaming files we can modify:
137 * the four inodes involved: 4 * inode size
138 * the two directory btrees: 2 * (max depth + v2) * dir block size
139 * the two directory bmap btrees: 2 * max depth * block size
140 * And the bmap_finish transaction can free dir and bmap blocks (two sets
141 * of bmap blocks) giving:
142 * the agf for the ags in which the blocks live: 3 * sector size
143 * the agfl for the ags in which the blocks live: 3 * sector size
144 * the superblock for the free block count: sector size
145 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
146 */
77STATIC uint 147STATIC uint
78xfs_calc_rename_reservation(xfs_mount_t *mp) 148xfs_calc_rename_reservation(
149 struct xfs_mount *mp)
79{ 150{
80 return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 151 return XFS_DQUOT_LOGRES(mp) +
152 MAX((4 * mp->m_sb.sb_inodesize +
153 2 * XFS_DIROP_LOG_RES(mp) +
154 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
155 (3 * mp->m_sb.sb_sectsize +
156 3 * mp->m_sb.sb_sectsize +
157 mp->m_sb.sb_sectsize +
158 XFS_ALLOCFREE_LOG_RES(mp, 3) +
159 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
81} 160}
82 161
162/*
163 * For creating a link to an inode:
164 * the parent directory inode: inode size
165 * the linked inode: inode size
166 * the directory btree could split: (max depth + v2) * dir block size
167 * the directory bmap btree could join or split: (max depth + v2) * blocksize
168 * And the bmap_finish transaction can free some bmap blocks giving:
169 * the agf for the ag in which the blocks live: sector size
170 * the agfl for the ag in which the blocks live: sector size
171 * the superblock for the free block count: sector size
172 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
173 */
83STATIC uint 174STATIC uint
84xfs_calc_link_reservation(xfs_mount_t *mp) 175xfs_calc_link_reservation(
176 struct xfs_mount *mp)
85{ 177{
86 return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 178 return XFS_DQUOT_LOGRES(mp) +
179 MAX((mp->m_sb.sb_inodesize +
180 mp->m_sb.sb_inodesize +
181 XFS_DIROP_LOG_RES(mp) +
182 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
183 (mp->m_sb.sb_sectsize +
184 mp->m_sb.sb_sectsize +
185 mp->m_sb.sb_sectsize +
186 XFS_ALLOCFREE_LOG_RES(mp, 1) +
187 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
87} 188}
88 189
190/*
191 * For removing a directory entry we can modify:
192 * the parent directory inode: inode size
193 * the removed inode: inode size
194 * the directory btree could join: (max depth + v2) * dir block size
195 * the directory bmap btree could join or split: (max depth + v2) * blocksize
196 * And the bmap_finish transaction can free the dir and bmap blocks giving:
197 * the agf for the ag in which the blocks live: 2 * sector size
198 * the agfl for the ag in which the blocks live: 2 * sector size
199 * the superblock for the free block count: sector size
200 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
201 */
89STATIC uint 202STATIC uint
90xfs_calc_remove_reservation(xfs_mount_t *mp) 203xfs_calc_remove_reservation(
204 struct xfs_mount *mp)
91{ 205{
92 return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 206 return XFS_DQUOT_LOGRES(mp) +
207 MAX((mp->m_sb.sb_inodesize +
208 mp->m_sb.sb_inodesize +
209 XFS_DIROP_LOG_RES(mp) +
210 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
211 (2 * mp->m_sb.sb_sectsize +
212 2 * mp->m_sb.sb_sectsize +
213 mp->m_sb.sb_sectsize +
214 XFS_ALLOCFREE_LOG_RES(mp, 2) +
215 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
93} 216}
94 217
218/*
219 * For symlink we can modify:
220 * the parent directory inode: inode size
221 * the new inode: inode size
222 * the inode btree entry: 1 block
223 * the directory btree: (max depth + v2) * dir block size
224 * the directory inode's bmap btree: (max depth + v2) * block size
225 * the blocks for the symlink: 1 kB
226 * Or in the first xact we allocate some inodes giving:
227 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
228 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
229 * the inode btree: max depth * blocksize
230 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
231 */
95STATIC uint 232STATIC uint
96xfs_calc_symlink_reservation(xfs_mount_t *mp) 233xfs_calc_symlink_reservation(
234 struct xfs_mount *mp)
97{ 235{
98 return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 236 return XFS_DQUOT_LOGRES(mp) +
237 MAX((mp->m_sb.sb_inodesize +
238 mp->m_sb.sb_inodesize +
239 XFS_FSB_TO_B(mp, 1) +
240 XFS_DIROP_LOG_RES(mp) +
241 1024 +
242 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
243 (2 * mp->m_sb.sb_sectsize +
244 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
245 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
246 XFS_ALLOCFREE_LOG_RES(mp, 1) +
247 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
248 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
99} 249}
100 250
251/*
252 * For create we can modify:
253 * the parent directory inode: inode size
254 * the new inode: inode size
255 * the inode btree entry: block size
256 * the superblock for the nlink flag: sector size
257 * the directory btree: (max depth + v2) * dir block size
258 * the directory inode's bmap btree: (max depth + v2) * block size
259 * Or in the first xact we allocate some inodes giving:
260 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
261 * the superblock for the nlink flag: sector size
262 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
263 * the inode btree: max depth * blocksize
264 * the allocation btrees: 2 trees * (max depth - 1) * block size
265 */
101STATIC uint 266STATIC uint
102xfs_calc_create_reservation(xfs_mount_t *mp) 267xfs_calc_create_reservation(
268 struct xfs_mount *mp)
103{ 269{
104 return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 270 return XFS_DQUOT_LOGRES(mp) +
271 MAX((mp->m_sb.sb_inodesize +
272 mp->m_sb.sb_inodesize +
273 mp->m_sb.sb_sectsize +
274 XFS_FSB_TO_B(mp, 1) +
275 XFS_DIROP_LOG_RES(mp) +
276 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
277 (3 * mp->m_sb.sb_sectsize +
278 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
279 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
280 XFS_ALLOCFREE_LOG_RES(mp, 1) +
281 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
282 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
105} 283}
106 284
285/*
286 * Making a new directory is the same as creating a new file.
287 */
107STATIC uint 288STATIC uint
108xfs_calc_mkdir_reservation(xfs_mount_t *mp) 289xfs_calc_mkdir_reservation(
290 struct xfs_mount *mp)
109{ 291{
110 return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 292 return xfs_calc_create_reservation(mp);
111} 293}
112 294
295/*
296 * In freeing an inode we can modify:
297 * the inode being freed: inode size
298 * the super block free inode counter: sector size
299 * the agi hash list and counters: sector size
300 * the inode btree entry: block size
301 * the on disk inode before ours in the agi hash list: inode cluster size
302 * the inode btree: max depth * blocksize
303 * the allocation btrees: 2 trees * (max depth - 1) * block size
304 */
113STATIC uint 305STATIC uint
114xfs_calc_ifree_reservation(xfs_mount_t *mp) 306xfs_calc_ifree_reservation(
307 struct xfs_mount *mp)
115{ 308{
116 return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 309 return XFS_DQUOT_LOGRES(mp) +
310 mp->m_sb.sb_inodesize +
311 mp->m_sb.sb_sectsize +
312 mp->m_sb.sb_sectsize +
313 XFS_FSB_TO_B(mp, 1) +
314 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
315 XFS_INODE_CLUSTER_SIZE(mp)) +
316 128 * 5 +
317 XFS_ALLOCFREE_LOG_RES(mp, 1) +
318 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
319 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
117} 320}
118 321
322/*
323 * When only changing the inode we log the inode and possibly the superblock
324 * We also add a bit of slop for the transaction stuff.
325 */
119STATIC uint 326STATIC uint
120xfs_calc_ichange_reservation(xfs_mount_t *mp) 327xfs_calc_ichange_reservation(
328 struct xfs_mount *mp)
121{ 329{
122 return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 330 return XFS_DQUOT_LOGRES(mp) +
331 mp->m_sb.sb_inodesize +
332 mp->m_sb.sb_sectsize +
333 512;
334
123} 335}
124 336
337/*
338 * Growing the data section of the filesystem.
339 * superblock
340 * agi and agf
341 * allocation btrees
342 */
125STATIC uint 343STATIC uint
126xfs_calc_growdata_reservation(xfs_mount_t *mp) 344xfs_calc_growdata_reservation(
345 struct xfs_mount *mp)
127{ 346{
128 return XFS_CALC_GROWDATA_LOG_RES(mp); 347 return mp->m_sb.sb_sectsize * 3 +
348 XFS_ALLOCFREE_LOG_RES(mp, 1) +
349 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
129} 350}
130 351
352/*
353 * Growing the rt section of the filesystem.
354 * In the first set of transactions (ALLOC) we allocate space to the
355 * bitmap or summary files.
356 * superblock: sector size
357 * agf of the ag from which the extent is allocated: sector size
358 * bmap btree for bitmap/summary inode: max depth * blocksize
359 * bitmap/summary inode: inode size
360 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
361 */
131STATIC uint 362STATIC uint
132xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) 363xfs_calc_growrtalloc_reservation(
364 struct xfs_mount *mp)
133{ 365{
134 return XFS_CALC_GROWRTALLOC_LOG_RES(mp); 366 return 2 * mp->m_sb.sb_sectsize +
367 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
368 mp->m_sb.sb_inodesize +
369 XFS_ALLOCFREE_LOG_RES(mp, 1) +
370 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
371 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
135} 372}
136 373
374/*
375 * Growing the rt section of the filesystem.
376 * In the second set of transactions (ZERO) we zero the new metadata blocks.
377 * one bitmap/summary block: blocksize
378 */
137STATIC uint 379STATIC uint
138xfs_calc_growrtzero_reservation(xfs_mount_t *mp) 380xfs_calc_growrtzero_reservation(
381 struct xfs_mount *mp)
139{ 382{
140 return XFS_CALC_GROWRTZERO_LOG_RES(mp); 383 return mp->m_sb.sb_blocksize + 128;
141} 384}
142 385
386/*
387 * Growing the rt section of the filesystem.
388 * In the third set of transactions (FREE) we update metadata without
389 * allocating any new blocks.
390 * superblock: sector size
391 * bitmap inode: inode size
392 * summary inode: inode size
393 * one bitmap block: blocksize
394 * summary blocks: new summary size
395 */
143STATIC uint 396STATIC uint
144xfs_calc_growrtfree_reservation(xfs_mount_t *mp) 397xfs_calc_growrtfree_reservation(
398 struct xfs_mount *mp)
145{ 399{
146 return XFS_CALC_GROWRTFREE_LOG_RES(mp); 400 return mp->m_sb.sb_sectsize +
401 2 * mp->m_sb.sb_inodesize +
402 mp->m_sb.sb_blocksize +
403 mp->m_rsumsize +
404 128 * 5;
147} 405}
148 406
407/*
408 * Logging the inode modification timestamp on a synchronous write.
409 * inode
410 */
149STATIC uint 411STATIC uint
150xfs_calc_swrite_reservation(xfs_mount_t *mp) 412xfs_calc_swrite_reservation(
413 struct xfs_mount *mp)
151{ 414{
152 return XFS_CALC_SWRITE_LOG_RES(mp); 415 return mp->m_sb.sb_inodesize + 128;
153} 416}
154 417
418/*
419 * Logging the inode mode bits when writing a setuid/setgid file
420 * inode
421 */
155STATIC uint 422STATIC uint
156xfs_calc_writeid_reservation(xfs_mount_t *mp) 423xfs_calc_writeid_reservation(xfs_mount_t *mp)
157{ 424{
158 return XFS_CALC_WRITEID_LOG_RES(mp); 425 return mp->m_sb.sb_inodesize + 128;
159} 426}
160 427
428/*
429 * Converting the inode from non-attributed to attributed.
430 * the inode being converted: inode size
431 * agf block and superblock (for block allocation)
432 * the new block (directory sized)
433 * bmap blocks for the new directory block
434 * allocation btrees
435 */
161STATIC uint 436STATIC uint
162xfs_calc_addafork_reservation(xfs_mount_t *mp) 437xfs_calc_addafork_reservation(
438 struct xfs_mount *mp)
163{ 439{
164 return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 440 return XFS_DQUOT_LOGRES(mp) +
441 mp->m_sb.sb_inodesize +
442 mp->m_sb.sb_sectsize * 2 +
443 mp->m_dirblksize +
444 XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
445 XFS_ALLOCFREE_LOG_RES(mp, 1) +
446 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
447 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
165} 448}
166 449
450/*
451 * Removing the attribute fork of a file
452 * the inode being truncated: inode size
453 * the inode's bmap btree: max depth * block size
454 * And the bmap_finish transaction can free the blocks and bmap blocks:
455 * the agf for each of the ags: 4 * sector size
456 * the agfl for each of the ags: 4 * sector size
457 * the super block to reflect the freed blocks: sector size
458 * worst case split in allocation btrees per extent assuming 4 extents:
459 * 4 exts * 2 trees * (2 * max depth - 1) * block size
460 */
167STATIC uint 461STATIC uint
168xfs_calc_attrinval_reservation(xfs_mount_t *mp) 462xfs_calc_attrinval_reservation(
463 struct xfs_mount *mp)
169{ 464{
170 return XFS_CALC_ATTRINVAL_LOG_RES(mp); 465 return MAX((mp->m_sb.sb_inodesize +
466 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
467 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
468 (4 * mp->m_sb.sb_sectsize +
469 4 * mp->m_sb.sb_sectsize +
470 mp->m_sb.sb_sectsize +
471 XFS_ALLOCFREE_LOG_RES(mp, 4) +
472 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
171} 473}
172 474
475/*
476 * Setting an attribute.
477 * the inode getting the attribute
478 * the superblock for allocations
479 * the agfs extents are allocated from
480 * the attribute btree * max depth
481 * the inode allocation btree
482 * Since attribute transaction space is dependent on the size of the attribute,
483 * the calculation is done partially at mount time and partially at runtime.
484 */
173STATIC uint 485STATIC uint
174xfs_calc_attrset_reservation(xfs_mount_t *mp) 486xfs_calc_attrset_reservation(
487 struct xfs_mount *mp)
175{ 488{
176 return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 489 return XFS_DQUOT_LOGRES(mp) +
490 mp->m_sb.sb_inodesize +
491 mp->m_sb.sb_sectsize +
492 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
493 128 * (2 + XFS_DA_NODE_MAXDEPTH);
177} 494}
178 495
496/*
497 * Removing an attribute.
498 * the inode: inode size
499 * the attribute btree could join: max depth * block size
500 * the inode bmap btree could join or split: max depth * block size
501 * And the bmap_finish transaction can free the attr blocks freed giving:
502 * the agf for the ag in which the blocks live: 2 * sector size
503 * the agfl for the ag in which the blocks live: 2 * sector size
504 * the superblock for the free block count: sector size
505 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
506 */
179STATIC uint 507STATIC uint
180xfs_calc_attrrm_reservation(xfs_mount_t *mp) 508xfs_calc_attrrm_reservation(
509 struct xfs_mount *mp)
181{ 510{
182 return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 511 return XFS_DQUOT_LOGRES(mp) +
512 MAX((mp->m_sb.sb_inodesize +
513 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
514 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
515 128 * (1 + XFS_DA_NODE_MAXDEPTH +
516 XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
517 (2 * mp->m_sb.sb_sectsize +
518 2 * mp->m_sb.sb_sectsize +
519 mp->m_sb.sb_sectsize +
520 XFS_ALLOCFREE_LOG_RES(mp, 2) +
521 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
183} 522}
184 523
524/*
525 * Clearing a bad agino number in an agi hash bucket.
526 */
185STATIC uint 527STATIC uint
186xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) 528xfs_calc_clear_agi_bucket_reservation(
529 struct xfs_mount *mp)
187{ 530{
188 return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); 531 return mp->m_sb.sb_sectsize + 128;
189} 532}
190 533
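With the XFS_CALC_*_LOG_RES macros gone, each reservation is spelled out directly, most as MAX(first-transaction cost, bmap_finish free-path cost), plus 128 bytes of per-item overhead (512 for the ichange slop). The formulas are easy to sanity-check by hand; a worked sketch for xfs_calc_ichange_reservation with a hypothetical geometry of 256-byte inodes and 512-byte sectors, writing D for XFS_DQUOT_LOGRES(mp):

    reservation = D + 256 + 512 + 512 = D + 1280 bytes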
191/* 534/*
@@ -194,11 +537,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
194 */ 537 */
195void 538void
196xfs_trans_init( 539xfs_trans_init(
197 xfs_mount_t *mp) 540 struct xfs_mount *mp)
198{ 541{
199 xfs_trans_reservations_t *resp; 542 struct xfs_trans_reservations *resp = &mp->m_reservations;
200 543
201 resp = &(mp->m_reservations);
202 resp->tr_write = xfs_calc_write_reservation(mp); 544 resp->tr_write = xfs_calc_write_reservation(mp);
203 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); 545 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
204 resp->tr_rename = xfs_calc_rename_reservation(mp); 546 resp->tr_rename = xfs_calc_rename_reservation(mp);
@@ -253,14 +595,30 @@ _xfs_trans_alloc(
253 tp->t_magic = XFS_TRANS_MAGIC; 595 tp->t_magic = XFS_TRANS_MAGIC;
254 tp->t_type = type; 596 tp->t_type = type;
255 tp->t_mountp = mp; 597 tp->t_mountp = mp;
256 tp->t_items_free = XFS_LIC_NUM_SLOTS; 598 INIT_LIST_HEAD(&tp->t_items);
257 tp->t_busy_free = XFS_LBC_NUM_SLOTS; 599 INIT_LIST_HEAD(&tp->t_busy);
258 xfs_lic_init(&(tp->t_items));
259 XFS_LBC_INIT(&(tp->t_busy));
260 return tp; 600 return tp;
261} 601}
262 602
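t_items and t_busy change from fixed-size log item chunk arrays (XFS_LIC_NUM_SLOTS / XFS_LBC_NUM_SLOTS) to plain list heads; descriptors are now allocated per item in xfs_trans_add_item() below rather than carved out of embedded chunks. Walking a transaction's dirty items becomes the usual list idiom:

    struct xfs_log_item_desc *lidp;

    list_for_each_entry(lidp, &tp->t_items, lid_trans) {
        if (!(lidp->lid_flags & XFS_LID_DIRTY))
            continue;
        /* operate on lidp->lid_item */
    }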
263/* 603/*
604 * Free the transaction structure. If there is more clean up
605 * to do when the structure is freed, add it here.
606 */
607STATIC void
608xfs_trans_free(
609 struct xfs_trans *tp)
610{
611 struct xfs_busy_extent *busyp, *n;
612
613 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
614 xfs_alloc_busy_clear(tp->t_mountp, busyp);
615
616 atomic_dec(&tp->t_mountp->m_active_trans);
617 xfs_trans_free_dqinfo(tp);
618 kmem_zone_free(xfs_trans_zone, tp);
619}
620
621/*
264 * This is called to create a new transaction which will share the 622 * This is called to create a new transaction which will share the
265 * permanent log reservation of the given transaction. The remaining 623 * permanent log reservation of the given transaction. The remaining
266 * unused block and rt extent reservations are also inherited. This 624 * unused block and rt extent reservations are also inherited. This
@@ -282,10 +640,8 @@ xfs_trans_dup(
282 ntp->t_magic = XFS_TRANS_MAGIC; 640 ntp->t_magic = XFS_TRANS_MAGIC;
283 ntp->t_type = tp->t_type; 641 ntp->t_type = tp->t_type;
284 ntp->t_mountp = tp->t_mountp; 642 ntp->t_mountp = tp->t_mountp;
285 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 643 INIT_LIST_HEAD(&ntp->t_items);
286 ntp->t_busy_free = XFS_LBC_NUM_SLOTS; 644 INIT_LIST_HEAD(&ntp->t_busy);
287 xfs_lic_init(&(ntp->t_items));
288 XFS_LBC_INIT(&(ntp->t_busy));
289 645
290 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 646 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
291 ASSERT(tp->t_ticket != NULL); 647 ASSERT(tp->t_ticket != NULL);
@@ -421,7 +777,6 @@ undo_blocks:
421 return error; 777 return error;
422} 778}
423 779
424
425/* 780/*
426 * Record the indicated change to the given field for application 781 * Record the indicated change to the given field for application
427 * to the file system's superblock when the transaction commits. 782 * to the file system's superblock when the transaction commits.
@@ -650,7 +1005,7 @@ xfs_trans_apply_sb_deltas(
650 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 1005 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
651 * still need to update the incore superblock with the changes. 1006 * still need to update the incore superblock with the changes.
652 */ 1007 */
653STATIC void 1008void
654xfs_trans_unreserve_and_mod_sb( 1009xfs_trans_unreserve_and_mod_sb(
655 xfs_trans_t *tp) 1010 xfs_trans_t *tp)
656{ 1011{
@@ -764,94 +1119,340 @@ xfs_trans_unreserve_and_mod_sb(
764 } 1119 }
765} 1120}
766 1121
767
768/* 1122/*
769 * xfs_trans_commit 1123 * Add the given log item to the transaction's list of log items.
770 * 1124 *
771 * Commit the given transaction to the log a/synchronously. 1125 * The log item will now point to its new descriptor with its li_desc field.
1126 */
1127void
1128xfs_trans_add_item(
1129 struct xfs_trans *tp,
1130 struct xfs_log_item *lip)
1131{
1132 struct xfs_log_item_desc *lidp;
1133
 1134 ASSERT(lip->li_mountp == tp->t_mountp);
 1135 ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
1136
1137 lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
1138
1139 lidp->lid_item = lip;
1140 lidp->lid_flags = 0;
1141 lidp->lid_size = 0;
1142 list_add_tail(&lidp->lid_trans, &tp->t_items);
1143
1144 lip->li_desc = lidp;
1145}
1146
1147STATIC void
1148xfs_trans_free_item_desc(
1149 struct xfs_log_item_desc *lidp)
1150{
1151 list_del_init(&lidp->lid_trans);
1152 kmem_zone_free(xfs_log_item_desc_zone, lidp);
1153}
1154
1155/*
1156 * Unlink and free the given descriptor.
1157 */
1158void
1159xfs_trans_del_item(
1160 struct xfs_log_item *lip)
1161{
1162 xfs_trans_free_item_desc(lip->li_desc);
1163 lip->li_desc = NULL;
1164}
1165
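xfs_trans_add_item() and xfs_trans_del_item() are the new allocation/free pair for descriptors; KM_NOFS on the zalloc keeps the allocator from recursing into the filesystem while a transaction is in progress. An assumed usage sketch, taking the inode log item as the example:

    xfs_trans_add_item(tp, &iip->ili_item);   /* join: allocates lidp, sets li_desc */
    ...
    xfs_trans_del_item(&iip->ili_item);       /* unlink + free, clears li_desc */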
1166/*
1167 * Unlock all of the items of a transaction and free all the descriptors
1168 * of that transaction.
1169 */
1170void
1171xfs_trans_free_items(
1172 struct xfs_trans *tp,
1173 xfs_lsn_t commit_lsn,
1174 int flags)
1175{
1176 struct xfs_log_item_desc *lidp, *next;
1177
1178 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1179 struct xfs_log_item *lip = lidp->lid_item;
1180
1181 lip->li_desc = NULL;
1182
1183 if (commit_lsn != NULLCOMMITLSN)
1184 IOP_COMMITTING(lip, commit_lsn);
1185 if (flags & XFS_TRANS_ABORT)
1186 lip->li_flags |= XFS_LI_ABORTED;
1187 IOP_UNLOCK(lip);
1188
1189 xfs_trans_free_item_desc(lidp);
1190 }
1191}
1192
1193/*
1194 * Unlock the items associated with a transaction.
772 * 1195 *
773 * XFS disk error handling mechanism is not based on a typical 1196 * Items which were not logged should be freed. Those which were logged must
774 * transaction abort mechanism. Logically after the filesystem 1197 * still be tracked so they can be unpinned when the transaction commits.
775 * gets marked 'SHUTDOWN', we can't let any new transactions
776 * be durable - ie. committed to disk - because some metadata might
777 * be inconsistent. In such cases, this returns an error, and the
778 * caller may assume that all locked objects joined to the transaction
779 * have already been unlocked as if the commit had succeeded.
780 * Do not reference the transaction structure after this call.
781 */ 1198 */
782 /*ARGSUSED*/ 1199STATIC void
783int 1200xfs_trans_unlock_items(
784_xfs_trans_commit( 1201 struct xfs_trans *tp,
785 xfs_trans_t *tp, 1202 xfs_lsn_t commit_lsn)
786 uint flags,
787 int *log_flushed)
788{ 1203{
789 xfs_log_iovec_t *log_vector; 1204 struct xfs_log_item_desc *lidp, *next;
790 int nvec; 1205
791 xfs_mount_t *mp; 1206 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
792 xfs_lsn_t commit_lsn; 1207 struct xfs_log_item *lip = lidp->lid_item;
793 /* REFERENCED */
794 int error;
795 int log_flags;
796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 struct xlog_in_core *commit_iclog;
800 int shutdown;
801 1208
802 commit_lsn = -1; 1209 lip->li_desc = NULL;
1210
1211 if (commit_lsn != NULLCOMMITLSN)
1212 IOP_COMMITTING(lip, commit_lsn);
1213 IOP_UNLOCK(lip);
1214
1215 /*
1216 * Free the descriptor if the item is not dirty
1217 * within this transaction.
1218 */
1219 if (!(lidp->lid_flags & XFS_LID_DIRTY))
1220 xfs_trans_free_item_desc(lidp);
1221 }
1222}
1223
1224/*
1225 * Total up the number of log iovecs needed to commit this
1226 * transaction. The transaction itself needs one for the
1227 * transaction header. Ask each dirty item in turn how many
1228 * it needs to get the total.
1229 */
1230static uint
1231xfs_trans_count_vecs(
1232 struct xfs_trans *tp)
1233{
1234 int nvecs;
1235 struct xfs_log_item_desc *lidp;
1236
1237 nvecs = 1;
1238
 1239 /* In the non-debug case we need to bail out if we
 1240 * didn't find any log items here: return zero and let trans_commit
 1241 * deal with it.
1242 */
1243 if (list_empty(&tp->t_items)) {
1244 ASSERT(0);
1245 return 0;
1246 }
1247
1248 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
1249 /*
1250 * Skip items which aren't dirty in this transaction.
1251 */
1252 if (!(lidp->lid_flags & XFS_LID_DIRTY))
1253 continue;
1254 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1255 nvecs += lidp->lid_size;
1256 }
1257
1258 return nvecs;
1259}
1260
1261/*
1262 * Fill in the vector with pointers to data to be logged
1263 * by this transaction. The transaction header takes
1264 * the first vector, and then each dirty item takes the
1265 * number of vectors it indicated it needed in xfs_trans_count_vecs().
1266 *
1267 * As each item fills in the entries it needs, also pin the item
1268 * so that it cannot be flushed out until the log write completes.
1269 */
1270static void
1271xfs_trans_fill_vecs(
1272 struct xfs_trans *tp,
1273 struct xfs_log_iovec *log_vector)
1274{
1275 struct xfs_log_item_desc *lidp;
1276 struct xfs_log_iovec *vecp;
1277 uint nitems;
 
 	/*
-	 * Determine whether this commit is releasing a permanent
-	 * log reservation or not.
+	 * Skip over the entry for the transaction header, we'll
+	 * fill that in at the end.
 	 */
-	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-		log_flags = XFS_LOG_REL_PERM_RESERV;
-	} else {
-		log_flags = 0;
+	vecp = log_vector + 1;
+
+	nitems = 0;
+	ASSERT(!list_empty(&tp->t_items));
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+			continue;
+
+		/*
+		 * The item may be marked dirty but not log anything. This can
+		 * be used to get called when a transaction is committed.
+		 */
+		if (lidp->lid_size)
+			nitems++;
+		IOP_FORMAT(lidp->lid_item, vecp);
+		vecp += lidp->lid_size;
+		IOP_PIN(lidp->lid_item);
 	}
-	mp = tp->t_mountp;
 
 	/*
-	 * If there is nothing to be logged by the transaction,
-	 * then unlock all of the items associated with the
-	 * transaction and free the transaction structure.
-	 * Also make sure to return any reserved blocks to
-	 * the free pool.
+	 * Now that we've counted the number of items in this transaction, fill
+	 * in the transaction header. Note that the transaction header does not
+	 * have a log item.
+	 */
+	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
+	tp->t_header.th_type = tp->t_type;
+	tp->t_header.th_num_items = nitems;
+	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
+	log_vector->i_len = sizeof(xfs_trans_header_t);
+	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
+}
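
Note: taken together, the two functions above produce a vector array whose layout is worth spelling out. A sketch derived from the code above, not text from the patch:

	/*
	 * log_vector[0]      transaction header (XLOG_REG_TYPE_TRANSHDR)
	 * log_vector[1..n]   each dirty item's lid_size iovecs, in t_items order
	 *
	 * n + 1 == nvecs as returned by xfs_trans_count_vecs(), which starts
	 * its count at 1 to reserve slot 0 for the header.
	 */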
1315
1316/*
1317 * The committed item processing consists of calling the committed routine of
1318 * each logged item, updating the item's position in the AIL if necessary, and
1319 * unpinning each item. If the committed routine returns -1, then do nothing
1320 * further with the item because it may have been freed.
1321 *
1322 * Since items are unlocked when they are copied to the incore log, it is
1323 * possible for two transactions to be completing and manipulating the same
1324 * item simultaneously. The AIL lock will protect the lsn field of each item.
1325 * The value of this field can never go backwards.
1326 *
1327 * We unpin the items after repositioning them in the AIL, because otherwise
1328 * they could be immediately flushed and we'd have to race with the flusher
1329 * trying to pull the item from the AIL as we add it.
1330 */
1331void
1332xfs_trans_item_committed(
1333 struct xfs_log_item *lip,
1334 xfs_lsn_t commit_lsn,
1335 int aborted)
1336{
1337 xfs_lsn_t item_lsn;
1338 struct xfs_ail *ailp;
1339
1340 if (aborted)
1341 lip->li_flags |= XFS_LI_ABORTED;
1342 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1343
1344 /* If the committed routine returns -1, item has been freed. */
1345 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1346 return;
1347
1348 /*
1349 * If the returned lsn is greater than what it contained before, update
1350 * the location of the item in the AIL. If it is not, then do nothing.
1351 * Items can never move backwards in the AIL.
1352 *
1353 * While the new lsn should usually be greater, it is possible that a
1354 * later transaction completing simultaneously with an earlier one
1355 * using the same item could complete first with a higher lsn. This
1356 * would cause the earlier transaction to fail the test below.
 	 */
-shut_us_down:
-	shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
-	if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
-		xfs_trans_unreserve_and_mod_sb(tp);
+	ailp = lip->li_ailp;
+	spin_lock(&ailp->xa_lock);
+	if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
 		/*
-		 * It is indeed possible for the transaction to be
-		 * not dirty but the dqinfo portion to be. All that
-		 * means is that we have some (non-persistent) quota
-		 * reservations that need to be unreserved.
+		 * This will set the item's lsn to item_lsn and update the
+		 * position of the item in the AIL.
+		 *
+		 * xfs_trans_ail_update() drops the AIL lock.
 		 */
-		xfs_trans_unreserve_and_mod_dquots(tp);
-		if (tp->t_ticket) {
-			commit_lsn = xfs_log_done(mp, tp->t_ticket,
-							NULL, log_flags);
-			if (commit_lsn == -1 && !shutdown)
-				shutdown = XFS_ERROR(EIO);
-		}
-		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-		xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
-		xfs_trans_free_busy(tp);
-		xfs_trans_free(tp);
-		XFS_STATS_INC(xs_trans_empty);
-		return (shutdown);
+		xfs_trans_ail_update(ailp, lip, item_lsn);
+	} else {
+		spin_unlock(&ailp->xa_lock);
 	}
-	ASSERT(tp->t_ticket != NULL);
 
 	/*
-	 * If we need to update the superblock, then do it now.
+	 * Now that we've repositioned the item in the AIL, unpin it so it can
+	 * be flushed. Pass information about buffer stale state down from the
+	 * log item flags, if anyone else stales the buffer we do not want to
+	 * pay any attention to it.
 	 */
-	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
-		xfs_trans_apply_sb_deltas(tp);
-	xfs_trans_apply_dquot_deltas(tp);
+	IOP_UNPIN(lip, 0);
+}
+
1381/*
1382 * This is typically called by the LM when a transaction has been fully
1383 * committed to disk. It needs to unpin the items which have
1384 * been logged by the transaction and update their positions
1385 * in the AIL if necessary.
1386 *
1387 * This also gets called when the transactions didn't get written out
1388 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
1389 */
1390STATIC void
1391xfs_trans_committed(
1392 struct xfs_trans *tp,
1393 int abortflag)
1394{
1395 struct xfs_log_item_desc *lidp, *next;
1396
1397 /* Call the transaction's completion callback if there is one. */
1398 if (tp->t_callback != NULL)
1399 tp->t_callback(tp, tp->t_callarg);
1400
1401 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1402 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
1403 xfs_trans_free_item_desc(lidp);
1404 }
1405
1406 xfs_trans_free(tp);
1407}
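
Note: the t_callback/t_callarg pair invoked above is installed by the committing caller before the transaction goes to disk. A hypothetical sketch of such a completion hook — the function name and the completion variable are illustrative only, derived from the fields shown above:

	STATIC void
	xfs_foo_commit_done(
		struct xfs_trans	*tp,
		void			*arg)
	{
		/* called by the log manager once tp is fully on disk */
		complete((struct completion *)arg);
	}

		/* caller side, before committing: */
		tp->t_callback = xfs_foo_commit_done;
		tp->t_callarg = &done;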
1408
1409/*
1410 * Called from the trans_commit code when we notice that
1411 * the filesystem is in the middle of a forced shutdown.
1412 */
1413STATIC void
1414xfs_trans_uncommit(
1415 struct xfs_trans *tp,
1416 uint flags)
1417{
1418 struct xfs_log_item_desc *lidp;
1419
1420 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
1421 /*
1422 * Unpin all but those that aren't dirty.
1423 */
1424 if (lidp->lid_flags & XFS_LID_DIRTY)
1425 IOP_UNPIN(lidp->lid_item, 1);
1426 }
1427
1428 xfs_trans_unreserve_and_mod_sb(tp);
1429 xfs_trans_unreserve_and_mod_dquots(tp);
1430
1431 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1432 xfs_trans_free(tp);
1433}
1434
1435/*
1436 * Format the transaction direct to the iclog. This isolates the physical
1437 * transaction commit operation from the logical operation and hence allows
1438 * other methods to be introduced without affecting the existing commit path.
1439 */
1440static int
1441xfs_trans_commit_iclog(
1442 struct xfs_mount *mp,
1443 struct xfs_trans *tp,
1444 xfs_lsn_t *commit_lsn,
1445 int flags)
1446{
1447 int shutdown;
1448 int error;
1449 int log_flags = 0;
1450 struct xlog_in_core *commit_iclog;
1451#define XFS_TRANS_LOGVEC_COUNT 16
1452 struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
1453 struct xfs_log_iovec *log_vector;
1454 uint nvec;
1455
 
 	/*
 	 * Ask each log item how many log_vector entries it will
@@ -861,8 +1462,7 @@ shut_us_down:
 	 */
 	nvec = xfs_trans_count_vecs(tp);
 	if (nvec == 0) {
-		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-		goto shut_us_down;
+		return ENOMEM;	/* triggers a shutdown! */
 	} else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
 		log_vector = log_vector_fast;
 	} else {
@@ -877,6 +1477,9 @@ shut_us_down:
 	 */
 	xfs_trans_fill_vecs(tp, log_vector);
 
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
 	error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
 
 	/*
@@ -884,18 +1487,19 @@ shut_us_down:
 	 * at any time after this call.  However, all the items associated
 	 * with the transaction are still locked and pinned in memory.
 	 */
-	commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
+	*commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
 
-	tp->t_commit_lsn = commit_lsn;
-	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
+	tp->t_commit_lsn = *commit_lsn;
+	trace_xfs_trans_commit_lsn(tp);
+
+	if (nvec > XFS_TRANS_LOGVEC_COUNT)
 		kmem_free(log_vector);
-	}
 
 	/*
 	 * If we got a log write error. Unpin the logitems that we
 	 * had pinned, clean up, free trans structure, and return error.
 	 */
-	if (error || commit_lsn == -1) {
+	if (error || *commit_lsn == -1) {
 		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
 		return XFS_ERROR(EIO);
@@ -909,8 +1513,6 @@ shut_us_down:
 	 */
 	xfs_trans_unreserve_and_mod_sb(tp);
 
-	sync = tp->t_flags & XFS_TRANS_SYNC;
-
 	/*
 	 * Tell the LM to call the transaction completion routine
 	 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1555,7 @@ shut_us_down:
 	 * the commit lsn of this transaction for dependency tracking
 	 * purposes.
 	 */
-	xfs_trans_unlock_items(tp, commit_lsn);
+	xfs_trans_unlock_items(tp, *commit_lsn);
 
 	/*
 	 * If we detected a log error earlier, finish committing
@@ -973,156 +1575,195 @@ shut_us_down:
 	 * and the items are released we can finally allow the iclog to
 	 * go to disk.
 	 */
-	error = xfs_log_release_iclog(mp, commit_iclog);
-
-	/*
-	 * If the transaction needs to be synchronous, then force the
-	 * log out now and wait for it.
-	 */
-	if (sync) {
-		if (!error) {
-			error = _xfs_log_force_lsn(mp, commit_lsn,
-				      XFS_LOG_SYNC, log_flushed);
-		}
-		XFS_STATS_INC(xs_trans_sync);
-	} else {
-		XFS_STATS_INC(xs_trans_async);
-	}
-
-	return (error);
+	return xfs_log_release_iclog(mp, commit_iclog);
 }
 
-
 /*
- * Total up the number of log iovecs needed to commit this
- * transaction.  The transaction itself needs one for the
- * transaction header.  Ask each dirty item in turn how many
- * it needs to get the total.
+ * Walk the log items and allocate log vector structures for
+ * each item large enough to fit all the vectors they require.
+ * Note that this format differs from the old log vector format in
+ * that there is no transaction header in these log vectors.
  */
-STATIC uint
-xfs_trans_count_vecs(
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
 	xfs_trans_t	*tp)
 {
-	int			nvecs;
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_item_desc *lidp;
+	struct xfs_log_vec	*lv = NULL;
+	struct xfs_log_vec	*ret_lv = NULL;
 
-	nvecs = 1;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
 
-	/* In the non-debug case we need to start bailing out if we
-	 * didn't find a log_item here, return zero and let trans_commit
-	 * deal with it.
-	 */
-	if (lidp == NULL)
-		return 0;
+	/* Bail out if we didn't find a log item.  */
+	if (list_empty(&tp->t_items)) {
+		ASSERT(0);
+		return NULL;
+	}
 
-	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		struct xfs_log_vec *new_lv;
+
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
 			continue;
-		}
+
+		/* Skip items that do not have any vectors for writing */
 		lidp->lid_size = IOP_SIZE(lidp->lid_item);
-		nvecs += lidp->lid_size;
-		lidp = xfs_trans_next_item(tp, lidp);
+		if (!lidp->lid_size)
+			continue;
+
+		new_lv = kmem_zalloc(sizeof(*new_lv) +
+				lidp->lid_size * sizeof(struct xfs_log_iovec),
+				KM_SLEEP);
+
+		/* The allocated iovec region lies beyond the log vector. */
+		new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+		new_lv->lv_niovecs = lidp->lid_size;
+		new_lv->lv_item = lidp->lid_item;
+		if (!ret_lv)
+			ret_lv = new_lv;
+		else
+			lv->lv_next = new_lv;
+		lv = new_lv;
 	}
 
-	return nvecs;
+	return ret_lv;
 }
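
Note: the function above returns the head of a singly linked chain of xfs_log_vec structures, one per dirty item, each with its iovec array allocated in the same slab object. A sketch of how a consumer might walk it — the CIL formatting code itself is outside this hunk; this loop simply mirrors what xfs_trans_fill_vecs() does for the old path:

	struct xfs_log_vec	*lv;

	for (lv = log_vector; lv != NULL; lv = lv->lv_next) {
		/* lv_niovecs iovecs live contiguously at lv_iovecp */
		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
		IOP_PIN(lv->lv_item);
	}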
 
-/*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
- */
-STATIC void
-xfs_trans_uncommit(
-	xfs_trans_t	*tp,
-	uint		flags)
+static int
+xfs_trans_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
 {
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*log_vector;
+	int			error;
 
-	for (lidp = xfs_trans_first_item(tp);
-	     lidp != NULL;
-	     lidp = xfs_trans_next_item(tp, lidp)) {
-		/*
-		 * Unpin all but those that aren't dirty.
-		 */
-		if (lidp->lid_flags & XFS_LID_DIRTY)
-			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
-	}
+	/*
+	 * Get each log item to allocate a vector structure for
+	 * the log item to pass to the log write code.  The
+	 * CIL commit code will format the vector and save it away.
+	 */
+	log_vector = xfs_trans_alloc_log_vecs(tp);
+	if (!log_vector)
+		return ENOMEM;
 
-	xfs_trans_unreserve_and_mod_sb(tp);
-	xfs_trans_unreserve_and_mod_dquots(tp);
+	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+	if (error)
+		return error;
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	xfs_trans_free(tp);
+	return 0;
 }
 
 /*
- * Fill in the vector with pointers to data to be logged
- * by this transaction.  The transaction header takes
- * the first vector, and then each dirty item takes the
- * number of vectors it indicated it needed in xfs_trans_count_vecs().
+ * xfs_trans_commit
  *
- * As each item fills in the entries it needs, also pin the item
- * so that it cannot be flushed out until the log write completes.
+ * Commit the given transaction to the log a/synchronously.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
  */
-STATIC void
-xfs_trans_fill_vecs(
-	xfs_trans_t		*tp,
-	xfs_log_iovec_t		*log_vector)
+int
+_xfs_trans_commit(
+	struct xfs_trans	*tp,
+	uint			flags,
+	int			*log_flushed)
 {
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_iovec_t		*vecp;
-	uint			nitems;
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_lsn_t		commit_lsn = -1;
+	int			error = 0;
+	int			log_flags = 0;
+	int			sync = tp->t_flags & XFS_TRANS_SYNC;
 
 	/*
-	 * Skip over the entry for the transaction header, we'll
-	 * fill that in at the end.
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
 	 */
-	vecp = log_vector + 1;		/* pointer arithmetic */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	}
 
-	nitems = 0;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
-	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
-			continue;
-		}
-		/*
-		 * The item may be marked dirty but not log anything.
-		 * This can be used to get called when a transaction
-		 * is committed.
-		 */
-		if (lidp->lid_size) {
-			nitems++;
-		}
-		IOP_FORMAT(lidp->lid_item, vecp);
-		vecp += lidp->lid_size;	/* pointer arithmetic */
-		IOP_PIN(lidp->lid_item);
-		lidp = xfs_trans_next_item(tp, lidp);
-	}
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+	if (!(tp->t_flags & XFS_TRANS_DIRTY))
+		goto out_unreserve;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	ASSERT(tp->t_ticket != NULL);
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+		xfs_trans_apply_sb_deltas(tp);
+	xfs_trans_apply_dquot_deltas(tp);
+
+	if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+		error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+	else
+		error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+
+	if (error == ENOMEM) {
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.
+	 */
+	if (sync) {
+		if (!error) {
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
+		}
+		XFS_STATS_INC(xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xs_trans_async);
+	}
+
+	return error;
+
+out_unreserve:
+	xfs_trans_unreserve_and_mod_sb(tp);
 
 	/*
-	 * Now that we've counted the number of items in this
-	 * transaction, fill in the transaction header.
+	 * It is indeed possible for the transaction to be not dirty but
+	 * the dqinfo portion to be.  All that means is that we have some
+	 * (non-persistent) quota reservations that need to be unreserved.
 	 */
-	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
-	tp->t_header.th_type = tp->t_type;
-	tp->t_header.th_num_items = nitems;
-	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
-	log_vector->i_len = sizeof(xfs_trans_header_t);
-	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
-}
+	xfs_trans_unreserve_and_mod_dquots(tp);
+	if (tp->t_ticket) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+		if (commit_lsn == -1 && !error)
+			error = XFS_ERROR(EIO);
+	}
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+	xfs_trans_free(tp);
 
+	XFS_STATS_INC(xs_trans_empty);
+	return error;
+}
 
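Note: the comment block above implies a strict caller pattern — commit consumes the transaction on both success and failure, so xfs_trans_cancel() is only valid before commit, and nothing may touch tp afterwards. A sketch under the assumption that xfs_trans_commit(tp, flags) remains the usual wrapper calling _xfs_trans_commit() with a NULL log_flushed:

	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);	/* nothing joined yet */
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);	/* trans presumably owns the lock ref now */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	error = xfs_trans_commit(tp, 0);
	/* success or failure, tp and its joined items are gone */
	return error;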
 /*
  * Unlock all of the transaction's items and free the transaction.
@@ -1138,12 +1779,6 @@ xfs_trans_cancel(
 	int			flags)
 {
 	int			log_flags;
-#ifdef DEBUG
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_item_t		*lip;
-	int			i;
-#endif
 	xfs_mount_t		*mp = tp->t_mountp;
 
 	/*
@@ -1162,21 +1797,11 @@ xfs_trans_cancel(
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	}
 #ifdef DEBUG
-	if (!(flags & XFS_TRANS_ABORT)) {
-		licp = &(tp->t_items);
-		while (licp != NULL) {
-			lidp = licp->lic_descs;
-			for (i = 0; i < licp->lic_unused; i++, lidp++) {
-				if (xfs_lic_isfree(licp, i)) {
-					continue;
-				}
-
-				lip = lidp->lid_item;
-				if (!XFS_FORCED_SHUTDOWN(mp))
-					ASSERT(!(lip->li_type == XFS_LI_EFD));
-			}
-			licp = licp->lic_next;
-		}
-	}
+	if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+		struct xfs_log_item_desc *lidp;
+
+		list_for_each_entry(lidp, &tp->t_items, lid_trans)
+			ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
+	}
 #endif
 	xfs_trans_unreserve_and_mod_sb(tp);
@@ -1195,25 +1820,10 @@ xfs_trans_cancel(
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
 	xfs_trans_free(tp);
 }
 
1203
1204/*
1205 * Free the transaction structure. If there is more clean up
1206 * to do when the structure is freed, add it here.
1207 */
1208STATIC void
1209xfs_trans_free(
1210 xfs_trans_t *tp)
1211{
1212 atomic_dec(&tp->t_mountp->m_active_trans);
1213 xfs_trans_free_dqinfo(tp);
1214 kmem_zone_free(xfs_trans_zone, tp);
1215}
1216
 /*
  * Roll from one trans in the sequence of PERMANENT transactions to
  * the next: permanent transactions are only flushed out when
@@ -1279,178 +1889,6 @@ xfs_trans_roll(
 	if (error)
 		return error;
 
-	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(trans, dp);
+	xfs_trans_ijoin(trans, dp);
 	return 0;
 }
1286
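
Note: the two-argument xfs_trans_ijoin() above is what lets xfs_trans_roll() re-join the same inode to each successor transaction. A hypothetical loop shaped like the callers of xfs_trans_roll() (names and termination condition illustrative only):

	/* keep ip locked and attached across a chain of transactions */
	while (!done) {
		/* ...log at most one reservation's worth of changes to tp... */

		error = xfs_trans_roll(&tp, ip);
		if (error)
			return error;	/* old trans already committed or cancelled */
	}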
1287/*
1288 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1289 *
1290 * This is typically called by the LM when a transaction has been fully
1291 * committed to disk. It needs to unpin the items which have
1292 * been logged by the transaction and update their positions
1293 * in the AIL if necessary.
1294 * This also gets called when the transactions didn't get written out
1295 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
1296 *
1297 * Call xfs_trans_chunk_committed() to process the items in
1298 * each chunk.
1299 */
1300STATIC void
1301xfs_trans_committed(
1302 xfs_trans_t *tp,
1303 int abortflag)
1304{
1305 xfs_log_item_chunk_t *licp;
1306 xfs_log_item_chunk_t *next_licp;
1307 xfs_log_busy_chunk_t *lbcp;
1308 xfs_log_busy_slot_t *lbsp;
1309 int i;
1310
1311 /*
1312 * Call the transaction's completion callback if there
1313 * is one.
1314 */
1315 if (tp->t_callback != NULL) {
1316 tp->t_callback(tp, tp->t_callarg);
1317 }
1318
1319 /*
1320 * Special case the chunk embedded in the transaction.
1321 */
1322 licp = &(tp->t_items);
1323 if (!(xfs_lic_are_all_free(licp))) {
1324 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1325 }
1326
1327 /*
1328 * Process the items in each chunk in turn.
1329 */
1330 licp = licp->lic_next;
1331 while (licp != NULL) {
1332 ASSERT(!xfs_lic_are_all_free(licp));
1333 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1334 next_licp = licp->lic_next;
1335 kmem_free(licp);
1336 licp = next_licp;
1337 }
1338
1339 /*
1340 * Clear all the per-AG busy list items listed in this transaction
1341 */
1342 lbcp = &tp->t_busy;
1343 while (lbcp != NULL) {
1344 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1345 if (!XFS_LBC_ISFREE(lbcp, i)) {
1346 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1347 lbsp->lbc_idx);
1348 }
1349 }
1350 lbcp = lbcp->lbc_next;
1351 }
1352 xfs_trans_free_busy(tp);
1353
1354 /*
1355 * That's it for the transaction structure. Free it.
1356 */
1357 xfs_trans_free(tp);
1358}
1359
1360/*
1361 * This is called to perform the commit processing for each
1362 * item described by the given chunk.
1363 *
1364 * The commit processing consists of unlocking items which were
1365 * held locked with the SYNC_UNLOCK attribute, calling the committed
1366 * routine of each logged item, updating the item's position in the AIL
1367 * if necessary, and unpinning each item. If the committed routine
1368 * returns -1, then do nothing further with the item because it
1369 * may have been freed.
1370 *
1371 * Since items are unlocked when they are copied to the incore
1372 * log, it is possible for two transactions to be completing
1373 * and manipulating the same item simultaneously. The AIL lock
1374 * will protect the lsn field of each item. The value of this
1375 * field can never go backwards.
1376 *
1377 * We unpin the items after repositioning them in the AIL, because
1378 * otherwise they could be immediately flushed and we'd have to race
1379 * with the flusher trying to pull the item from the AIL as we add it.
1380 */
1381STATIC void
1382xfs_trans_chunk_committed(
1383 xfs_log_item_chunk_t *licp,
1384 xfs_lsn_t lsn,
1385 int aborted)
1386{
1387 xfs_log_item_desc_t *lidp;
1388 xfs_log_item_t *lip;
1389 xfs_lsn_t item_lsn;
1390 int i;
1391
1392 lidp = licp->lic_descs;
1393 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1394 struct xfs_ail *ailp;
1395
1396 if (xfs_lic_isfree(licp, i)) {
1397 continue;
1398 }
1399
1400 lip = lidp->lid_item;
1401 if (aborted)
1402 lip->li_flags |= XFS_LI_ABORTED;
1403
1404 /*
1405 * Send in the ABORTED flag to the COMMITTED routine
1406 * so that it knows whether the transaction was aborted
1407 * or not.
1408 */
1409 item_lsn = IOP_COMMITTED(lip, lsn);
1410
1411 /*
1412 * If the committed routine returns -1, make
1413 * no more references to the item.
1414 */
1415 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1416 continue;
1417 }
1418
1419 /*
1420 * If the returned lsn is greater than what it
1421 * contained before, update the location of the
1422 * item in the AIL. If it is not, then do nothing.
1423 * Items can never move backwards in the AIL.
1424 *
1425 * While the new lsn should usually be greater, it
1426 * is possible that a later transaction completing
1427 * simultaneously with an earlier one using the
1428 * same item could complete first with a higher lsn.
1429 * This would cause the earlier transaction to fail
1430 * the test below.
1431 */
1432 ailp = lip->li_ailp;
1433 spin_lock(&ailp->xa_lock);
1434 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1435 /*
1436 * This will set the item's lsn to item_lsn
1437 * and update the position of the item in
1438 * the AIL.
1439 *
1440 * xfs_trans_ail_update() drops the AIL lock.
1441 */
1442 xfs_trans_ail_update(ailp, lip, item_lsn);
1443 } else {
1444 spin_unlock(&ailp->xa_lock);
1445 }
1446
1447 /*
1448 * Now that we've repositioned the item in the AIL,
1449 * unpin it so it can be flushed. Pass information
1450 * about buffer stale state down from the log item
1451 * flags, if anyone else stales the buffer we do not
1452 * want to pay any attention to it.
1453 */
1454 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1455 }
1456}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab9dfff..c13c0f97b494 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_DQUOT		0x123d
 #define	XFS_LI_QUOTAOFF		0x123e
 
+#define XFS_LI_TYPE_DESC \
+	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
+	{ XFS_LI_EFD,		"XFS_LI_EFD" }, \
+	{ XFS_LI_IUNLINK,	"XFS_LI_IUNLINK" }, \
+	{ XFS_LI_INODE,		"XFS_LI_INODE" }, \
+	{ XFS_LI_BUF,		"XFS_LI_BUF" }, \
+	{ XFS_LI_DQUOT,		"XFS_LI_DQUOT" }, \
+	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }
+
 /*
  * Transaction types.  Used to distinguish types of buffers.
  */
@@ -97,7 +106,8 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFSRT_FREE		39
 #define XFS_TRANS_SWAPEXT		40
 #define XFS_TRANS_SB_COUNT		41
-#define XFS_TRANS_TYPE_MAX		41
+#define XFS_TRANS_CHECKPOINT		42
+#define XFS_TRANS_TYPE_MAX		42
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -139,6 +149,7 @@ typedef struct xfs_trans_header {
 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
 	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
 	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
 	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
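
Note: both the new XFS_LI_TYPE_DESC list and this XFS_TRANS_TYPES list expand to brace-enclosed value/name pairs, which is the shape __print_symbolic() expects in tracepoint definitions. A sketch of the presumed consumer, a tracepoint's print format:

	TP_printk("type %s", __print_symbolic(__entry->type, XFS_TRANS_TYPES))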
@@ -150,106 +161,14 @@ typedef struct xfs_trans_header {
  * the amount of space needed to log the item it describes
  * once we get to commit processing (see xfs_trans_commit()).
  */
-typedef struct xfs_log_item_desc {
+struct xfs_log_item_desc {
 	struct xfs_log_item	*lid_item;
 	ushort			lid_size;
 	unsigned char		lid_flags;
-	unsigned char		lid_index;
-} xfs_log_item_desc_t;
+	struct list_head	lid_trans;
+};
 
 #define XFS_LID_DIRTY		0x1
161#define XFS_LID_PINNED 0x2
162#define XFS_LID_BUF_STALE 0x8
163
164/*
165 * This structure is used to maintain a chunk list of log_item_desc
166 * structures. The free field is a bitmask indicating which descriptors
167 * in this chunk's array are free. The unused field is the first value
168 * not used since this chunk was allocated.
169 */
170#define XFS_LIC_NUM_SLOTS 15
171typedef struct xfs_log_item_chunk {
172 struct xfs_log_item_chunk *lic_next;
173 ushort lic_free;
174 ushort lic_unused;
175 xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS];
176} xfs_log_item_chunk_t;
177
178#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1)
179#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1)
180
181
182/*
183 * Initialize the given chunk. Set the chunk's free descriptor mask
184 * to indicate that all descriptors are free. The caller gets to set
185 * lic_unused to the right value (0 matches all free). The
186 * lic_descs.lid_index values are set up as each desc is allocated.
187 */
188static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
189{
190 cp->lic_free = XFS_LIC_FREEMASK;
191}
192
193static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
194{
195 cp->lic_descs[slot].lid_index = (unsigned char)(slot);
196}
197
198static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
199{
200 return cp->lic_free & XFS_LIC_FREEMASK;
201}
202
203static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
204{
205 cp->lic_free = XFS_LIC_FREEMASK;
206}
207
208static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
209{
210 return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
211}
212
213static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
214{
215 return (cp->lic_free & (1 << slot));
216}
217
218static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
219{
220 cp->lic_free &= ~(1 << slot);
221}
222
223static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
224{
225 cp->lic_free |= 1 << slot;
226}
227
228static inline xfs_log_item_desc_t *
229xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
230{
231 return &(cp->lic_descs[slot]);
232}
233
234static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
235{
236 return (uint)dp->lid_index;
237}
238
239/*
240 * Calculate the address of a chunk given a descriptor pointer:
241 * dp - dp->lid_index give the address of the start of the lic_descs array.
242 * From this we subtract the offset of the lic_descs field in a chunk.
243 * All of this yields the address of the chunk, which is
244 * cast to a chunk pointer.
245 */
246static inline xfs_log_item_chunk_t *
247xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
248{
249 return (xfs_log_item_chunk_t*) \
250 (((xfs_caddr_t)((dp) - (dp)->lid_index)) - \
251 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
252}
 
 #define	XFS_TRANS_MAGIC		0x5452414E	/* 'TRAN' */
 /*
@@ -265,8 +184,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * Values for call flags parameter.
  */
-#define	XFS_TRANS_NOSLEEP		0x1
-#define	XFS_TRANS_WAIT			0x2
 #define	XFS_TRANS_RELEASE_LOG_RES	0x4
 #define	XFS_TRANS_ABORT			0x8
 
@@ -290,24 +207,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 
 
 /*
- * Various log reservation values.
- * These are based on the size of the file system block
- * because that is what most transactions manipulate.
- * Each adds in an additional 128 bytes per item logged to
- * try to account for the overhead of the transaction mechanism.
- *
- * Note:
- * Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish()
- * call.  This is because the number in the worst case is quite high
- * and quite unusual.  In order to fix this we need to change
- * xfs_bmap_finish() to free extents in only a single AG at a time.
- * This will require changes to the EFI code as well, however, so that
- * the EFI for the extents not freed is logged again in each transaction.
- * See bug 261917.
- */
-
-/*
  * Per-extent log reservation for the allocation btree changes
  * involved in freeing or allocating an extent.
  * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -331,429 +230,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
 	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
 
334/*
335 * In a write transaction we can allocate a maximum of 2
336 * extents. This gives:
337 * the inode getting the new extents: inode size
338 * the inode's bmap btree: max depth * block size
339 * the agfs of the ags from which the extents are allocated: 2 * sector
340 * the superblock free block counter: sector size
341 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
342 * And the bmap_finish transaction can free bmap blocks in a join:
343 * the agfs of the ags containing the blocks: 2 * sector size
344 * the agfls of the ags containing the blocks: 2 * sector size
345 * the super block free block counter: sector size
346 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
347 */
348#define XFS_CALC_WRITE_LOG_RES(mp) \
349 (MAX( \
350 ((mp)->m_sb.sb_inodesize + \
351 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
352 (2 * (mp)->m_sb.sb_sectsize) + \
353 (mp)->m_sb.sb_sectsize + \
354 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
355 (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
356 ((2 * (mp)->m_sb.sb_sectsize) + \
357 (2 * (mp)->m_sb.sb_sectsize) + \
358 (mp)->m_sb.sb_sectsize + \
359 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
360 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
 
 #define	XFS_WRITE_LOG_RES(mp)	((mp)->m_reservations.tr_write)
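
Note: to put a number on the deleted XFS_CALC_WRITE_LOG_RES() formula above, a worked example under assumed geometry — 4096-byte blocks, 512-byte sectors, 256-byte inodes, a data-fork bmap depth of 5 and a 3-level allocation btree, so XFS_ALLOCFREE_LOG_RES(mp, 2) = 2 exts * 2 trees * (2*3 - 1) * 4096 = 81920 and XFS_ALLOCFREE_LOG_COUNT(mp, 2) = 20:

	arm 1: 256 + 5*4096 + 2*512 + 512 + 81920 + 128*(4 + 5 + 20) = 107904
	arm 2: 2*512 + 2*512 + 512 + 81920 + 128*(5 + 20)            =  87680

so tr_write would be seeded with MAX(arm 1, arm 2), roughly 105 KiB, for this hypothetical filesystem.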
363
364/*
365 * In truncating a file we free up to two extents at once. We can modify:
366 * the inode being truncated: inode size
367 * the inode's bmap btree: (max depth + 1) * block size
368 * And the bmap_finish transaction can free the blocks and bmap blocks:
369 * the agf for each of the ags: 4 * sector size
370 * the agfl for each of the ags: 4 * sector size
371 * the super block to reflect the freed blocks: sector size
372 * worst case split in allocation btrees per extent assuming 4 extents:
373 * 4 exts * 2 trees * (2 * max depth - 1) * block size
374 * the inode btree: max depth * blocksize
375 * the allocation btrees: 2 trees * (max depth - 1) * block size
376 */
377#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
378 (MAX( \
379 ((mp)->m_sb.sb_inodesize + \
380 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
381 (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
382 ((4 * (mp)->m_sb.sb_sectsize) + \
383 (4 * (mp)->m_sb.sb_sectsize) + \
384 (mp)->m_sb.sb_sectsize + \
385 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
386 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
387 (128 * 5) + \
388 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
389 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
390 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
391
 #define	XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
393
394/*
395 * In renaming a files we can modify:
396 * the four inodes involved: 4 * inode size
397 * the two directory btrees: 2 * (max depth + v2) * dir block size
398 * the two directory bmap btrees: 2 * max depth * block size
399 * And the bmap_finish transaction can free dir and bmap blocks (two sets
400 * of bmap blocks) giving:
401 * the agf for the ags in which the blocks live: 3 * sector size
402 * the agfl for the ags in which the blocks live: 3 * sector size
403 * the superblock for the free block count: sector size
404 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
405 */
406#define XFS_CALC_RENAME_LOG_RES(mp) \
407 (MAX( \
408 ((4 * (mp)->m_sb.sb_inodesize) + \
409 (2 * XFS_DIROP_LOG_RES(mp)) + \
410 (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
411 ((3 * (mp)->m_sb.sb_sectsize) + \
412 (3 * (mp)->m_sb.sb_sectsize) + \
413 (mp)->m_sb.sb_sectsize + \
414 XFS_ALLOCFREE_LOG_RES(mp, 3) + \
415 (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
416
 #define	XFS_RENAME_LOG_RES(mp)	((mp)->m_reservations.tr_rename)
418
419/*
420 * For creating a link to an inode:
421 * the parent directory inode: inode size
422 * the linked inode: inode size
423 * the directory btree could split: (max depth + v2) * dir block size
424 * the directory bmap btree could join or split: (max depth + v2) * blocksize
425 * And the bmap_finish transaction can free some bmap blocks giving:
426 * the agf for the ag in which the blocks live: sector size
427 * the agfl for the ag in which the blocks live: sector size
428 * the superblock for the free block count: sector size
429 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
430 */
431#define XFS_CALC_LINK_LOG_RES(mp) \
432 (MAX( \
433 ((mp)->m_sb.sb_inodesize + \
434 (mp)->m_sb.sb_inodesize + \
435 XFS_DIROP_LOG_RES(mp) + \
436 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
437 ((mp)->m_sb.sb_sectsize + \
438 (mp)->m_sb.sb_sectsize + \
439 (mp)->m_sb.sb_sectsize + \
440 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
441 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
442
 #define	XFS_LINK_LOG_RES(mp)	((mp)->m_reservations.tr_link)
444
445/*
446 * For removing a directory entry we can modify:
447 * the parent directory inode: inode size
448 * the removed inode: inode size
449 * the directory btree could join: (max depth + v2) * dir block size
450 * the directory bmap btree could join or split: (max depth + v2) * blocksize
451 * And the bmap_finish transaction can free the dir and bmap blocks giving:
452 * the agf for the ag in which the blocks live: 2 * sector size
453 * the agfl for the ag in which the blocks live: 2 * sector size
454 * the superblock for the free block count: sector size
455 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
456 */
457#define XFS_CALC_REMOVE_LOG_RES(mp) \
458 (MAX( \
459 ((mp)->m_sb.sb_inodesize + \
460 (mp)->m_sb.sb_inodesize + \
461 XFS_DIROP_LOG_RES(mp) + \
462 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
463 ((2 * (mp)->m_sb.sb_sectsize) + \
464 (2 * (mp)->m_sb.sb_sectsize) + \
465 (mp)->m_sb.sb_sectsize + \
466 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
467 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
468
 #define	XFS_REMOVE_LOG_RES(mp)	((mp)->m_reservations.tr_remove)
470
471/*
472 * For symlink we can modify:
473 * the parent directory inode: inode size
474 * the new inode: inode size
475 * the inode btree entry: 1 block
476 * the directory btree: (max depth + v2) * dir block size
477 * the directory inode's bmap btree: (max depth + v2) * block size
478 * the blocks for the symlink: 1 kB
479 * Or in the first xact we allocate some inodes giving:
480 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
481 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
482 * the inode btree: max depth * blocksize
483 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
484 */
485#define XFS_CALC_SYMLINK_LOG_RES(mp) \
486 (MAX( \
487 ((mp)->m_sb.sb_inodesize + \
488 (mp)->m_sb.sb_inodesize + \
489 XFS_FSB_TO_B(mp, 1) + \
490 XFS_DIROP_LOG_RES(mp) + \
491 1024 + \
492 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
493 (2 * (mp)->m_sb.sb_sectsize + \
494 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
495 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
496 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
497 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
498 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
499
 #define	XFS_SYMLINK_LOG_RES(mp)	((mp)->m_reservations.tr_symlink)
501
502/*
503 * For create we can modify:
504 * the parent directory inode: inode size
505 * the new inode: inode size
506 * the inode btree entry: block size
507 * the superblock for the nlink flag: sector size
508 * the directory btree: (max depth + v2) * dir block size
509 * the directory inode's bmap btree: (max depth + v2) * block size
510 * Or in the first xact we allocate some inodes giving:
511 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
512 * the superblock for the nlink flag: sector size
513 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
514 * the inode btree: max depth * blocksize
515 * the allocation btrees: 2 trees * (max depth - 1) * block size
516 */
517#define XFS_CALC_CREATE_LOG_RES(mp) \
518 (MAX( \
519 ((mp)->m_sb.sb_inodesize + \
520 (mp)->m_sb.sb_inodesize + \
521 (mp)->m_sb.sb_sectsize + \
522 XFS_FSB_TO_B(mp, 1) + \
523 XFS_DIROP_LOG_RES(mp) + \
524 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
525 (3 * (mp)->m_sb.sb_sectsize + \
526 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
527 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
528 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
529 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
530 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
531
 #define	XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
533
534/*
535 * Making a new directory is the same as creating a new file.
536 */
537#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
538
 #define	XFS_MKDIR_LOG_RES(mp)	((mp)->m_reservations.tr_mkdir)
540
541/*
542 * In freeing an inode we can modify:
543 * the inode being freed: inode size
544 * the super block free inode counter: sector size
545 * the agi hash list and counters: sector size
546 * the inode btree entry: block size
547 * the on disk inode before ours in the agi hash list: inode cluster size
548 * the inode btree: max depth * blocksize
549 * the allocation btrees: 2 trees * (max depth - 1) * block size
550 */
551#define XFS_CALC_IFREE_LOG_RES(mp) \
552 ((mp)->m_sb.sb_inodesize + \
553 (mp)->m_sb.sb_sectsize + \
554 (mp)->m_sb.sb_sectsize + \
555 XFS_FSB_TO_B((mp), 1) + \
556 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
557 (128 * 5) + \
558 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
559 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
560 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
561
562
 #define	XFS_IFREE_LOG_RES(mp)	((mp)->m_reservations.tr_ifree)
564
565/*
566 * When only changing the inode we log the inode and possibly the superblock
567 * We also add a bit of slop for the transaction stuff.
568 */
569#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
570 (mp)->m_sb.sb_sectsize + 512)
571
 #define	XFS_ICHANGE_LOG_RES(mp)	((mp)->m_reservations.tr_ichange)
573
574/*
575 * Growing the data section of the filesystem.
576 * superblock
577 * agi and agf
578 * allocation btrees
579 */
580#define XFS_CALC_GROWDATA_LOG_RES(mp) \
581 ((mp)->m_sb.sb_sectsize * 3 + \
582 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
583 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
584
 #define	XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
586
587/*
588 * Growing the rt section of the filesystem.
589 * In the first set of transactions (ALLOC) we allocate space to the
590 * bitmap or summary files.
591 * superblock: sector size
592 * agf of the ag from which the extent is allocated: sector size
593 * bmap btree for bitmap/summary inode: max depth * blocksize
594 * bitmap/summary inode: inode size
595 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
596 */
597#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
598 (2 * (mp)->m_sb.sb_sectsize + \
599 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
600 (mp)->m_sb.sb_inodesize + \
601 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
602 (128 * \
603 (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
604 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
605
 #define	XFS_GROWRTALLOC_LOG_RES(mp)	((mp)->m_reservations.tr_growrtalloc)
607
608/*
609 * Growing the rt section of the filesystem.
610 * In the second set of transactions (ZERO) we zero the new metadata blocks.
611 * one bitmap/summary block: blocksize
612 */
613#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
614 ((mp)->m_sb.sb_blocksize + 128)
615
 #define	XFS_GROWRTZERO_LOG_RES(mp)	((mp)->m_reservations.tr_growrtzero)
617
618/*
619 * Growing the rt section of the filesystem.
620 * In the third set of transactions (FREE) we update metadata without
621 * allocating any new blocks.
622 * superblock: sector size
623 * bitmap inode: inode size
624 * summary inode: inode size
625 * one bitmap block: blocksize
626 * summary blocks: new summary size
627 */
628#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
629 ((mp)->m_sb.sb_sectsize + \
630 2 * (mp)->m_sb.sb_inodesize + \
631 (mp)->m_sb.sb_blocksize + \
632 (mp)->m_rsumsize + \
633 (128 * 5))
634
 #define	XFS_GROWRTFREE_LOG_RES(mp)	((mp)->m_reservations.tr_growrtfree)
636
637/*
638 * Logging the inode modification timestamp on a synchronous write.
639 * inode
640 */
641#define XFS_CALC_SWRITE_LOG_RES(mp) \
642 ((mp)->m_sb.sb_inodesize + 128)
643
 #define	XFS_SWRITE_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
645
646/* 249/*
647 * Logging the inode timestamps on an fsync -- same as SWRITE 250 * Logging the inode timestamps on an fsync -- same as SWRITE
648 * as long as SWRITE logs the entire inode core 251 * as long as SWRITE logs the entire inode core
649 */ 252 */
650#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 253#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
651
652/*
653 * Logging the inode mode bits when writing a setuid/setgid file
654 * inode
655 */
656#define XFS_CALC_WRITEID_LOG_RES(mp) \
657 ((mp)->m_sb.sb_inodesize + 128)
658
 #define XFS_WRITEID_LOG_RES(mp)		((mp)->m_reservations.tr_swrite)
660
661/*
662 * Converting the inode from non-attributed to attributed.
663 * the inode being converted: inode size
664 * agf block and superblock (for block allocation)
665 * the new block (directory sized)
666 * bmap blocks for the new directory block
667 * allocation btrees
668 */
669#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
670 ((mp)->m_sb.sb_inodesize + \
671 (mp)->m_sb.sb_sectsize * 2 + \
672 (mp)->m_dirblksize + \
673 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
674 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
675 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
676 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
677
 #define XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
679
680/*
681 * Removing the attribute fork of a file
682 * the inode being truncated: inode size
683 * the inode's bmap btree: max depth * block size
684 * And the bmap_finish transaction can free the blocks and bmap blocks:
685 * the agf for each of the ags: 4 * sector size
686 * the agfl for each of the ags: 4 * sector size
687 * the super block to reflect the freed blocks: sector size
688 * worst case split in allocation btrees per extent assuming 4 extents:
689 * 4 exts * 2 trees * (2 * max depth - 1) * block size
690 */
691#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
692 (MAX( \
693 ((mp)->m_sb.sb_inodesize + \
694 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
695 (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
696 ((4 * (mp)->m_sb.sb_sectsize) + \
697 (4 * (mp)->m_sb.sb_sectsize) + \
698 (mp)->m_sb.sb_sectsize + \
699 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
700 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
701
 #define XFS_ATTRINVAL_LOG_RES(mp)	((mp)->m_reservations.tr_attrinval)
703
704/*
705 * Setting an attribute.
706 * the inode getting the attribute
707 * the superblock for allocations
708 * the agfs extents are allocated from
709 * the attribute btree * max depth
710 * the inode allocation btree
711 * Since attribute transaction space is dependent on the size of the attribute,
712 * the calculation is done partially at mount time and partially at runtime.
713 */
714#define XFS_CALC_ATTRSET_LOG_RES(mp) \
715 ((mp)->m_sb.sb_inodesize + \
716 (mp)->m_sb.sb_sectsize + \
717 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
718 (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
719
 #define XFS_ATTRSET_LOG_RES(mp, ext)	\
 	((mp)->m_reservations.tr_attrset + \
 	 (ext * (mp)->m_sb.sb_sectsize) + \
 	 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
 	 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
725
726/*
727 * Removing an attribute.
728 * the inode: inode size
729 * the attribute btree could join: max depth * block size
730 * the inode bmap btree could join or split: max depth * block size
731 * And the bmap_finish transaction can free the attr blocks freed giving:
732 * the agf for the ag in which the blocks live: 2 * sector size
733 * the agfl for the ag in which the blocks live: 2 * sector size
734 * the superblock for the free block count: sector size
735 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
736 */
737#define XFS_CALC_ATTRRM_LOG_RES(mp) \
738 (MAX( \
739 ((mp)->m_sb.sb_inodesize + \
740 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
741 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
742 (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
743 ((2 * (mp)->m_sb.sb_sectsize) + \
744 (2 * (mp)->m_sb.sb_sectsize) + \
745 (mp)->m_sb.sb_sectsize + \
746 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
747 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
748
 #define XFS_ATTRRM_LOG_RES(mp)	((mp)->m_reservations.tr_attrrm)
750
751/*
752 * Clearing a bad agino number in an agi hash bucket.
753 */
754#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
755 ((mp)->m_sb.sb_sectsize + 128)
756
 #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
 
 
@@ -805,6 +311,7 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
+struct xfs_busy_extent;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
@@ -820,6 +327,11 @@ typedef struct xfs_log_item {
 							/* buffer item iodone */
 							/* callback func */
 	struct xfs_item_ops		*li_ops;	/* function list */
+
+	/* delayed logging */
+	struct list_head		li_cil;		/* CIL pointers */
+	struct xfs_log_vec		*li_lv;		/* active log vector */
+	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;
 
 #define	XFS_LI_IN_AIL	0x1
@@ -833,8 +345,7 @@ typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
 	void (*iop_pin)(xfs_log_item_t *);
-	void (*iop_unpin)(xfs_log_item_t *, int);
-	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+	void (*iop_unpin)(xfs_log_item_t *, int remove);
 	uint (*iop_trylock)(xfs_log_item_t *);
 	void (*iop_unlock)(xfs_log_item_t *);
 	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
@@ -846,8 +357,7 @@ typedef struct xfs_item_ops {
 #define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
 #define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags)	(*(ip)->li_ops->iop_unpin)(ip, flags)
-#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_UNPIN(ip, remove)	(*(ip)->li_ops->iop_unpin)(ip, remove)
 #define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
 #define IOP_COMMITTED(ip, lsn)	(*(ip)->li_ops->iop_committed)(ip, lsn)
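
Note: with iop_unpin_remove gone, an item type's ops vector under the new API would be wired roughly as below — the "foo" item is hypothetical again, and only members visible in this patch are shown:

	static struct xfs_item_ops xfs_foo_item_ops = {
		.iop_size	= xfs_foo_item_size,
		.iop_format	= xfs_foo_item_format,
		.iop_pin	= xfs_foo_item_pin,
		.iop_unpin	= xfs_foo_item_unpin,	/* void (*)(xfs_log_item_t *, int remove) */
		.iop_trylock	= xfs_foo_item_trylock,
		.iop_unlock	= xfs_foo_item_unlock,
		.iop_committed	= xfs_foo_item_committed,
		/* remaining members (e.g. the iop_committing hook behind
		 * IOP_COMMITTING used earlier in this patch) elided */
	};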
@@ -864,34 +374,6 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_PUSHBUF	3
 
 /*
867 * This structure is used to maintain a list of block ranges that have been
868 * freed in the transaction. The ranges are listed in the perag[] busy list
869 * between when they're freed and the transaction is committed to disk.
870 */
871
872typedef struct xfs_log_busy_slot {
873 xfs_agnumber_t lbc_ag;
874 ushort lbc_idx; /* index in perag.busy[] */
875} xfs_log_busy_slot_t;
876
877#define XFS_LBC_NUM_SLOTS 31
878typedef struct xfs_log_busy_chunk {
879 struct xfs_log_busy_chunk *lbc_next;
880 uint lbc_free; /* free slots bitmask */
881 ushort lbc_unused; /* first unused */
882 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
883} xfs_log_busy_chunk_t;
884
885#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
886#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
887
888#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
889#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
890#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
891#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
892#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
893
894/*
895 * This is the type of function which can be given to xfs_trans_callback() 377 * This is the type of function which can be given to xfs_trans_callback()
896 * to be called upon the transaction's commit to disk. 378 * to be called upon the transaction's commit to disk.
897 */ 379 */
@@ -939,11 +421,9 @@ typedef struct xfs_trans {
939 int64_t t_rblocks_delta;/* superblock rblocks change */ 421 int64_t t_rblocks_delta;/* superblock rblocks change */
940 int64_t t_rextents_delta;/* superblocks rextents chg */ 422 int64_t t_rextents_delta;/* superblocks rextents chg */
941 int64_t t_rextslog_delta;/* superblocks rextslog chg */ 423 int64_t t_rextslog_delta;/* superblocks rextslog chg */
942 unsigned int t_items_free; /* log item descs free */ 424 struct list_head t_items; /* log item descriptors */
943 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
944 xfs_trans_header_t t_header; /* header for in-log trans */ 425 xfs_trans_header_t t_header; /* header for in-log trans */
945 unsigned int t_busy_free; /* busy descs free */ 426 struct list_head t_busy; /* list of busy extents */
946 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
947 unsigned long t_pflags; /* saved process flags state */ 427 unsigned long t_pflags; /* saved process flags state */
948} xfs_trans_t; 428} xfs_trans_t;
949 429
@@ -993,8 +473,8 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
993void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 473void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
994int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, 474int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
995 xfs_ino_t , uint, uint, struct xfs_inode **); 475 xfs_ino_t , uint, uint, struct xfs_inode **);
996void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint); 476void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
997void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *); 477void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
998void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 478void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
999void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); 479void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
1000struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); 480struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -1017,11 +497,9 @@ int _xfs_trans_commit(xfs_trans_t *,
1017void xfs_trans_cancel(xfs_trans_t *, int); 497void xfs_trans_cancel(xfs_trans_t *, int);
1018int xfs_trans_ail_init(struct xfs_mount *); 498int xfs_trans_ail_init(struct xfs_mount *);
1019void xfs_trans_ail_destroy(struct xfs_mount *); 499void xfs_trans_ail_destroy(struct xfs_mount *);
1020xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1021 xfs_agnumber_t ag,
1022 xfs_extlen_t idx);
1023 500
1024extern kmem_zone_t *xfs_trans_zone; 501extern kmem_zone_t *xfs_trans_zone;
502extern kmem_zone_t *xfs_log_item_desc_zone;
1025 503
1026#endif /* __KERNEL__ */ 504#endif /* __KERNEL__ */
1027 505
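The xfs_trans.h hunks above replace two fixed-size, bitmask-managed chunk structures (the log item descriptor chunks and the xfs_log_busy_chunk_t busy-extent chunks) with plain kernel list heads: t_items and t_busy become struct list_head. A minimal userspace sketch of that idiom follows; the struct list_head helpers, item_desc, and trans below are illustrative stand-ins (the real code uses <linux/list.h> and struct xfs_log_item_desc), not the kernel's definitions.

/* Minimal sketch of the list-based item tracking that replaces the old
 * slot-array chunks.  All names here are hypothetical stand-ins. */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

static void list_del(struct list_head *n)
{
	n->prev->next = n->next; n->next->prev = n->prev;
	n->prev = n->next = n;
}

/* Stand-in for struct xfs_log_item_desc: one node per joined item,
 * linked from the transaction instead of claimed from a bitmap chunk. */
struct item_desc {
	struct list_head lid_trans;	/* links into tp->t_items */
	unsigned int lid_flags;
	int id;
};

struct trans {
	struct list_head t_items;	/* was xfs_log_item_chunk_t */
	struct list_head t_busy;	/* was xfs_log_busy_chunk_t */
};

int main(void)
{
	struct trans tp;
	struct item_desc a = { .id = 1 }, b = { .id = 2 };
	struct list_head *pos;

	list_init(&tp.t_items);
	list_init(&tp.t_busy);
	list_add_tail(&a.lid_trans, &tp.t_items);	/* like xfs_trans_add_item() */
	list_add_tail(&b.lid_trans, &tp.t_items);

	/* walk, as list_for_each_entry(lidp, &tp->t_items, lid_trans) does */
	for (pos = tp.t_items.next; pos != &tp.t_items; pos = pos->next) {
		struct item_desc *d = (struct item_desc *)
			((char *)pos - offsetof(struct item_desc, lid_trans));
		printf("item %d flags %x\n", d->id, d->lid_flags);
	}

	list_del(&a.lid_trans);		/* like xfs_trans_del_item() */
	return 0;
}

With an unbounded list there is no free-slot accounting to maintain, which is why t_items_free and t_busy_free drop out of xfs_trans above.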
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index e799824f7245..dc9069568ff7 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -24,7 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
30#include "xfs_error.h" 29#include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb586360d1c9..90af025e6839 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,10 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_buf_item.h" 33#include "xfs_buf_item.h"
@@ -40,11 +36,32 @@
40#include "xfs_rw.h" 36#include "xfs_rw.h"
41#include "xfs_trace.h" 37#include "xfs_trace.h"
42 38
39/*
40 * Check to see if a buffer matching the given parameters is already
41 * a part of the given transaction.
42 */
43STATIC struct xfs_buf *
44xfs_trans_buf_item_match(
45 struct xfs_trans *tp,
46 struct xfs_buftarg *target,
47 xfs_daddr_t blkno,
48 int len)
49{
50 struct xfs_log_item_desc *lidp;
51 struct xfs_buf_log_item *blip;
52
53 len = BBTOB(len);
54 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
55 blip = (struct xfs_buf_log_item *)lidp->lid_item;
56 if (blip->bli_item.li_type == XFS_LI_BUF &&
57 XFS_BUF_TARGET(blip->bli_buf) == target &&
58 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
59 XFS_BUF_COUNT(blip->bli_buf) == len)
60 return blip->bli_buf;
61 }
43 62
44STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, 63 return NULL;
45 xfs_daddr_t, int); 64}
46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
47 xfs_daddr_t, int);
48 65
49/* 66/*
50 * Add the locked buffer to the transaction. 67 * Add the locked buffer to the transaction.
@@ -74,7 +91,7 @@ _xfs_trans_bjoin(
74 xfs_buf_item_init(bp, tp->t_mountp); 91 xfs_buf_item_init(bp, tp->t_mountp);
75 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 92 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
76 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 93 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
77 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 94 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
78 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 95 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
79 if (reset_recur) 96 if (reset_recur)
80 bip->bli_recur = 0; 97 bip->bli_recur = 0;
@@ -87,7 +104,7 @@ _xfs_trans_bjoin(
87 /* 104 /*
88 * Get a log_item_desc to point at the new item. 105 * Get a log_item_desc to point at the new item.
89 */ 106 */
90 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); 107 xfs_trans_add_item(tp, &bip->bli_item);
91 108
92 /* 109 /*
93 * Initialize b_fsprivate2 so we can find it with incore_match() 110 * Initialize b_fsprivate2 so we can find it with incore_match()
@@ -112,14 +129,6 @@ xfs_trans_bjoin(
112 * within the transaction, just increment its lock recursion count 129 * within the transaction, just increment its lock recursion count
113 * and return a pointer to it. 130 * and return a pointer to it.
114 * 131 *
115 * Use the fast path function xfs_trans_buf_item_match() or the buffer
116 * cache routine incore_match() to find the buffer
117 * if it is already owned by this transaction.
118 *
119 * If we don't already own the buffer, use get_buf() to get it.
120 * If it doesn't yet have an associated xfs_buf_log_item structure,
121 * then allocate one and add the item to this transaction.
122 *
123 * If the transaction pointer is NULL, make this just a normal 132 * If the transaction pointer is NULL, make this just a normal
124 * get_buf() call. 133 * get_buf() call.
125 */ 134 */
@@ -149,11 +158,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
149 * have it locked. In this case we just increment the lock 158 * have it locked. In this case we just increment the lock
150 * recursion count and return the buffer to the caller. 159 * recursion count and return the buffer to the caller.
151 */ 160 */
152 if (tp->t_items.lic_next == NULL) { 161 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
153 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
154 } else {
155 bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
156 }
157 if (bp != NULL) { 162 if (bp != NULL) {
158 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 163 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
159 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) 164 if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
@@ -259,14 +264,6 @@ int xfs_error_mod = 33;
259 * within the transaction and already read in, just increment its 264 * within the transaction and already read in, just increment its
260 * lock recursion count and return a pointer to it. 265 * lock recursion count and return a pointer to it.
261 * 266 *
262 * Use the fast path function xfs_trans_buf_item_match() or the buffer
263 * cache routine incore_match() to find the buffer
264 * if it is already owned by this transaction.
265 *
266 * If we don't already own the buffer, use read_buf() to get it.
267 * If it doesn't yet have an associated xfs_buf_log_item structure,
268 * then allocate one and add the item to this transaction.
269 *
270 * If the transaction pointer is NULL, make this just a normal 267 * If the transaction pointer is NULL, make this just a normal
271 * read_buf() call. 268 * read_buf() call.
272 */ 269 */
@@ -328,11 +325,7 @@ xfs_trans_read_buf(
328 * If the buffer is not yet read in, then we read it in, increment 325 * If the buffer is not yet read in, then we read it in, increment
329 * the lock recursion count, and return it to the caller. 326 * the lock recursion count, and return it to the caller.
330 */ 327 */
331 if (tp->t_items.lic_next == NULL) { 328 bp = xfs_trans_buf_item_match(tp, target, blkno, len);
332 bp = xfs_trans_buf_item_match(tp, target, blkno, len);
333 } else {
334 bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
335 }
336 if (bp != NULL) { 329 if (bp != NULL) {
337 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 330 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
338 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 331 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -467,7 +460,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
467{ 460{
468 xfs_buf_log_item_t *bip; 461 xfs_buf_log_item_t *bip;
469 xfs_log_item_t *lip; 462 xfs_log_item_t *lip;
470 xfs_log_item_desc_t *lidp;
471 463
472 /* 464 /*
473 * Default to a normal brelse() call if the tp is NULL. 465 * Default to a normal brelse() call if the tp is NULL.
@@ -495,16 +487,9 @@ xfs_trans_brelse(xfs_trans_t *tp,
495 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 487 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
496 ASSERT(bip->bli_item.li_type == XFS_LI_BUF); 488 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
497 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 489 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 490 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
499 ASSERT(atomic_read(&bip->bli_refcount) > 0); 491 ASSERT(atomic_read(&bip->bli_refcount) > 0);
500 492
501 /*
502 * Find the item descriptor pointing to this buffer's
503 * log item. It must be there.
504 */
505 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
506 ASSERT(lidp != NULL);
507
508 trace_xfs_trans_brelse(bip); 493 trace_xfs_trans_brelse(bip);
509 494
510 /* 495 /*
@@ -520,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
520 * If the buffer is dirty within this transaction, we can't 505 * If the buffer is dirty within this transaction, we can't
521 * release it until we commit. 506 * release it until we commit.
522 */ 507 */
523 if (lidp->lid_flags & XFS_LID_DIRTY) 508 if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
524 return; 509 return;
525 510
526 /* 511 /*
@@ -537,7 +522,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
537 /* 522 /*
538 * Free up the log item descriptor tracking the released item. 523 * Free up the log item descriptor tracking the released item.
539 */ 524 */
540 xfs_trans_free_item(tp, lidp); 525 xfs_trans_del_item(&bip->bli_item);
541 526
542 /* 527 /*
543 * Clear the hold flag in the buf log item if it is set. 528 * Clear the hold flag in the buf log item if it is set.
@@ -603,7 +588,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
603 588
604 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 589 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
605 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 590 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
606 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 591 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
607 ASSERT(atomic_read(&bip->bli_refcount) > 0); 592 ASSERT(atomic_read(&bip->bli_refcount) > 0);
608 bip->bli_flags |= XFS_BLI_HOLD; 593 bip->bli_flags |= XFS_BLI_HOLD;
609 trace_xfs_trans_bhold(bip); 594 trace_xfs_trans_bhold(bip);
@@ -625,7 +610,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
625 610
626 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 611 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
627 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 612 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
628 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 613 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
629 ASSERT(atomic_read(&bip->bli_refcount) > 0); 614 ASSERT(atomic_read(&bip->bli_refcount) > 0);
630 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 615 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
631 bip->bli_flags &= ~XFS_BLI_HOLD; 616 bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -649,7 +634,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
649 uint last) 634 uint last)
650{ 635{
651 xfs_buf_log_item_t *bip; 636 xfs_buf_log_item_t *bip;
652 xfs_log_item_desc_t *lidp;
653 637
654 ASSERT(XFS_BUF_ISBUSY(bp)); 638 ASSERT(XFS_BUF_ISBUSY(bp));
655 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 639 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -674,7 +658,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
674 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 658 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
675 ASSERT(atomic_read(&bip->bli_refcount) > 0); 659 ASSERT(atomic_read(&bip->bli_refcount) > 0);
676 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 660 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
677 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; 661 bip->bli_item.li_cb = xfs_buf_iodone;
678 662
679 trace_xfs_trans_log_buf(bip); 663 trace_xfs_trans_log_buf(bip);
680 664
@@ -688,15 +672,11 @@ xfs_trans_log_buf(xfs_trans_t *tp,
688 bip->bli_flags &= ~XFS_BLI_STALE; 672 bip->bli_flags &= ~XFS_BLI_STALE;
689 ASSERT(XFS_BUF_ISSTALE(bp)); 673 ASSERT(XFS_BUF_ISSTALE(bp));
690 XFS_BUF_UNSTALE(bp); 674 XFS_BUF_UNSTALE(bp);
691 bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; 675 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
692 } 676 }
693 677
694 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
695 ASSERT(lidp != NULL);
696
697 tp->t_flags |= XFS_TRANS_DIRTY; 678 tp->t_flags |= XFS_TRANS_DIRTY;
698 lidp->lid_flags |= XFS_LID_DIRTY; 679 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
699 lidp->lid_flags &= ~XFS_LID_BUF_STALE;
700 bip->bli_flags |= XFS_BLI_LOGGED; 680 bip->bli_flags |= XFS_BLI_LOGGED;
701 xfs_buf_item_log(bip, first, last); 681 xfs_buf_item_log(bip, first, last);
702} 682}
@@ -725,7 +705,6 @@ xfs_trans_binval(
725 xfs_trans_t *tp, 705 xfs_trans_t *tp,
726 xfs_buf_t *bp) 706 xfs_buf_t *bp)
727{ 707{
728 xfs_log_item_desc_t *lidp;
729 xfs_buf_log_item_t *bip; 708 xfs_buf_log_item_t *bip;
730 709
731 ASSERT(XFS_BUF_ISBUSY(bp)); 710 ASSERT(XFS_BUF_ISBUSY(bp));
@@ -733,8 +712,6 @@ xfs_trans_binval(
733 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 712 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
734 713
735 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 714 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
736 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
737 ASSERT(lidp != NULL);
738 ASSERT(atomic_read(&bip->bli_refcount) > 0); 715 ASSERT(atomic_read(&bip->bli_refcount) > 0);
739 716
740 trace_xfs_trans_binval(bip); 717 trace_xfs_trans_binval(bip);
@@ -747,9 +724,9 @@ xfs_trans_binval(
747 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 724 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
748 ASSERT(XFS_BUF_ISSTALE(bp)); 725 ASSERT(XFS_BUF_ISSTALE(bp));
749 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 726 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
750 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); 727 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
751 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 728 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
752 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 729 ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
753 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 730 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
754 return; 731 return;
755 } 732 }
@@ -759,7 +736,7 @@ xfs_trans_binval(
759 * in the buf log item. The STALE flag will be used in 736 * in the buf log item. The STALE flag will be used in
760 * xfs_buf_item_unpin() to determine if it should clean up 737 * xfs_buf_item_unpin() to determine if it should clean up
761 * when the last reference to the buf item is given up. 738 * when the last reference to the buf item is given up.
762 * We set the XFS_BLI_CANCEL flag in the buf log format structure 739 * We set the XFS_BLF_CANCEL flag in the buf log format structure
763 * and log the buf item. This will be used at recovery time 740 * and log the buf item. This will be used at recovery time
764 * to determine that copies of the buffer in the log before 741 * to determine that copies of the buffer in the log before
765 * this should not be replayed. 742 * this should not be replayed.
@@ -777,26 +754,26 @@ xfs_trans_binval(
777 XFS_BUF_UNDELAYWRITE(bp); 754 XFS_BUF_UNDELAYWRITE(bp);
778 XFS_BUF_STALE(bp); 755 XFS_BUF_STALE(bp);
779 bip->bli_flags |= XFS_BLI_STALE; 756 bip->bli_flags |= XFS_BLI_STALE;
780 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); 757 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
781 bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; 758 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
782 bip->bli_format.blf_flags |= XFS_BLI_CANCEL; 759 bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
783 memset((char *)(bip->bli_format.blf_data_map), 0, 760 memset((char *)(bip->bli_format.blf_data_map), 0,
784 (bip->bli_format.blf_map_size * sizeof(uint))); 761 (bip->bli_format.blf_map_size * sizeof(uint)));
785 lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; 762 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
786 tp->t_flags |= XFS_TRANS_DIRTY; 763 tp->t_flags |= XFS_TRANS_DIRTY;
787} 764}
788 765
789/* 766/*
790 * This call is used to indicate that the buffer contains on-disk 767 * This call is used to indicate that the buffer contains on-disk inodes which
791 * inodes which must be handled specially during recovery. They 768 * must be handled specially during recovery. They require special handling
792 * require special handling because only the di_next_unlinked from 769 * because only the di_next_unlinked from the inodes in the buffer should be
793 * the inodes in the buffer should be recovered. The rest of the 770 * recovered. The rest of the data in the buffer is logged via the inodes
794 * data in the buffer is logged via the inodes themselves. 771 * themselves.
795 * 772 *
796 * format structure so that we'll know what to 773 * All we do is set the XFS_BLI_INODE_BUF flag in the item's flags so it can be
797 * format structure so that we'll know what to do at recovery time. 774 * transferred to the buffer's log format structure so that we'll know what to
775 * do at recovery time.
798 */ 776 */
799/* ARGSUSED */
800void 777void
801xfs_trans_inode_buf( 778xfs_trans_inode_buf(
802 xfs_trans_t *tp, 779 xfs_trans_t *tp,
@@ -811,7 +788,7 @@ xfs_trans_inode_buf(
811 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 788 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
812 ASSERT(atomic_read(&bip->bli_refcount) > 0); 789 ASSERT(atomic_read(&bip->bli_refcount) > 0);
813 790
814 bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; 791 bip->bli_flags |= XFS_BLI_INODE_BUF;
815} 792}
816 793
817/* 794/*
@@ -838,12 +815,9 @@ xfs_trans_stale_inode_buf(
838 ASSERT(atomic_read(&bip->bli_refcount) > 0); 815 ASSERT(atomic_read(&bip->bli_refcount) > 0);
839 816
840 bip->bli_flags |= XFS_BLI_STALE_INODE; 817 bip->bli_flags |= XFS_BLI_STALE_INODE;
841 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) 818 bip->bli_item.li_cb = xfs_buf_iodone;
842 xfs_buf_iodone;
843} 819}
844 820
845
846
847/* 821/*
848 * Mark the buffer as being one which contains newly allocated 822 * Mark the buffer as being one which contains newly allocated
849 * inodes. We need to make sure that even if this buffer is 823 * inodes. We need to make sure that even if this buffer is
@@ -893,120 +867,12 @@ xfs_trans_dquot_buf(
893 ASSERT(XFS_BUF_ISBUSY(bp)); 867 ASSERT(XFS_BUF_ISBUSY(bp));
894 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 868 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
895 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 869 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
896 ASSERT(type == XFS_BLI_UDQUOT_BUF || 870 ASSERT(type == XFS_BLF_UDQUOT_BUF ||
897 type == XFS_BLI_PDQUOT_BUF || 871 type == XFS_BLF_PDQUOT_BUF ||
898 type == XFS_BLI_GDQUOT_BUF); 872 type == XFS_BLF_GDQUOT_BUF);
899 873
900 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 874 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
901 ASSERT(atomic_read(&bip->bli_refcount) > 0); 875 ASSERT(atomic_read(&bip->bli_refcount) > 0);
902 876
903 bip->bli_format.blf_flags |= type; 877 bip->bli_format.blf_flags |= type;
904} 878}
905
906/*
907 * Check to see if a buffer matching the given parameters is already
908 * a part of the given transaction. Only check the first, embedded
909 * chunk, since we don't want to spend all day scanning large transactions.
910 */
911STATIC xfs_buf_t *
912xfs_trans_buf_item_match(
913 xfs_trans_t *tp,
914 xfs_buftarg_t *target,
915 xfs_daddr_t blkno,
916 int len)
917{
918 xfs_log_item_chunk_t *licp;
919 xfs_log_item_desc_t *lidp;
920 xfs_buf_log_item_t *blip;
921 xfs_buf_t *bp;
922 int i;
923
924 bp = NULL;
925 len = BBTOB(len);
926 licp = &tp->t_items;
927 if (!xfs_lic_are_all_free(licp)) {
928 for (i = 0; i < licp->lic_unused; i++) {
929 /*
930 * Skip unoccupied slots.
931 */
932 if (xfs_lic_isfree(licp, i)) {
933 continue;
934 }
935
936 lidp = xfs_lic_slot(licp, i);
937 blip = (xfs_buf_log_item_t *)lidp->lid_item;
938 if (blip->bli_item.li_type != XFS_LI_BUF) {
939 continue;
940 }
941
942 bp = blip->bli_buf;
943 if ((XFS_BUF_TARGET(bp) == target) &&
944 (XFS_BUF_ADDR(bp) == blkno) &&
945 (XFS_BUF_COUNT(bp) == len)) {
946 /*
947 * We found it. Break out and
948 * return the pointer to the buffer.
949 */
950 break;
951 } else {
952 bp = NULL;
953 }
954 }
955 }
956 return bp;
957}
958
959/*
960 * Check to see if a buffer matching the given parameters is already
961 * a part of the given transaction. Check all the chunks, we
962 * want to be thorough.
963 */
964STATIC xfs_buf_t *
965xfs_trans_buf_item_match_all(
966 xfs_trans_t *tp,
967 xfs_buftarg_t *target,
968 xfs_daddr_t blkno,
969 int len)
970{
971 xfs_log_item_chunk_t *licp;
972 xfs_log_item_desc_t *lidp;
973 xfs_buf_log_item_t *blip;
974 xfs_buf_t *bp;
975 int i;
976
977 bp = NULL;
978 len = BBTOB(len);
979 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
980 if (xfs_lic_are_all_free(licp)) {
981 ASSERT(licp == &tp->t_items);
982 ASSERT(licp->lic_next == NULL);
983 return NULL;
984 }
985 for (i = 0; i < licp->lic_unused; i++) {
986 /*
987 * Skip unoccupied slots.
988 */
989 if (xfs_lic_isfree(licp, i)) {
990 continue;
991 }
992
993 lidp = xfs_lic_slot(licp, i);
994 blip = (xfs_buf_log_item_t *)lidp->lid_item;
995 if (blip->bli_item.li_type != XFS_LI_BUF) {
996 continue;
997 }
998
999 bp = blip->bli_buf;
1000 if ((XFS_BUF_TARGET(bp) == target) &&
1001 (XFS_BUF_ADDR(bp) == blkno) &&
1002 (XFS_BUF_COUNT(bp) == len)) {
1003 /*
1004 * We found it. Break out and
1005 * return the pointer to the buffer.
1006 */
1007 return bp;
1008 }
1009 }
1010 }
1011 return NULL;
1012}
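The xfs_trans_buf.c diff above collapses the two lookup paths — xfs_trans_buf_item_match() for the embedded chunk and xfs_trans_buf_item_match_all() for the full chunk list — into a single linear scan of the transaction's item list. A compact sketch of that unified lookup, with hypothetical stand-in types (the real code iterates tp->t_items with list_for_each_entry() and uses the XFS_BUF_* accessors):

#include <stdio.h>

#define LI_BUF 0x123c	/* stand-in for XFS_LI_BUF */

struct buf {
	void *target;	/* stand-in for the xfs_buftarg pointer */
	long blkno;
	int count;	/* length in bytes */
};

struct buf_log_item {
	int li_type;
	struct buf *bli_buf;
};

/* One pass over all joined items; a match must agree on device,
 * starting block, and byte length. */
static struct buf *
buf_item_match(struct buf_log_item **items, int nitems,
	       void *target, long blkno, int len_bb)
{
	int len = len_bb << 9;	/* BBTOB(): 512-byte basic blocks to bytes */
	int i;

	for (i = 0; i < nitems; i++) {
		struct buf_log_item *blip = items[i];

		if (blip->li_type == LI_BUF &&
		    blip->bli_buf->target == target &&
		    blip->bli_buf->blkno == blkno &&
		    blip->bli_buf->count == len)
			return blip->bli_buf;
	}
	return NULL;
}

int main(void)
{
	int dev;
	struct buf b = { &dev, 64, 4096 };
	struct buf_log_item bli = { LI_BUF, &b };
	struct buf_log_item *items[] = { &bli };

	printf("hit? %d\n", buf_item_match(items, 1, &dev, 64, 8) == &b);
	return 0;
}

The old fast-path/thorough-path split only existed because items lived in chunks; with a single list there is nothing to special-case.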
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 27cce2a9c7e9..f783d5e9fa70 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -23,7 +23,6 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
29#include "xfs_extfree_item.h" 28#include "xfs_extfree_item.h"
@@ -49,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t *tp,
49 /* 48 /*
50 * Get a log_item_desc to point at the new item. 49 * Get a log_item_desc to point at the new item.
51 */ 50 */
52 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip); 51 xfs_trans_add_item(tp, &efip->efi_item);
53 52 return efip;
54 return (efip);
55} 53}
56 54
57/* 55/*
@@ -65,15 +63,11 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
65 xfs_fsblock_t start_block, 63 xfs_fsblock_t start_block,
66 xfs_extlen_t ext_len) 64 xfs_extlen_t ext_len)
67{ 65{
68 xfs_log_item_desc_t *lidp;
69 uint next_extent; 66 uint next_extent;
70 xfs_extent_t *extp; 67 xfs_extent_t *extp;
71 68
72 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
73 ASSERT(lidp != NULL);
74
75 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
76 lidp->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
77 71
78 next_extent = efip->efi_next_extent; 72 next_extent = efip->efi_next_extent;
79 ASSERT(next_extent < efip->efi_format.efi_nextents); 73 ASSERT(next_extent < efip->efi_format.efi_nextents);
@@ -106,9 +100,8 @@ xfs_trans_get_efd(xfs_trans_t *tp,
106 /* 100 /*
107 * Get a log_item_desc to point at the new item. 101 * Get a log_item_desc to point at the new item.
108 */ 102 */
109 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp); 103 xfs_trans_add_item(tp, &efdp->efd_item);
110 104 return efdp;
111 return (efdp);
112} 105}
113 106
114/* 107/*
@@ -122,15 +115,11 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp,
122 xfs_fsblock_t start_block, 115 xfs_fsblock_t start_block,
123 xfs_extlen_t ext_len) 116 xfs_extlen_t ext_len)
124{ 117{
125 xfs_log_item_desc_t *lidp;
126 uint next_extent; 118 uint next_extent;
127 xfs_extent_t *extp; 119 xfs_extent_t *extp;
128 120
129 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
130 ASSERT(lidp != NULL);
131
132 tp->t_flags |= XFS_TRANS_DIRTY; 121 tp->t_flags |= XFS_TRANS_DIRTY;
133 lidp->lid_flags |= XFS_LID_DIRTY; 122 efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
134 123
135 next_extent = efdp->efd_next_extent; 124 next_extent = efdp->efd_next_extent;
136 ASSERT(next_extent < efdp->efd_format.efd_nextents); 125 ASSERT(next_extent < efdp->efd_format.efd_nextents);
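Both EFI and EFD logging above lose their xfs_trans_find_item() call: since a joined log item always caches its descriptor in li_desc, marking it dirty is now two direct flag ORs. A standalone sketch of the pattern, with stand-in types and flag values:

#include <stdio.h>

#define TRANS_DIRTY 0x01	/* stand-in for XFS_TRANS_DIRTY */
#define LID_DIRTY   0x02	/* stand-in for XFS_LID_DIRTY */

struct item_desc { unsigned int lid_flags; };
struct log_item  { struct item_desc *li_desc; };
struct trans     { unsigned int t_flags; };

/* li_desc is valid for as long as the item is joined, so no lookup
 * (and no ASSERT that the lookup succeeded) is needed. */
static void mark_item_dirty(struct trans *tp, struct log_item *lip)
{
	tp->t_flags |= TRANS_DIRTY;		/* transaction has changes */
	lip->li_desc->lid_flags |= LID_DIRTY;	/* this item must be logged */
}

int main(void)
{
	struct item_desc d = { 0 };
	struct log_item efi = { &d };
	struct trans tp = { 0 };

	mark_item_dirty(&tp, &efi);
	printf("tp %x lid %x\n", tp.t_flags, d.lid_flags);
	return 0;
}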
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 785ff101da0a..cdc53a1050c5 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,20 +24,16 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_btree.h" 33#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_trans_priv.h" 34#include "xfs_trans_priv.h"
40#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
36#include "xfs_trace.h"
41 37
42#ifdef XFS_TRANS_DEBUG 38#ifdef XFS_TRANS_DEBUG
43STATIC void 39STATIC void
@@ -47,7 +43,6 @@ xfs_trans_inode_broot_debug(
47#define xfs_trans_inode_broot_debug(ip) 43#define xfs_trans_inode_broot_debug(ip)
48#endif 44#endif
49 45
50
51/* 46/*
52 * Get an inode and join it to the transaction. 47 * Get an inode and join it to the transaction.
53 */ 48 */
@@ -62,78 +57,66 @@ xfs_trans_iget(
62{ 57{
63 int error; 58 int error;
64 59
65 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0); 60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
66 if (!error && tp) 61 if (!error && tp) {
67 xfs_trans_ijoin(tp, *ipp, lock_flags); 62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
68 return error; 65 return error;
69} 66}
70 67
71/* 68/*
72 * Add the locked inode to the transaction. 69 * Add a locked inode to the transaction.
73 * The inode must be locked, and it cannot be associated with any 70 *
74 * transaction. The caller must specify the locks already held 71 * The inode must be locked, and it cannot be associated with any transaction.
75 * on the inode.
76 */ 72 */
77void 73void
78xfs_trans_ijoin( 74xfs_trans_ijoin(
79 xfs_trans_t *tp, 75 struct xfs_trans *tp,
80 xfs_inode_t *ip, 76 struct xfs_inode *ip)
81 uint lock_flags)
82{ 77{
83 xfs_inode_log_item_t *iip; 78 xfs_inode_log_item_t *iip;
84 79
85 ASSERT(ip->i_transp == NULL); 80 ASSERT(ip->i_transp == NULL);
86 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 81 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
87 ASSERT(lock_flags & XFS_ILOCK_EXCL);
88 if (ip->i_itemp == NULL) 82 if (ip->i_itemp == NULL)
89 xfs_inode_item_init(ip, ip->i_mount); 83 xfs_inode_item_init(ip, ip->i_mount);
90 iip = ip->i_itemp; 84 iip = ip->i_itemp;
91 ASSERT(iip->ili_flags == 0); 85 ASSERT(iip->ili_lock_flags == 0);
92 86
93 /* 87 /*
94 * Get a log_item_desc to point at the new item. 88 * Get a log_item_desc to point at the new item.
95 */ 89 */
96 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip)); 90 xfs_trans_add_item(tp, &iip->ili_item);
97 91
98 xfs_trans_inode_broot_debug(ip); 92 xfs_trans_inode_broot_debug(ip);
99 93
100 /* 94 /*
101 * If the IO lock is already held, mark that in the inode log item.
102 */
103 if (lock_flags & XFS_IOLOCK_EXCL) {
104 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
105 } else if (lock_flags & XFS_IOLOCK_SHARED) {
106 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
107 }
108
109 /*
110 * Initialize i_transp so we can find it with xfs_inode_incore() 95 * Initialize i_transp so we can find it with xfs_inode_incore()
111 * in xfs_trans_iget() above. 96 * in xfs_trans_iget() above.
112 */ 97 */
113 ip->i_transp = tp; 98 ip->i_transp = tp;
114} 99}
115 100
116
117
118/* 101/*
119 * Mark the inode as not needing to be unlocked when the inode item's 102 * Add a locked inode to the transaction.
120 * IOP_UNLOCK() routine is called. The inode must already be locked 103 *
121 * and associated with the given transaction. 104 *
105 * Grabs a reference to the inode which will be dropped when the transaction
106 * is commited. The inode will also be unlocked at that point. The inode
107 * must be locked, and it cannot be associated with any transaction.
122 */ 108 */
123/*ARGSUSED*/
124void 109void
125xfs_trans_ihold( 110xfs_trans_ijoin_ref(
126 xfs_trans_t *tp, 111 struct xfs_trans *tp,
127 xfs_inode_t *ip) 112 struct xfs_inode *ip,
113 uint lock_flags)
128{ 114{
129 ASSERT(ip->i_transp == tp); 115 xfs_trans_ijoin(tp, ip);
130 ASSERT(ip->i_itemp != NULL); 116 IHOLD(ip);
131 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 117 ip->i_itemp->ili_lock_flags = lock_flags;
132
133 ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
134} 118}
135 119
136
137/* 120/*
138 * This is called to mark the fields indicated in fieldmask as needing 121 * This is called to mark the fields indicated in fieldmask as needing
139 * to be logged when the transaction is committed. The inode must 122 * to be logged when the transaction is committed. The inode must
@@ -149,17 +132,12 @@ xfs_trans_log_inode(
149 xfs_inode_t *ip, 132 xfs_inode_t *ip,
150 uint flags) 133 uint flags)
151{ 134{
152 xfs_log_item_desc_t *lidp;
153
154 ASSERT(ip->i_transp == tp); 135 ASSERT(ip->i_transp == tp);
155 ASSERT(ip->i_itemp != NULL); 136 ASSERT(ip->i_itemp != NULL);
156 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 137 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
157 138
158 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
159 ASSERT(lidp != NULL);
160
161 tp->t_flags |= XFS_TRANS_DIRTY; 139 tp->t_flags |= XFS_TRANS_DIRTY;
162 lidp->lid_flags |= XFS_LID_DIRTY; 140 ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
163 141
164 /* 142 /*
165 * Always OR in the bits from the ili_last_fields field. 143 * Always OR in the bits from the ili_last_fields field.
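The xfs_trans_inode.c diff above splits inode joining into two calls: xfs_trans_ijoin() simply attaches the locked inode, while xfs_trans_ijoin_ref() also takes a reference and records which locks the commit path must drop, replacing both the old lock_flags argument and xfs_trans_ihold(). A hedged stand-in sketch (IHOLD() and the ili_lock_flags bookkeeping are modeled, not reproduced):

#include <assert.h>
#include <stdio.h>

#define ILOCK_EXCL 0x1	/* stand-in for XFS_ILOCK_EXCL */

struct inode_log_item { unsigned int ili_lock_flags; };
struct trans { int unused; };
struct inode {
	struct trans *i_transp;
	struct inode_log_item item;
	int refcount;
};

static void trans_ijoin(struct trans *tp, struct inode *ip)
{
	assert(ip->i_transp == NULL);	/* not already in a transaction */
	ip->i_transp = tp;
}

static void trans_ijoin_ref(struct trans *tp, struct inode *ip,
			    unsigned int lock_flags)
{
	trans_ijoin(tp, ip);
	ip->refcount++;				/* IHOLD() in the real code */
	ip->item.ili_lock_flags = lock_flags;	/* drop these at commit */
}

int main(void)
{
	struct trans tp;
	struct inode ip = { 0 };

	trans_ijoin_ref(&tp, &ip, ILOCK_EXCL);
	printf("refs %d lock flags %x\n", ip.refcount, ip.item.ili_lock_flags);
	return 0;
}

Callers that keep managing the inode lifetime themselves use plain xfs_trans_ijoin(), as the rewritten xfs_trans_iget() above does.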
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
deleted file mode 100644
index eb3fc57f9eef..000000000000
--- a/fs/xfs/xfs_trans_item.c
+++ /dev/null
@@ -1,549 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_trans_priv.h"
25/* XXX: from here down needed until struct xfs_trans has its own ailp */
26#include "xfs_bit.h"
27#include "xfs_buf_item.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h"
33
34STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
35 int, int, xfs_lsn_t);
36
37/*
38 * This is called to add the given log item to the transaction's
39 * list of log items. It must find a free log item descriptor
40 * or allocate a new one and add the item to that descriptor.
41 * The function returns a pointer to item descriptor used to point
42 * to the new item. The log item will now point to its new descriptor
43 * with its li_desc field.
44 */
45xfs_log_item_desc_t *
46xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
47{
48 xfs_log_item_desc_t *lidp;
49 xfs_log_item_chunk_t *licp;
50 int i=0;
51
52 /*
53 * If there are no free descriptors, allocate a new chunk
54 * of them and put it at the front of the chunk list.
55 */
56 if (tp->t_items_free == 0) {
57 licp = (xfs_log_item_chunk_t*)
58 kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
59 ASSERT(licp != NULL);
60 /*
61 * Initialize the chunk, and then
62 * claim the first slot in the newly allocated chunk.
63 */
64 xfs_lic_init(licp);
65 xfs_lic_claim(licp, 0);
66 licp->lic_unused = 1;
67 xfs_lic_init_slot(licp, 0);
68 lidp = xfs_lic_slot(licp, 0);
69
70 /*
71 * Link in the new chunk and update the free count.
72 */
73 licp->lic_next = tp->t_items.lic_next;
74 tp->t_items.lic_next = licp;
75 tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
76
77 /*
78 * Initialize the descriptor and the generic portion
79 * of the log item.
80 *
81 * Point the new slot at this item and return it.
82 * Also point the log item at its currently active
83 * descriptor and set the item's mount pointer.
84 */
85 lidp->lid_item = lip;
86 lidp->lid_flags = 0;
87 lidp->lid_size = 0;
88 lip->li_desc = lidp;
89 lip->li_mountp = tp->t_mountp;
90 lip->li_ailp = tp->t_mountp->m_ail;
91 return lidp;
92 }
93
94 /*
95 * Find the free descriptor. It is somewhere in the chunklist
96 * of descriptors.
97 */
98 licp = &tp->t_items;
99 while (licp != NULL) {
100 if (xfs_lic_vacancy(licp)) {
101 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
102 i = licp->lic_unused;
103 ASSERT(xfs_lic_isfree(licp, i));
104 break;
105 }
106 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
107 if (xfs_lic_isfree(licp, i))
108 break;
109 }
110 ASSERT(i <= XFS_LIC_MAX_SLOT);
111 break;
112 }
113 licp = licp->lic_next;
114 }
115 ASSERT(licp != NULL);
116 /*
117 * If we find a free descriptor, claim it,
118 * initialize it, and return it.
119 */
120 xfs_lic_claim(licp, i);
121 if (licp->lic_unused <= i) {
122 licp->lic_unused = i + 1;
123 xfs_lic_init_slot(licp, i);
124 }
125 lidp = xfs_lic_slot(licp, i);
126 tp->t_items_free--;
127 lidp->lid_item = lip;
128 lidp->lid_flags = 0;
129 lidp->lid_size = 0;
130 lip->li_desc = lidp;
131 lip->li_mountp = tp->t_mountp;
132 lip->li_ailp = tp->t_mountp->m_ail;
133 return lidp;
134}
135
136/*
137 * Free the given descriptor.
138 *
139 * This requires setting the bit in the chunk's free mask corresponding
140 * to the given slot.
141 */
142void
143xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
144{
145 uint slot;
146 xfs_log_item_chunk_t *licp;
147 xfs_log_item_chunk_t **licpp;
148
149 slot = xfs_lic_desc_to_slot(lidp);
150 licp = xfs_lic_desc_to_chunk(lidp);
151 xfs_lic_relse(licp, slot);
152 lidp->lid_item->li_desc = NULL;
153 tp->t_items_free++;
154
155 /*
156 * If there are no more used items in the chunk and this is not
157 * the chunk embedded in the transaction structure, then free
158 * the chunk. First pull it from the chunk list and then
159 * free it back to the heap. We didn't bother with a doubly
160 * linked list here because the lists should be very short
161 * and this is not a performance path. It's better to save
162 * the memory of the extra pointer.
163 *
164 * Also decrement the transaction structure's count of free items
165 * by the number in a chunk since we are freeing an empty chunk.
166 */
167 if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
168 licpp = &(tp->t_items.lic_next);
169 while (*licpp != licp) {
170 ASSERT(*licpp != NULL);
171 licpp = &((*licpp)->lic_next);
172 }
173 *licpp = licp->lic_next;
174 kmem_free(licp);
175 tp->t_items_free -= XFS_LIC_NUM_SLOTS;
176 }
177}
178
179/*
180 * This is called to find the descriptor corresponding to the given
181 * log item. It returns a pointer to the descriptor.
182 * The log item MUST have a corresponding descriptor in the given
183 * transaction. This routine does not return NULL, it panics.
184 *
185 * The descriptor pointer is kept in the log item's li_desc field.
186 * Just return it.
187 */
188/*ARGSUSED*/
189xfs_log_item_desc_t *
190xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
191{
192 ASSERT(lip->li_desc != NULL);
193
194 return lip->li_desc;
195}
196
197
198/*
199 * Return a pointer to the first descriptor in the chunk list.
200 * This does not return NULL if there are none, it panics.
201 *
202 * The first descriptor must be in either the first or second chunk.
203 * This is because the only chunk allowed to be empty is the first.
204 * All others are freed when they become empty.
205 *
206 * At some point this and xfs_trans_next_item() should be optimized
207 * to quickly look at the mask to determine if there is anything to
208 * look at.
209 */
210xfs_log_item_desc_t *
211xfs_trans_first_item(xfs_trans_t *tp)
212{
213 xfs_log_item_chunk_t *licp;
214 int i;
215
216 licp = &tp->t_items;
217 /*
218 * If it's not in the first chunk, skip to the second.
219 */
220 if (xfs_lic_are_all_free(licp)) {
221 licp = licp->lic_next;
222 }
223
224 /*
225 * Return the first non-free descriptor in the chunk.
226 */
227 ASSERT(!xfs_lic_are_all_free(licp));
228 for (i = 0; i < licp->lic_unused; i++) {
229 if (xfs_lic_isfree(licp, i)) {
230 continue;
231 }
232
233 return xfs_lic_slot(licp, i);
234 }
235 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
236 return NULL;
237}
238
239
240/*
241 * Given a descriptor, return the next descriptor in the chunk list.
242 * This returns NULL if there are no more used descriptors in the list.
243 *
244 * We do this by first locating the chunk in which the descriptor resides,
245 * and then scanning forward in the chunk and the list for the next
246 * used descriptor.
247 */
248/*ARGSUSED*/
249xfs_log_item_desc_t *
250xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
251{
252 xfs_log_item_chunk_t *licp;
253 int i;
254
255 licp = xfs_lic_desc_to_chunk(lidp);
256
257 /*
258 * First search the rest of the chunk. The for loop keeps us
259 * from referencing things beyond the end of the chunk.
260 */
261 for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
262 if (xfs_lic_isfree(licp, i)) {
263 continue;
264 }
265
266 return xfs_lic_slot(licp, i);
267 }
268
269 /*
270 * Now search the next chunk. It must be there, because the
271 * next chunk would have been freed if it were empty.
272 * If there is no next chunk, return NULL.
273 */
274 if (licp->lic_next == NULL) {
275 return NULL;
276 }
277
278 licp = licp->lic_next;
279 ASSERT(!xfs_lic_are_all_free(licp));
280 for (i = 0; i < licp->lic_unused; i++) {
281 if (xfs_lic_isfree(licp, i)) {
282 continue;
283 }
284
285 return xfs_lic_slot(licp, i);
286 }
287 ASSERT(0);
288 /* NOTREACHED */
289 return NULL; /* keep gcc quiet */
290}
291
292/*
293 * This is called to unlock all of the items of a transaction and to free
294 * all the descriptors of that transaction.
295 *
296 * It walks the list of descriptors and unlocks each item. It frees
297 * each chunk except that embedded in the transaction as it goes along.
298 */
299void
300xfs_trans_free_items(
301 xfs_trans_t *tp,
302 int flags)
303{
304 xfs_log_item_chunk_t *licp;
305 xfs_log_item_chunk_t *next_licp;
306 int abort;
307
308 abort = flags & XFS_TRANS_ABORT;
309 licp = &tp->t_items;
310 /*
311 * Special case the embedded chunk so we don't free it below.
312 */
313 if (!xfs_lic_are_all_free(licp)) {
314 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
315 xfs_lic_all_free(licp);
316 licp->lic_unused = 0;
317 }
318 licp = licp->lic_next;
319
320 /*
321 * Unlock each item in each chunk and free the chunks.
322 */
323 while (licp != NULL) {
324 ASSERT(!xfs_lic_are_all_free(licp));
325 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
326 next_licp = licp->lic_next;
327 kmem_free(licp);
328 licp = next_licp;
329 }
330
331 /*
332 * Reset the transaction structure's free item count.
333 */
334 tp->t_items_free = XFS_LIC_NUM_SLOTS;
335 tp->t_items.lic_next = NULL;
336}
337
338
339
340/*
341 * This is called to unlock the items associated with a transaction.
342 * Items which were not logged should be freed.
343 * Those which were logged must still be tracked so they can be unpinned
344 * when the transaction commits.
345 */
346void
347xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
348{
349 xfs_log_item_chunk_t *licp;
350 xfs_log_item_chunk_t *next_licp;
351 xfs_log_item_chunk_t **licpp;
352 int freed;
353
354 freed = 0;
355 licp = &tp->t_items;
356
357 /*
358 * Special case the embedded chunk so we don't free.
359 */
360 if (!xfs_lic_are_all_free(licp)) {
361 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
362 }
363 licpp = &(tp->t_items.lic_next);
364 licp = licp->lic_next;
365
366 /*
367 * Unlock each item in each chunk, free non-dirty descriptors,
368 * and free empty chunks.
369 */
370 while (licp != NULL) {
371 ASSERT(!xfs_lic_are_all_free(licp));
372 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
373 next_licp = licp->lic_next;
374 if (xfs_lic_are_all_free(licp)) {
375 *licpp = next_licp;
376 kmem_free(licp);
377 freed -= XFS_LIC_NUM_SLOTS;
378 } else {
379 licpp = &(licp->lic_next);
380 }
381 ASSERT(*licpp == next_licp);
382 licp = next_licp;
383 }
384
385 /*
386 * Fix the free descriptor count in the transaction.
387 */
388 tp->t_items_free += freed;
389}
390
391/*
392 * Unlock each item pointed to by a descriptor in the given chunk.
393 * Stamp the commit lsn into each item if necessary.
394 * Free descriptors pointing to items which are not dirty if freeing_chunk
395 * is zero. If freeing_chunk is non-zero, then we need to unlock all
396 * items in the chunk.
397 *
398 * Return the number of descriptors freed.
399 */
400STATIC int
401xfs_trans_unlock_chunk(
402 xfs_log_item_chunk_t *licp,
403 int freeing_chunk,
404 int abort,
405 xfs_lsn_t commit_lsn)
406{
407 xfs_log_item_desc_t *lidp;
408 xfs_log_item_t *lip;
409 int i;
410 int freed;
411
412 freed = 0;
413 lidp = licp->lic_descs;
414 for (i = 0; i < licp->lic_unused; i++, lidp++) {
415 if (xfs_lic_isfree(licp, i)) {
416 continue;
417 }
418 lip = lidp->lid_item;
419 lip->li_desc = NULL;
420
421 if (commit_lsn != NULLCOMMITLSN)
422 IOP_COMMITTING(lip, commit_lsn);
423 if (abort)
424 lip->li_flags |= XFS_LI_ABORTED;
425 IOP_UNLOCK(lip);
426
427 /*
428 * Free the descriptor if the item is not dirty
429 * within this transaction and the caller is not
430 * going to just free the entire thing regardless.
431 */
432 if (!(freeing_chunk) &&
433 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
434 xfs_lic_relse(licp, i);
435 freed++;
436 }
437 }
438
439 return freed;
440}
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to busy descriptor used to point
448 * to the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
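The file deleted above implemented the chunk-and-bitmask slot scheme that the list heads replace: each chunk holds a fixed number of slots, a free-bit mask tracks occupancy, claiming slot i clears bit i, and a chunk is releasable once its mask equals the full free mask again. A simplified standalone sketch of that retired mechanism (31 slots per chunk, as XFS_LBC_NUM_SLOTS had):

#include <stdio.h>

#define NUM_SLOTS 31
#define FREEMASK  ((1u << NUM_SLOTS) - 1)

struct chunk { unsigned int free; };	/* bit set == slot is free */

static void chunk_init(struct chunk *c)    { c->free = FREEMASK; }
static void claim(struct chunk *c, int i)   { c->free &= ~(1u << i); }
static void release(struct chunk *c, int i) { c->free |= 1u << i; }
static int  is_free(struct chunk *c, int i) { return (c->free >> i) & 1; }
static int  all_free(struct chunk *c)       { return c->free == FREEMASK; }

int main(void)
{
	struct chunk c;

	chunk_init(&c);			/* XFS_LBC_INIT(cp) */
	claim(&c, 0);			/* XFS_LBC_CLAIM(cp, 0) */
	printf("slot 0 free? %d\n", is_free(&c, 0));
	release(&c, 0);
	printf("chunk empty? %d\n", all_free(&c));
	return 0;
}

All of the descriptor walking above (first_item/next_item, per-chunk unlock) existed only to navigate these bitmaps; with a list_head per transaction the whole file becomes redundant.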
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..62da86c90de5 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -23,25 +23,13 @@ struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25 25
26/* 26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27 * From xfs_trans_item.c 27void xfs_trans_del_item(struct xfs_log_item *);
28 */ 28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *, 29 int flags);
30 struct xfs_log_item *); 30void xfs_trans_item_committed(struct xfs_log_item *lip,
31void xfs_trans_free_item(struct xfs_trans *, 31 xfs_lsn_t commit_lsn, int aborted);
32 struct xfs_log_item_desc *); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
34 struct xfs_log_item *);
35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
37 struct xfs_log_item_desc *);
38void xfs_trans_free_items(struct xfs_trans *, int);
39void xfs_trans_unlock_items(struct xfs_trans *,
40 xfs_lsn_t);
41void xfs_trans_free_busy(xfs_trans_t *tp);
42xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
43 xfs_agnumber_t ag,
44 xfs_extlen_t idx);
45 33
46/* 34/*
47 * AIL traversal cursor. 35 * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ 76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77 77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
78/* 80/*
79 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 81 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 * Disk based types: 82 * Disk based types:
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 4d88616bde91..b7d5769d2df0 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -25,18 +25,14 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 30#include "xfs_dinode.h"
34#include "xfs_inode.h" 31#include "xfs_inode.h"
35#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
36#include "xfs_bmap.h" 33#include "xfs_bmap.h"
37#include "xfs_error.h" 34#include "xfs_error.h"
38#include "xfs_quota.h" 35#include "xfs_quota.h"
39#include "xfs_rw.h"
40#include "xfs_itable.h" 36#include "xfs_itable.h"
41#include "xfs_utils.h" 37#include "xfs_utils.h"
42 38
@@ -324,86 +320,3 @@ xfs_bumplink(
324 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 320 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
325 return 0; 321 return 0;
326} 322}
327
328/*
329 * Try to truncate the given file to 0 length. Currently called
330 * only out of xfs_remove when it has to truncate a file to free
331 * up space for the remove to proceed.
332 */
333int
334xfs_truncate_file(
335 xfs_mount_t *mp,
336 xfs_inode_t *ip)
337{
338 xfs_trans_t *tp;
339 int error;
340
341#ifdef QUOTADEBUG
342 /*
343 * This is called to truncate the quotainodes too.
344 */
345 if (XFS_IS_UQUOTA_ON(mp)) {
346 if (ip->i_ino != mp->m_sb.sb_uquotino)
347 ASSERT(ip->i_udquot);
348 }
349 if (XFS_IS_OQUOTA_ON(mp)) {
350 if (ip->i_ino != mp->m_sb.sb_gquotino)
351 ASSERT(ip->i_gdquot);
352 }
353#endif
354 /*
355 * Make the call to xfs_itruncate_start before starting the
356 * transaction, because we cannot make the call while we're
357 * in a transaction.
358 */
359 xfs_ilock(ip, XFS_IOLOCK_EXCL);
360 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
361 if (error) {
362 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
363 return error;
364 }
365
366 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
367 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
368 XFS_TRANS_PERM_LOG_RES,
369 XFS_ITRUNCATE_LOG_COUNT))) {
370 xfs_trans_cancel(tp, 0);
371 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
372 return error;
373 }
374
375 /*
376 * Follow the normal truncate locking protocol. Since we
377 * hold the inode in the transaction, we know that its number
378 * of references will stay constant.
379 */
380 xfs_ilock(ip, XFS_ILOCK_EXCL);
381 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
382 xfs_trans_ihold(tp, ip);
383 /*
384 * Signal a sync xaction. The only case where that isn't
385 * the case is if we're truncating an already unlinked file
386 * on a wsync fs. In that case, we know the blocks can't
387 * reappear in the file because the links to file are
388 * permanently toast. Currently, we're always going to
389 * want a sync transaction because this code is being
390 * called from places where nlink is guaranteed to be 1
391 * but I'm leaving the tests in to protect against future
392 * changes -- rcc.
393 */
394 error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
395 XFS_DATA_FORK,
396 ((ip->i_d.di_nlink != 0 ||
397 !(mp->m_flags & XFS_MOUNT_WSYNC))
398 ? 1 : 0));
399 if (error) {
400 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
401 XFS_TRANS_ABORT);
402 } else {
403 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
404 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
405 }
406 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
407
408 return error;
409}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index ef321225d269..f55b9678264f 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_UTILS_H__ 18#ifndef __XFS_UTILS_H__
19#define __XFS_UTILS_H__ 19#define __XFS_UTILS_H__
20 20
21extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
22extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, 21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
23 xfs_dev_t, cred_t *, prid_t, int, 22 xfs_dev_t, cred_t *, prid_t, int,
24 xfs_inode_t **, int *); 23 xfs_inode_t **, int *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea38..4c7c7bfb2b2f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -26,19 +26,14 @@
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h" 27#include "xfs_ag.h"
28#include "xfs_dir2.h" 28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h" 29#include "xfs_mount.h"
31#include "xfs_da_btree.h" 30#include "xfs_da_btree.h"
32#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h" 32#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 33#include "xfs_dinode.h"
38#include "xfs_inode.h" 34#include "xfs_inode.h"
39#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
40#include "xfs_itable.h" 36#include "xfs_itable.h"
41#include "xfs_btree.h"
42#include "xfs_ialloc.h" 37#include "xfs_ialloc.h"
43#include "xfs_alloc.h" 38#include "xfs_alloc.h"
44#include "xfs_bmap.h" 39#include "xfs_bmap.h"
@@ -73,7 +68,7 @@ xfs_setattr(
73 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 68 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
74 int need_iolock = 1; 69 int need_iolock = 1;
75 70
76 xfs_itrace_entry(ip); 71 trace_xfs_setattr(ip);
77 72
78 if (mp->m_flags & XFS_MOUNT_RDONLY) 73 if (mp->m_flags & XFS_MOUNT_RDONLY)
79 return XFS_ERROR(EROFS); 74 return XFS_ERROR(EROFS);
@@ -143,16 +138,6 @@ xfs_setattr(
143 goto error_return; 138 goto error_return;
144 } 139 }
145 } else { 140 } else {
146 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
147 !(flags & XFS_ATTR_DMI)) {
148 int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
149 code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
150 iattr->ia_size, 0, dmflags, NULL);
151 if (code) {
152 lock_flags = 0;
153 goto error_return;
154 }
155 }
156 if (need_iolock) 141 if (need_iolock)
157 lock_flags |= XFS_IOLOCK_EXCL; 142 lock_flags |= XFS_IOLOCK_EXCL;
158 } 143 }
@@ -236,8 +221,11 @@ xfs_setattr(
 			 * transaction to modify the i_size.
 			 */
 			code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+			if (code)
+				goto error_return;
 		}
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		lock_flags &= ~XFS_ILOCK_EXCL;
 
 		/*
 		 * We are going to log the inode size change in this
@@ -251,40 +239,38 @@ xfs_setattr(
 		 * really care about here and prevents waiting for other data
 		 * not within the range we care about here.
 		 */
-		if (!code &&
-		    ip->i_size != ip->i_d.di_size &&
+		if (ip->i_size != ip->i_d.di_size &&
 		    iattr->ia_size > ip->i_d.di_size) {
 			code = xfs_flush_pages(ip,
 					ip->i_d.di_size, iattr->ia_size,
 					XBF_ASYNC, FI_NONE);
+			if (code)
+				goto error_return;
 		}
 
 		/* wait for all I/O to complete */
 		xfs_ioend_wait(ip);
 
-		if (!code)
-			code = xfs_itruncate_data(ip, iattr->ia_size);
-		if (code) {
-			ASSERT(tp == NULL);
-			lock_flags &= ~XFS_ILOCK_EXCL;
-			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+		code = -block_truncate_page(inode->i_mapping, iattr->ia_size,
+					    xfs_get_blocks);
+		if (code)
 			goto error_return;
-		}
+
 		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-		if ((code = xfs_trans_reserve(tp, 0,
-					      XFS_ITRUNCATE_LOG_RES(mp), 0,
-					      XFS_TRANS_PERM_LOG_RES,
-					      XFS_ITRUNCATE_LOG_COUNT))) {
-			xfs_trans_cancel(tp, 0);
-			if (need_iolock)
-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-			return code;
-		}
+		code = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+					 XFS_TRANS_PERM_LOG_RES,
+					 XFS_ITRUNCATE_LOG_COUNT);
+		if (code)
+			goto error_return;
+
+		truncate_setsize(inode, iattr->ia_size);
+
 		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+		lock_flags |= XFS_ILOCK_EXCL;
+
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-		xfs_trans_ijoin(tp, ip, lock_flags);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * Only change the c/mtime if we are changing the size
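
Two generic VFS helpers replace the XFS-private xfs_itruncate_data() in the hunk above: block_truncate_page() zeroes the partial block at the new EOF through the filesystem's xfs_get_blocks callback, and truncate_setsize() updates i_size and trims the page cache. Note the leading minus sign: block_truncate_page() returns a negative errno, while this file works with positive XFS error codes. Condensed from the added lines (locking and the error_return label are as in the hunk):

	code = -block_truncate_page(inode->i_mapping, iattr->ia_size,
				    xfs_get_blocks);	/* zero the partial EOF block */
	if (code)
		goto error_return;
	/* ... transaction allocation and reservation as above ... */
	truncate_setsize(inode, iattr->ia_size);	/* new i_size, page cache trimmed */
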
@@ -334,8 +320,7 @@ xfs_setattr(
 			xfs_iflags_set(ip, XFS_ITRUNCATED);
 		}
 	} else if (tp) {
-		xfs_trans_ijoin(tp, ip, lock_flags);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	}
 
 	/*
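
The two-line pattern replaced here recurs through the rest of this file: the three-argument xfs_trans_ijoin(tp, ip, lock_flags) followed by xfs_trans_ihold(tp, ip) collapses into a single two-argument xfs_trans_ijoin(tp, ip). The call-site migration, side by side:

	/* before: join the locked inode, then hold it across the commit */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);

	/* after: one call; the transaction keeps the inode until commit/cancel */
	xfs_trans_ijoin(tp, ip);
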
@@ -470,17 +455,10 @@ xfs_setattr(
 		return XFS_ERROR(code);
 	}
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
-	    !(flags & XFS_ATTR_DMI)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, NULL, NULL,
-					0, 0, AT_DELAY_FLAG(flags));
-	}
 	return 0;
 
  abort_return:
 	commit_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
  error_return:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
@@ -516,7 +494,7 @@ xfs_readlink_bmap(
 	int error = 0;
 
 	error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
-			mval, &nmaps, NULL, NULL);
+			mval, &nmaps, NULL);
 	if (error)
 		goto out;
 
@@ -557,7 +535,7 @@ xfs_readlink(
 	int pathlen;
 	int error = 0;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_readlink(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -613,14 +591,14 @@ xfs_free_eofblocks(
 	 */
 	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
 	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
-	map_len = last_fsb - end_fsb;
-	if (map_len <= 0)
+	if (last_fsb <= end_fsb)
 		return 0;
+	map_len = last_fsb - end_fsb;
 
 	nimaps = 1;
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
-			NULL, 0, &imap, &nimaps, NULL, NULL);
+			NULL, 0, &imap, &nimaps, NULL);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!error && (nimaps != 0) &&
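
Every xfs_bmapi()/xfs_bunmapi() call in this file also loses one trailing argument, the extent-delta out-parameter that these call sites always passed as NULL (dropped upstream as unused delta tracking). The change is mechanical, for example in the lookup above:

	/* before: trailing delta argument, always NULL here */
	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
			  NULL, 0, &imap, &nimaps, NULL, NULL);
	/* after: the delta parameter is gone from the prototype */
	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
			  NULL, 0, &imap, &nimaps, NULL);
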
@@ -675,10 +653,7 @@ xfs_free_eofblocks(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip,
-				XFS_IOLOCK_EXCL |
-				XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		error = xfs_itruncate_finish(&tp, ip,
 					     ip->i_size,
@@ -750,8 +725,7 @@ xfs_inactive_symlink_rmt(
 	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 	size = (int)ip->i_d.di_size;
 	ip->i_d.di_size = 0;
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Find the block(s) so we can inval and unmap them.
@@ -761,7 +735,7 @@ xfs_inactive_symlink_rmt(
 	nmaps = ARRAY_SIZE(mval);
 	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
 			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-			&free_list, NULL)))
+			&free_list)))
 		goto error0;
 	/*
 	 * Invalidate the block(s).
@@ -776,7 +750,7 @@ xfs_inactive_symlink_rmt(
 	 * Unmap the dead block(s) to the free_list.
 	 */
 	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-			&first_block, &free_list, NULL, &done)))
+			&first_block, &free_list, &done)))
 		goto error1;
 	ASSERT(done);
 	/*
@@ -795,8 +769,7 @@ xfs_inactive_symlink_rmt(
 	 * Mark it dirty so it will be logged and moved forward in the log as
 	 * part of every commit.
 	 */
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Get a new, empty transaction to return to our caller.
@@ -929,8 +902,7 @@ xfs_inactive_attrs(
 		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
 	ASSERT(ip->i_d.di_anextents == 0);
@@ -1035,8 +1007,6 @@ xfs_inactive(
 	int error;
 	int truncate;
 
-	xfs_itrace_entry(ip);
-
 	/*
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
@@ -1060,9 +1030,6 @@ xfs_inactive(
 
 	mp = ip->i_mount;
 
-	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
-		XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
-
 	error = 0;
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1120,8 +1087,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * normally, we have to run xfs_itruncate_finish sync.
@@ -1154,8 +1120,7 @@ xfs_inactive(
 			return VN_INACTIVE_CACHE;
 		}
 
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	} else {
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_IFREE_LOG_RES(mp),
@@ -1168,8 +1133,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	}
 
 	/*
@@ -1257,7 +1221,7 @@ xfs_lookup(
 	int error;
 	uint lock_mode;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_lookup(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return XFS_ERROR(EIO);
@@ -1269,7 +1233,7 @@ xfs_lookup(
 	if (error)
 		goto out;
 
-	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
+	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 	if (error)
 		goto out_free_name;
 
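
xfs_iget() likewise drops its final parameter, apparently the unused on-disk block hint that this call site passed as 0:

	/* before */
	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
	/* after */
	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
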
@@ -1309,21 +1273,11 @@ xfs_create(
 	uint log_res;
 	uint log_count;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_create(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-				dp, DM_RIGHT_NULL, NULL,
-				DM_RIGHT_NULL, name->name, NULL,
-				mode, 0, 0);
-
-		if (error)
-			return error;
-	}
-
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 		prid = dp->i_d.di_projid;
 	else
@@ -1427,8 +1381,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1487,16 +1440,7 @@ xfs_create(
 	xfs_qm_dqrele(gdqp);
 
 	*ipp = ip;
-
-	/* Fallthrough to std_return with error = 0 */
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
-			ip, DM_RIGHT_NULL, name->name, NULL, mode,
-			error, 0);
-	}
-
-	return error;
+	return 0;
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
@@ -1510,8 +1454,8 @@ xfs_create(
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
+ std_return:
+	return error;
 
  out_abort_rele:
 	/*
@@ -1726,20 +1670,11 @@ xfs_remove(
 	uint resblks;
 	uint log_count;
 
-	xfs_itrace_entry(dp);
-	xfs_itrace_entry(ip);
+	trace_xfs_remove(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, name->name, NULL,
-					ip->i_d.di_mode, 0, 0);
-		if (error)
-			return error;
-	}
-
 	error = xfs_qm_dqattach(dp, 0);
 	if (error)
 		goto std_return;
@@ -1782,15 +1717,8 @@ xfs_remove(
 
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	/*
-	 * At this point, we've gotten both the directory and the entry
-	 * inodes locked.
-	 */
-	IHOLD(ip);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we're removing a directory perform some additional validation.
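
Where the old code took an extra reference with IHOLD() before joining an inode (because, per the comment deleted in xfs_link() below, commit and cancel unlock the joined inodes and drop the associated reference), the new code uses xfs_trans_ijoin_ref(), which joins the locked inode and takes that reference in one call:

	/* before: explicit reference, then join */
	IHOLD(ip);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	/* after: join with an internally held reference;
	 * commit/cancel release both the lock and the reference */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
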
@@ -1877,21 +1805,15 @@ xfs_remove(
 	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
 		xfs_filestream_deassociate(ip);
 
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
-				NULL, DM_RIGHT_NULL, name->name, NULL,
-				ip->i_d.di_mode, error, 0);
-	}
-
-	return error;
+	return 0;
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -1909,25 +1831,13 @@ xfs_link(
 	int committed;
 	int resblks;
 
-	xfs_itrace_entry(tdp);
-	xfs_itrace_entry(sip);
+	trace_xfs_link(tdp, target_name);
 
 	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
-					tdp, DM_RIGHT_NULL,
-					sip, DM_RIGHT_NULL,
-					target_name->name, NULL, 0, 0, 0);
-		if (error)
-			return error;
-	}
-
-	/* Return through std_return after this point. */
-
 	error = xfs_qm_dqattach(sip, 0);
 	if (error)
 		goto std_return;
@@ -1953,15 +1863,8 @@ xfs_link(
 
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
-	/*
-	 * Increment vnode ref counts since xfs_trans_commit &
-	 * xfs_trans_cancel will both unlock the inodes and
-	 * decrement the associated ref counts.
-	 */
-	IHOLD(sip);
-	IHOLD(tdp);
-	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If the source has too many links, we can't make any more to it.
@@ -2014,27 +1917,14 @@ xfs_link(
 		goto abort_return;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (error)
-		goto std_return;
-
-	/* Fall through to std_return with error = 0. */
-std_return:
-	if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
-				tdp, DM_RIGHT_NULL,
-				sip, DM_RIGHT_NULL,
-				target_name->name, NULL, 0, error, 0);
-	}
-	return error;
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
  abort_return:
 	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
-
  error_return:
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -2074,7 +1964,7 @@ xfs_symlink(
 	ip = NULL;
 	tp = NULL;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_symlink(dp, link_name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2086,17 +1976,6 @@ xfs_symlink(
 	if (pathlen >= MAXPATHLEN)      /* total string too long */
 		return XFS_ERROR(ENAMETOOLONG);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
-					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name->name,
-					(unsigned char *)target_path, 0, 0, 0);
-		if (error)
-			return error;
-	}
-
-	/* Return through std_return after this point. */
-
 	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 		prid = dp->i_d.di_projid;
@@ -2180,8 +2059,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	/*
@@ -2215,7 +2093,7 @@ xfs_symlink(
 		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
 				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
 				  &first_block, resblks, mval, &nmaps,
-				  &free_list, NULL);
+				  &free_list);
 		if (error) {
 			goto error1;
 		}
@@ -2278,21 +2156,8 @@ xfs_symlink(
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 
-	/* Fall through to std_return with error = 0 or errno from
-	 * xfs_trans_commit */
-std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
-				dp, DM_RIGHT_NULL,
-				error ? NULL : ip,
-				DM_RIGHT_NULL, link_name->name,
-				(unsigned char *)target_path,
-				0, error, 0);
-	}
-
-	if (!error)
-		*ipp = ip;
-	return error;
+	*ipp = ip;
+	return 0;
 
  error2:
 	IRELE(ip);
@@ -2306,8 +2171,8 @@ std_return:
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -2333,13 +2198,12 @@ xfs_set_dmattrs(
 		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 
 	ip->i_d.di_dmevmask = evmask;
 	ip->i_d.di_dmstate = state;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	IHOLD(ip);
 	error = xfs_trans_commit(tp, 0);
 
 	return error;
@@ -2390,7 +2254,7 @@ xfs_alloc_file_space(
 	int committed;
 	int error;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_alloc_file_space(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2412,25 +2276,9 @@ xfs_alloc_file_space(
 	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
-	/* Generate a DMAPI event if needed. */
-	if (alloc_type != 0 && offset < ip->i_size &&
-	    (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-		xfs_off_t end_dmi_offset;
-
-		end_dmi_offset = offset+len;
-		if (end_dmi_offset > ip->i_size)
-			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
-				      end_dmi_offset - offset, 0, NULL);
-		if (error)
-			return error;
-	}
-
 	/*
 	 * Allocate file space until done or until there is an error
 	 */
-retry:
 	while (allocatesize_fsb && !error) {
 		xfs_fileoff_t s, e;
 
@@ -2451,15 +2299,22 @@ retry:
 			e = allocatesize_fsb;
 		}
 
+		/*
+		 * The transaction reservation is limited to a 32-bit block
+		 * count, hence we need to limit the number of blocks we are
+		 * trying to reserve to avoid an overflow. We can't allocate
+		 * more than @nimaps extents, and an extent is limited on disk
+		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+		 */
+		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
 		if (unlikely(rt)) {
-			resrtextents = qblocks = (uint)(e - s);
+			resrtextents = qblocks = resblks;
 			resrtextents /= mp->m_sb.sb_rextsize;
 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
 			quota_flag = XFS_QMOPT_RES_RTBLKS;
 		} else {
 			resrtextents = 0;
-			resblks = qblocks = \
-				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
+			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
 			quota_flag = XFS_QMOPT_RES_REGBLKS;
 		}
 
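
The block comment added above explains the clamp: the transaction reservation carries a 32-bit block count, while e - s is a 64-bit xfs_fileoff_t, so each loop iteration caps its request at nimaps extents of at most MAXEXTLEN blocks apiece (MAXEXTLEN being the 21-bit on-disk extent-length limit, i.e. 2^21 - 1 = 2097151 blocks). A worked reading of the added line, assuming a single map (nimaps == 1):

	/*
	 * resblks = min(e - s, MAXEXTLEN * 1) <= 2097151,
	 * which always fits in the 32-bit reservation used below.
	 */
	resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
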
@@ -2488,8 +2343,7 @@ retry:
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * Issue the xfs_bmapi() call to allocate the blocks
@@ -2498,7 +2352,7 @@ retry:
 		error = xfs_bmapi(tp, ip, startoffset_fsb,
 				  allocatesize_fsb, bmapi_flag,
 				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list, NULL);
+				  &free_list);
 		if (error) {
 			goto error0;
 		}
@@ -2527,17 +2381,6 @@ retry:
 		startoffset_fsb += allocated_fsb;
 		allocatesize_fsb -= allocated_fsb;
 	}
-dmapi_enospc_check:
-	if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
-				ip, DM_RIGHT_NULL,
-				ip, DM_RIGHT_NULL,
-				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
-		if (error == 0)
-			goto retry;	/* Maybe DMAPI app. has made space */
-		/* else fall through with error from XFS_SEND_DATA */
-	}
 
 	return error;
 
@@ -2548,7 +2391,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	goto dmapi_enospc_check;
+	return error;
 }
 
 /*
@@ -2598,7 +2441,7 @@ xfs_zero_remaining_bytes(
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
-			NULL, 0, &imap, &nimap, NULL, NULL);
+			NULL, 0, &imap, &nimap, NULL);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -2661,7 +2504,6 @@ xfs_free_file_space(
 {
 	int committed;
 	int done;
-	xfs_off_t end_dmi_offset;
 	xfs_fileoff_t endoffset_fsb;
 	int error;
 	xfs_fsblock_t firstfsb;
@@ -2680,7 +2522,7 @@ xfs_free_file_space(
 
 	mp = ip->i_mount;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_free_file_space(ip);
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
@@ -2691,19 +2533,7 @@ xfs_free_file_space(
 		return error;
 	rt = XFS_IS_REALTIME_INODE(ip);
 	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
-	end_dmi_offset = offset + len;
-	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
-
-	if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-		if (end_dmi_offset > ip->i_size)
-			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
-				offset, end_dmi_offset - offset,
-				AT_DELAY_FLAG(attr_flags), NULL);
-		if (error)
-			return error;
-	}
+	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
 	if (attr_flags & XFS_ATTR_NOLOCK)
 		need_iolock = 0;
@@ -2731,7 +2561,7 @@ xfs_free_file_space(
 	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, startoffset_fsb,
-				1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+				1, 0, NULL, 0, &imap, &nimap, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2746,7 +2576,7 @@ xfs_free_file_space(
 		}
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
-				1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+				1, 0, NULL, 0, &imap, &nimap, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2814,8 +2644,7 @@ xfs_free_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * issue the bunmapi() call to free the blocks
@@ -2823,7 +2652,7 @@ xfs_free_file_space(
 		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bunmapi(tp, ip, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
-				  0, 2, &firstfsb, &free_list, NULL, &done);
+				  0, 2, &firstfsb, &free_list, &done);
 		if (error) {
 			goto error0;
 		}
@@ -2883,8 +2712,6 @@ xfs_change_file_space(
 	xfs_trans_t *tp;
 	struct iattr iattr;
 
-	xfs_itrace_entry(ip);
-
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
 
@@ -2985,8 +2812,7 @@ xfs_change_file_space(
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	if ((attr_flags & XFS_ATTR_DMI) == 0) {
 		ip->i_d.di_mode &= ~S_ISUID;